Merge "Compile x86 and x86-64 SIMD optimizations"
diff --git a/LGPL.txt b/LGPL.txt
deleted file mode 100644
index b1e3f5a..0000000
--- a/LGPL.txt
+++ /dev/null
@@ -1,504 +0,0 @@
-		  GNU LESSER GENERAL PUBLIC LICENSE
-		       Version 2.1, February 1999
-
- Copyright (C) 1991, 1999 Free Software Foundation, Inc.
-     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-[This is the first released version of the Lesser GPL.  It also counts
- as the successor of the GNU Library Public License, version 2, hence
- the version number 2.1.]
-
-			    Preamble
-
-  The licenses for most software are designed to take away your
-freedom to share and change it.  By contrast, the GNU General Public
-Licenses are intended to guarantee your freedom to share and change
-free software--to make sure the software is free for all its users.
-
-  This license, the Lesser General Public License, applies to some
-specially designated software packages--typically libraries--of the
-Free Software Foundation and other authors who decide to use it.  You
-can use it too, but we suggest you first think carefully about whether
-this license or the ordinary General Public License is the better
-strategy to use in any particular case, based on the explanations below.
-
-  When we speak of free software, we are referring to freedom of use,
-not price.  Our General Public Licenses are designed to make sure that
-you have the freedom to distribute copies of free software (and charge
-for this service if you wish); that you receive source code or can get
-it if you want it; that you can change the software and use pieces of
-it in new free programs; and that you are informed that you can do
-these things.
-
-  To protect your rights, we need to make restrictions that forbid
-distributors to deny you these rights or to ask you to surrender these
-rights.  These restrictions translate to certain responsibilities for
-you if you distribute copies of the library or if you modify it.
-
-  For example, if you distribute copies of the library, whether gratis
-or for a fee, you must give the recipients all the rights that we gave
-you.  You must make sure that they, too, receive or can get the source
-code.  If you link other code with the library, you must provide
-complete object files to the recipients, so that they can relink them
-with the library after making changes to the library and recompiling
-it.  And you must show them these terms so they know their rights.
-
-  We protect your rights with a two-step method: (1) we copyright the
-library, and (2) we offer you this license, which gives you legal
-permission to copy, distribute and/or modify the library.
-
-  To protect each distributor, we want to make it very clear that
-there is no warranty for the free library.  Also, if the library is
-modified by someone else and passed on, the recipients should know
-that what they have is not the original version, so that the original
-author's reputation will not be affected by problems that might be
-introduced by others.
-
-  Finally, software patents pose a constant threat to the existence of
-any free program.  We wish to make sure that a company cannot
-effectively restrict the users of a free program by obtaining a
-restrictive license from a patent holder.  Therefore, we insist that
-any patent license obtained for a version of the library must be
-consistent with the full freedom of use specified in this license.
-
-  Most GNU software, including some libraries, is covered by the
-ordinary GNU General Public License.  This license, the GNU Lesser
-General Public License, applies to certain designated libraries, and
-is quite different from the ordinary General Public License.  We use
-this license for certain libraries in order to permit linking those
-libraries into non-free programs.
-
-  When a program is linked with a library, whether statically or using
-a shared library, the combination of the two is legally speaking a
-combined work, a derivative of the original library.  The ordinary
-General Public License therefore permits such linking only if the
-entire combination fits its criteria of freedom.  The Lesser General
-Public License permits more lax criteria for linking other code with
-the library.
-
-  We call this license the "Lesser" General Public License because it
-does Less to protect the user's freedom than the ordinary General
-Public License.  It also provides other free software developers Less
-of an advantage over competing non-free programs.  These disadvantages
-are the reason we use the ordinary General Public License for many
-libraries.  However, the Lesser license provides advantages in certain
-special circumstances.
-
-  For example, on rare occasions, there may be a special need to
-encourage the widest possible use of a certain library, so that it becomes
-a de-facto standard.  To achieve this, non-free programs must be
-allowed to use the library.  A more frequent case is that a free
-library does the same job as widely used non-free libraries.  In this
-case, there is little to gain by limiting the free library to free
-software only, so we use the Lesser General Public License.
-
-  In other cases, permission to use a particular library in non-free
-programs enables a greater number of people to use a large body of
-free software.  For example, permission to use the GNU C Library in
-non-free programs enables many more people to use the whole GNU
-operating system, as well as its variant, the GNU/Linux operating
-system.
-
-  Although the Lesser General Public License is Less protective of the
-users' freedom, it does ensure that the user of a program that is
-linked with the Library has the freedom and the wherewithal to run
-that program using a modified version of the Library.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.  Pay close attention to the difference between a
-"work based on the library" and a "work that uses the library".  The
-former contains code derived from the library, whereas the latter must
-be combined with the library in order to run.
-
-		  GNU LESSER GENERAL PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
-  0. This License Agreement applies to any software library or other
-program which contains a notice placed by the copyright holder or
-other authorized party saying it may be distributed under the terms of
-this Lesser General Public License (also called "this License").
-Each licensee is addressed as "you".
-
-  A "library" means a collection of software functions and/or data
-prepared so as to be conveniently linked with application programs
-(which use some of those functions and data) to form executables.
-
-  The "Library", below, refers to any such software library or work
-which has been distributed under these terms.  A "work based on the
-Library" means either the Library or any derivative work under
-copyright law: that is to say, a work containing the Library or a
-portion of it, either verbatim or with modifications and/or translated
-straightforwardly into another language.  (Hereinafter, translation is
-included without limitation in the term "modification".)
-
-  "Source code" for a work means the preferred form of the work for
-making modifications to it.  For a library, complete source code means
-all the source code for all modules it contains, plus any associated
-interface definition files, plus the scripts used to control compilation
-and installation of the library.
-
-  Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope.  The act of
-running a program using the Library is not restricted, and output from
-such a program is covered only if its contents constitute a work based
-on the Library (independent of the use of the Library in a tool for
-writing it).  Whether that is true depends on what the Library does
-and what the program that uses the Library does.
-  
-  1. You may copy and distribute verbatim copies of the Library's
-complete source code as you receive it, in any medium, provided that
-you conspicuously and appropriately publish on each copy an
-appropriate copyright notice and disclaimer of warranty; keep intact
-all the notices that refer to this License and to the absence of any
-warranty; and distribute a copy of this License along with the
-Library.
-
-  You may charge a fee for the physical act of transferring a copy,
-and you may at your option offer warranty protection in exchange for a
-fee.
-
-  2. You may modify your copy or copies of the Library or any portion
-of it, thus forming a work based on the Library, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-
-    a) The modified work must itself be a software library.
-
-    b) You must cause the files modified to carry prominent notices
-    stating that you changed the files and the date of any change.
-
-    c) You must cause the whole of the work to be licensed at no
-    charge to all third parties under the terms of this License.
-
-    d) If a facility in the modified Library refers to a function or a
-    table of data to be supplied by an application program that uses
-    the facility, other than as an argument passed when the facility
-    is invoked, then you must make a good faith effort to ensure that,
-    in the event an application does not supply such function or
-    table, the facility still operates, and performs whatever part of
-    its purpose remains meaningful.
-
-    (For example, a function in a library to compute square roots has
-    a purpose that is entirely well-defined independent of the
-    application.  Therefore, Subsection 2d requires that any
-    application-supplied function or table used by this function must
-    be optional: if the application does not supply it, the square
-    root function must still compute square roots.)
-
-These requirements apply to the modified work as a whole.  If
-identifiable sections of that work are not derived from the Library,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works.  But when you
-distribute the same sections as part of a whole which is a work based
-on the Library, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote
-it.
-
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Library.
-
-In addition, mere aggregation of another work not based on the Library
-with the Library (or with a work based on the Library) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-
-  3. You may opt to apply the terms of the ordinary GNU General Public
-License instead of this License to a given copy of the Library.  To do
-this, you must alter all the notices that refer to this License, so
-that they refer to the ordinary GNU General Public License, version 2,
-instead of to this License.  (If a newer version than version 2 of the
-ordinary GNU General Public License has appeared, then you can specify
-that version instead if you wish.)  Do not make any other change in
-these notices.
-
-  Once this change is made in a given copy, it is irreversible for
-that copy, so the ordinary GNU General Public License applies to all
-subsequent copies and derivative works made from that copy.
-
-  This option is useful when you wish to copy part of the code of
-the Library into a program that is not a library.
-
-  4. You may copy and distribute the Library (or a portion or
-derivative of it, under Section 2) in object code or executable form
-under the terms of Sections 1 and 2 above provided that you accompany
-it with the complete corresponding machine-readable source code, which
-must be distributed under the terms of Sections 1 and 2 above on a
-medium customarily used for software interchange.
-
-  If distribution of object code is made by offering access to copy
-from a designated place, then offering equivalent access to copy the
-source code from the same place satisfies the requirement to
-distribute the source code, even though third parties are not
-compelled to copy the source along with the object code.
-
-  5. A program that contains no derivative of any portion of the
-Library, but is designed to work with the Library by being compiled or
-linked with it, is called a "work that uses the Library".  Such a
-work, in isolation, is not a derivative work of the Library, and
-therefore falls outside the scope of this License.
-
-  However, linking a "work that uses the Library" with the Library
-creates an executable that is a derivative of the Library (because it
-contains portions of the Library), rather than a "work that uses the
-library".  The executable is therefore covered by this License.
-Section 6 states terms for distribution of such executables.
-
-  When a "work that uses the Library" uses material from a header file
-that is part of the Library, the object code for the work may be a
-derivative work of the Library even though the source code is not.
-Whether this is true is especially significant if the work can be
-linked without the Library, or if the work is itself a library.  The
-threshold for this to be true is not precisely defined by law.
-
-  If such an object file uses only numerical parameters, data
-structure layouts and accessors, and small macros and small inline
-functions (ten lines or less in length), then the use of the object
-file is unrestricted, regardless of whether it is legally a derivative
-work.  (Executables containing this object code plus portions of the
-Library will still fall under Section 6.)
-
-  Otherwise, if the work is a derivative of the Library, you may
-distribute the object code for the work under the terms of Section 6.
-Any executables containing that work also fall under Section 6,
-whether or not they are linked directly with the Library itself.
-
-  6. As an exception to the Sections above, you may also combine or
-link a "work that uses the Library" with the Library to produce a
-work containing portions of the Library, and distribute that work
-under terms of your choice, provided that the terms permit
-modification of the work for the customer's own use and reverse
-engineering for debugging such modifications.
-
-  You must give prominent notice with each copy of the work that the
-Library is used in it and that the Library and its use are covered by
-this License.  You must supply a copy of this License.  If the work
-during execution displays copyright notices, you must include the
-copyright notice for the Library among them, as well as a reference
-directing the user to the copy of this License.  Also, you must do one
-of these things:
-
-    a) Accompany the work with the complete corresponding
-    machine-readable source code for the Library including whatever
-    changes were used in the work (which must be distributed under
-    Sections 1 and 2 above); and, if the work is an executable linked
-    with the Library, with the complete machine-readable "work that
-    uses the Library", as object code and/or source code, so that the
-    user can modify the Library and then relink to produce a modified
-    executable containing the modified Library.  (It is understood
-    that the user who changes the contents of definitions files in the
-    Library will not necessarily be able to recompile the application
-    to use the modified definitions.)
-
-    b) Use a suitable shared library mechanism for linking with the
-    Library.  A suitable mechanism is one that (1) uses at run time a
-    copy of the library already present on the user's computer system,
-    rather than copying library functions into the executable, and (2)
-    will operate properly with a modified version of the library, if
-    the user installs one, as long as the modified version is
-    interface-compatible with the version that the work was made with.
-
-    c) Accompany the work with a written offer, valid for at
-    least three years, to give the same user the materials
-    specified in Subsection 6a, above, for a charge no more
-    than the cost of performing this distribution.
-
-    d) If distribution of the work is made by offering access to copy
-    from a designated place, offer equivalent access to copy the above
-    specified materials from the same place.
-
-    e) Verify that the user has already received a copy of these
-    materials or that you have already sent this user a copy.
-
-  For an executable, the required form of the "work that uses the
-Library" must include any data and utility programs needed for
-reproducing the executable from it.  However, as a special exception,
-the materials to be distributed need not include anything that is
-normally distributed (in either source or binary form) with the major
-components (compiler, kernel, and so on) of the operating system on
-which the executable runs, unless that component itself accompanies
-the executable.
-
-  It may happen that this requirement contradicts the license
-restrictions of other proprietary libraries that do not normally
-accompany the operating system.  Such a contradiction means you cannot
-use both them and the Library together in an executable that you
-distribute.
-
-  7. You may place library facilities that are a work based on the
-Library side-by-side in a single library together with other library
-facilities not covered by this License, and distribute such a combined
-library, provided that the separate distribution of the work based on
-the Library and of the other library facilities is otherwise
-permitted, and provided that you do these two things:
-
-    a) Accompany the combined library with a copy of the same work
-    based on the Library, uncombined with any other library
-    facilities.  This must be distributed under the terms of the
-    Sections above.
-
-    b) Give prominent notice with the combined library of the fact
-    that part of it is a work based on the Library, and explaining
-    where to find the accompanying uncombined form of the same work.
-
-  8. You may not copy, modify, sublicense, link with, or distribute
-the Library except as expressly provided under this License.  Any
-attempt otherwise to copy, modify, sublicense, link with, or
-distribute the Library is void, and will automatically terminate your
-rights under this License.  However, parties who have received copies,
-or rights, from you under this License will not have their licenses
-terminated so long as such parties remain in full compliance.
-
-  9. You are not required to accept this License, since you have not
-signed it.  However, nothing else grants you permission to modify or
-distribute the Library or its derivative works.  These actions are
-prohibited by law if you do not accept this License.  Therefore, by
-modifying or distributing the Library (or any work based on the
-Library), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Library or works based on it.
-
-  10. Each time you redistribute the Library (or any work based on the
-Library), the recipient automatically receives a license from the
-original licensor to copy, distribute, link with or modify the Library
-subject to these terms and conditions.  You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties with
-this License.
-
-  11. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Library at all.  For example, if a patent
-license would not permit royalty-free redistribution of the Library by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Library.
-
-If any portion of this section is held invalid or unenforceable under any
-particular circumstance, the balance of the section is intended to apply,
-and the section as a whole is intended to apply in other circumstances.
-
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system which is
-implemented by public license practices.  Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-
-  12. If the distribution and/or use of the Library is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Library under this License may add
-an explicit geographical distribution limitation excluding those countries,
-so that distribution is permitted only in or among countries not thus
-excluded.  In such case, this License incorporates the limitation as if
-written in the body of this License.
-
-  13. The Free Software Foundation may publish revised and/or new
-versions of the Lesser General Public License from time to time.
-Such new versions will be similar in spirit to the present version,
-but may differ in detail to address new problems or concerns.
-
-Each version is given a distinguishing version number.  If the Library
-specifies a version number of this License which applies to it and
-"any later version", you have the option of following the terms and
-conditions either of that version or of any later version published by
-the Free Software Foundation.  If the Library does not specify a
-license version number, you may choose any version ever published by
-the Free Software Foundation.
-
-  14. If you wish to incorporate parts of the Library into other free
-programs whose distribution conditions are incompatible with these,
-write to the author to ask for permission.  For software which is
-copyrighted by the Free Software Foundation, write to the Free
-Software Foundation; we sometimes make exceptions for this.  Our
-decision will be guided by the two goals of preserving the free status
-of all derivatives of our free software and of promoting the sharing
-and reuse of software generally.
-
-			    NO WARRANTY
-
-  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
-WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
-EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
-OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
-KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
-LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
-THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
-WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
-AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
-FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
-CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
-LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
-RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
-FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
-SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGES.
-
-		     END OF TERMS AND CONDITIONS
-
-           How to Apply These Terms to Your New Libraries
-
-  If you develop a new library, and you want it to be of the greatest
-possible use to the public, we recommend making it free software that
-everyone can redistribute and change.  You can do so by permitting
-redistribution under these terms (or, alternatively, under the terms of the
-ordinary General Public License).
-
-  To apply these terms, attach the following notices to the library.  It is
-safest to attach them to the start of each source file to most effectively
-convey the exclusion of warranty; and each file should have at least the
-"copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the library's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-Also add information on how to contact you by electronic and paper mail.
-
-You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a "copyright disclaimer" for the library, if
-necessary.  Here is a sample; alter the names:
-
-  Yoyodyne, Inc., hereby disclaims all copyright interest in the
-  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
-
-  <signature of Ty Coon>, 1 April 1990
-  Ty Coon, President of Vice
-
-That's all there is to it!
-
-
diff --git a/README.chromium b/README.chromium
deleted file mode 100644
index de3b9d9..0000000
--- a/README.chromium
+++ /dev/null
@@ -1,57 +0,0 @@
-Name: libjpeg-turbo
-URL: http://sourceforge.net/projects/libjpeg-turbo/
-Version: 1.3.1
-License: Custom license
-License File: LICENSE.txt
-Security Critical: yes
-License Android Compatible: yes
-
-Description:
-This consists of the components:
-* A partial copy of libjpeg-turbo 1.3.1 (r1219);
-* Revision r1188 cherry-picked from upstream trunk into config.h to fix
-  compiler warning on newer versions of gcc;
-* Revision r1220 cherry-picked from upstream trunk into jchuff.c to use
-  clz/bsr instructions on ARM for bit counting rather than the lookup table
-  (reduces memory footprint and can improve performance in some cases);
-* Revisions r1108, r1109, r1333, r1375, r1386, r1389 and r1390 cherry-picked
-  from upstream trunk for Arm64 NEON SIMD support;
-* Revisions r1582, r1583, r1586, r1587, r1591, and
-  commit 91eceba0a132a3fc70388a82c75616e67725a93a (code moved to GitHub)
-  cherry-picked from upstream trunk for partial decoding optimization;
-  http://crbug.com/515694
-* Revisions r1295, r1385, r1398, and r1402 (r1386 is also required but has
-  already been cherry-picked) cherry-picked from upstream trunk to enable
-  decoding to 565 as a memory optimization;
-  http://crbug.com/516761
-* A build file (libjpeg.gyp), and;
-* Patched header files used by Chromium.
-
-More details on cherry-picked revisions and commits can be found at:
-https://sourceforge.net/p/libjpeg-turbo/code/commit_browser
-https://github.com/libjpeg-turbo/libjpeg-turbo/commits/master
-
-This libjpeg-turbo can replace our libjpeg-6b without any modifications in the
-Chromium code.
-
-Same as our copy of libjpeg-6b, this libjpeg-turbo also added a new file
-jpeglibmangler.h and included it from jpeglib.h that changes the names of all
-externally visible functions to chromium_* so that we can avoid conflicts that
-arise when system libraries attempt to use our libjpeg. Also, we applied the
-following changes which are not merged to upstream:
-
-* Added the 'private_extern' flags on Mac (or the 'hidden' flags on Linux) to
-  all the global symbols in '.asm' files to prevent making them external ones.
-* Supported motion-JPEG frames that do not have DHT markers.
-* Fix libjpeg_turbo svn r64 libjpeg6b compat issue: make the fast path Huffman
-  decoder fallback to slow decoding if the Huffman decoding bit sentinel > 16,
-  this to match the exact behavior of jpeg_huff_decode().
-  http://crbug.com/398235
-* Fixed an issue with the ARM NEON build.
-  http://crbug.com/451035
-
-Refer to working-with-nested-repos [1] for details of how to setup your git
-svn client to update the code (for making local changes, cherry picking from
-upstream, etc).
-
-[1] https://www.chromium.org/developers/how-tos/get-the-code/working-with-nested-repos
diff --git a/README.version b/README.version
index 45072ef..895b5d3 100644
--- a/README.version
+++ b/README.version
@@ -1,3 +1,3 @@
-URL: https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.3.1.tar.gz
-Version: 1.3.1 (plus patches see README.chromium)
+URL: https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.4.2.tar.gz
+Version: 1.4.2
 BugComponent: 24950
diff --git a/codereview.settings b/codereview.settings
deleted file mode 100644
index 06e4778..0000000
--- a/codereview.settings
+++ /dev/null
@@ -1,9 +0,0 @@
-CODE_REVIEW_SERVER: codereview.chromium.org
-CC_LIST: chromium-reviews@chromium.org
-VIEW_VC: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo/+/
-STATUS: http://chromium-status.appspot.com/status
-TRY_ON_UPLOAD: False
-TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try
-GITCL_PREUPLOAD: http://src.chromium.org/viewvc/chrome/trunk/tools/depot_tools/git-cl-upload-hook?revision=HEAD
-GITCL_PREDCOMMIT: http://src.chromium.org/viewvc/chrome/trunk/tools/depot_tools/git-cl-upload-hook?revision=HEAD
-PROJECT: chromium_deps
diff --git a/config.h b/config.h
deleted file mode 100644
index d5a6218..0000000
--- a/config.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/* config.h.  Generated from config.h.in by configure.  */
-/* config.h.in.  Generated from configure.ac by autoheader.  */
-
-/* Build number */
-#define BUILD "20140410"
-
-/* Support arithmetic encoding */
-/* #undef C_ARITH_CODING_SUPPORTED */
-
-/* Support arithmetic decoding */
-/* #undef D_ARITH_CODING_SUPPORTED */
-
-/* Support in-memory source/destination managers */
-/* #undef MEM_SRCDST_SUPPORTED */
-
-/* Define to 1 if you have the <dlfcn.h> header file. */
-#define HAVE_DLFCN_H 1
-
-/* Define to 1 if you have the <inttypes.h> header file. */
-#define HAVE_INTTYPES_H 1
-
-/* Define to 1 if you have the <jni.h> header file. */
-/* #undef HAVE_JNI_H */
-
-/* Define to 1 if you have the `memcpy' function. */
-#define HAVE_MEMCPY 1
-
-/* Define to 1 if you have the <memory.h> header file. */
-#define HAVE_MEMORY_H 1
-
-/* Define to 1 if you have the `memset' function. */
-#define HAVE_MEMSET 1
-
-/* Define if your compiler supports prototypes */
-#define HAVE_PROTOTYPES 1
-
-/* Define to 1 if you have the <stddef.h> header file. */
-#define HAVE_STDDEF_H 1
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#define HAVE_STDINT_H 1
-
-/* Define to 1 if you have the <stdlib.h> header file. */
-#define HAVE_STDLIB_H 1
-
-/* Define to 1 if you have the <strings.h> header file. */
-#define HAVE_STRINGS_H 1
-
-/* Define to 1 if you have the <string.h> header file. */
-#define HAVE_STRING_H 1
-
-/* Define to 1 if you have the <sys/stat.h> header file. */
-#define HAVE_SYS_STAT_H 1
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#define HAVE_SYS_TYPES_H 1
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#if !defined(_MSC_VER)
-#define HAVE_UNISTD_H 1
-#endif
-
-/* Define to 1 if the system has the type `unsigned char'. */
-#define HAVE_UNSIGNED_CHAR 1
-
-/* Define to 1 if the system has the type `unsigned short'. */
-#define HAVE_UNSIGNED_SHORT 1
-
-/* Compiler does not support pointers to undefined structures. */
-/* #undef INCOMPLETE_TYPES_BROKEN */
-
-/* How to obtain function inlining. */
-#ifndef INLINE
-#if defined(__GNUC__)
-#define INLINE inline __attribute__((always_inline))
-#elif defined(_MSC_VER)
-#define INLINE __forceinline
-#else
-#define INLINE
-#endif
-#endif
-
-/* libjpeg API version */
-#define JPEG_LIB_VERSION 62
-
-/* libjpeg-turbo version */
-#define LIBJPEG_TURBO_VERSION 1.3.1
-
-/* Define to the sub-directory in which libtool stores uninstalled libraries.
-   */
-#define LT_OBJDIR ".libs/"
-
-/* Define if you have BSD-like bzero and bcopy */
-/* #undef NEED_BSD_STRINGS */
-
-/* Define if you need short function names */
-/* #undef NEED_SHORT_EXTERNAL_NAMES */
-
-/* Define if you have sys/types.h */
-#define NEED_SYS_TYPES_H 1
-
-/* Name of package */
-#define PACKAGE "libjpeg-turbo"
-
-/* Define to the address where bug reports for this package should be sent. */
-#define PACKAGE_BUGREPORT ""
-
-/* Define to the full name of this package. */
-#define PACKAGE_NAME "libjpeg-turbo"
-
-/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libjpeg-turbo 1.3.1"
-
-/* Define to the one symbol short name of this package. */
-#define PACKAGE_TARNAME "libjpeg-turbo"
-
-/* Define to the home page for this package. */
-#define PACKAGE_URL ""
-
-/* Define to the version of this package. */
-#define PACKAGE_VERSION "1.3.1"
-
-/* Define if shift is unsigned */
-/* #undef RIGHT_SHIFT_IS_UNSIGNED */
-
-/* Define to 1 if you have the ANSI C header files. */
-#define STDC_HEADERS 1
-
-/* Version number of package */
-#define VERSION "1.3.1"
-
-/* Use accelerated SIMD routines. */
-#define WITH_SIMD 1
-
-/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
-#ifndef __CHAR_UNSIGNED__
-/* # undef __CHAR_UNSIGNED__ */
-#endif
-
-/* Define to empty if `const' does not conform to ANSI C. */
-/* #undef const */
-
-/* Define to `__inline__' or `__inline' if that's what the C compiler
-   calls it, or to nothing if 'inline' is not supported under any name.  */
-#ifndef __cplusplus
-/* #undef inline */
-#endif
-
-/* Define to `unsigned int' if <sys/types.h> does not define. */
-/* #undef size_t */
diff --git a/jpeglibmangler.h b/jpeglibmangler.h
deleted file mode 100644
index 59f554a..0000000
--- a/jpeglibmangler.h
+++ /dev/null
@@ -1,116 +0,0 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
-#define THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
-
-// Mangle all externally visible function names so we can build our own libjpeg
-// without system libraries trying to use it.
-
-#define jpeg_make_c_derived_tbl chromium_jpeg_make_c_derived_tbl
-#define jpeg_gen_optimal_table chromium_jpeg_gen_optimal_table
-#define jpeg_make_d_derived_tbl chromium_jpeg_make_d_derived_tbl
-#define jpeg_fill_bit_buffer chromium_jpeg_fill_bit_buffer
-#define jpeg_huff_decode chromium_jpeg_huff_decode
-#define jpeg_fdct_islow chromium_jpeg_fdct_islow
-#define jpeg_fdct_ifast chromium_jpeg_fdct_ifast
-#define jpeg_fdct_float chromium_jpeg_fdct_float
-#define jpeg_idct_islow chromium_jpeg_idct_islow
-#define jpeg_idct_ifast chromium_jpeg_idct_ifast
-#define jpeg_idct_float chromium_jpeg_idct_float
-#define jpeg_idct_4x4 chromium_jpeg_idct_4x4
-#define jpeg_idct_2x2 chromium_jpeg_idct_2x2
-#define jpeg_idct_1x1 chromium_jpeg_idct_1x1
-#define jinit_compress_master chromium_jinit_compress_master
-#define jinit_c_master_control chromium_jinit_c_master_control
-#define jinit_c_main_controller chromium_jinit_c_main_controller
-#define jinit_c_prep_controller chromium_jinit_c_prep_controller
-#define jinit_c_coef_controller chromium_jinit_c_coef_controller
-#define jinit_color_converter chromium_jinit_color_converter
-#define jinit_downsampler chromium_jinit_downsampler
-#define jinit_forward_dct chromium_jinit_forward_dct
-#define jinit_huff_encoder chromium_jinit_huff_encoder
-#define jinit_phuff_encoder chromium_jinit_phuff_encoder
-#define jinit_marker_writer chromium_jinit_marker_writer
-#define jinit_master_decompress chromium_jinit_master_decompress
-#define jinit_d_main_controller chromium_jinit_d_main_controller
-#define jinit_d_coef_controller chromium_jinit_d_coef_controller
-#define jinit_d_post_controller chromium_jinit_d_post_controller
-#define jinit_input_controller chromium_jinit_input_controller
-#define jinit_marker_reader chromium_jinit_marker_reader
-#define jinit_huff_decoder chromium_jinit_huff_decoder
-#define jinit_phuff_decoder chromium_jinit_phuff_decoder
-#define jinit_inverse_dct chromium_jinit_inverse_dct
-#define jinit_upsampler chromium_jinit_upsampler
-#define jinit_color_deconverter chromium_jinit_color_deconverter
-#define jinit_1pass_quantizer chromium_jinit_1pass_quantizer
-#define jinit_2pass_quantizer chromium_jinit_2pass_quantizer
-#define jinit_merged_upsampler chromium_jinit_merged_upsampler
-#define jinit_memory_mgr chromium_jinit_memory_mgr
-#define jdiv_round_up chromium_jdiv_round_up
-#define jround_up chromium_jround_up
-#define jcopy_sample_rows chromium_jcopy_sample_rows
-#define jcopy_block_row chromium_jcopy_block_row
-#define jzero_far chromium_jzero_far
-#define jpeg_std_error chromium_jpeg_std_error
-#define jpeg_CreateCompress chromium_jpeg_CreateCompress
-#define jpeg_CreateDecompress chromium_jpeg_CreateDecompress
-#define jpeg_destroy_compress chromium_jpeg_destroy_compress
-#define jpeg_destroy_decompress chromium_jpeg_destroy_decompress
-#define jpeg_stdio_dest chromium_jpeg_stdio_dest
-#define jpeg_stdio_src chromium_jpeg_stdio_src
-#define jpeg_set_defaults chromium_jpeg_set_defaults
-#define jpeg_set_colorspace chromium_jpeg_set_colorspace
-#define jpeg_default_colorspace chromium_jpeg_default_colorspace
-#define jpeg_set_quality chromium_jpeg_set_quality
-#define jpeg_set_linear_quality chromium_jpeg_set_linear_quality
-#define jpeg_add_quant_table chromium_jpeg_add_quant_table
-#define jpeg_quality_scaling chromium_jpeg_quality_scaling
-#define jpeg_simple_progression chromium_jpeg_simple_progression
-#define jpeg_suppress_tables chromium_jpeg_suppress_tables
-#define jpeg_alloc_quant_table chromium_jpeg_alloc_quant_table
-#define jpeg_alloc_huff_table chromium_jpeg_alloc_huff_table
-#define jpeg_start_compress chromium_jpeg_start_compress
-#define jpeg_write_scanlines chromium_jpeg_write_scanlines
-#define jpeg_finish_compress chromium_jpeg_finish_compress
-#define jpeg_write_raw_data chromium_jpeg_write_raw_data
-#define jpeg_write_marker chromium_jpeg_write_marker
-#define jpeg_write_m_header chromium_jpeg_write_m_header
-#define jpeg_write_m_byte chromium_jpeg_write_m_byte
-#define jpeg_write_tables chromium_jpeg_write_tables
-#define jpeg_read_header chromium_jpeg_read_header
-#define jpeg_start_decompress chromium_jpeg_start_decompress
-#define jpeg_read_scanlines chromium_jpeg_read_scanlines
-#define jpeg_skip_scanlines chromium_jpeg_skip_scanlines
-#define jpeg_finish_decompress chromium_jpeg_finish_decompress
-#define jpeg_read_raw_data chromium_jpeg_read_raw_data
-#define jpeg_has_multiple_scans chromium_jpeg_has_multiple_scans
-#define jpeg_start_output chromium_jpeg_start_output
-#define jpeg_finish_output chromium_jpeg_finish_output
-#define jpeg_input_complete chromium_jpeg_input_complete
-#define jpeg_new_colormap chromium_jpeg_new_colormap
-#define jpeg_consume_input chromium_jpeg_consume_input
-#define jpeg_calc_output_dimensions chromium_jpeg_calc_output_dimensions
-#define jpeg_save_markers chromium_jpeg_save_markers
-#define jpeg_set_marker_processor chromium_jpeg_set_marker_processor
-#define jpeg_read_coefficients chromium_jpeg_read_coefficients
-#define jpeg_write_coefficients chromium_jpeg_write_coefficients
-#define jpeg_copy_critical_parameters chromium_jpeg_copy_critical_parameters
-#define jpeg_abort_compress chromium_jpeg_abort_compress
-#define jpeg_abort_decompress chromium_jpeg_abort_decompress
-#define jpeg_abort chromium_jpeg_abort
-#define jpeg_destroy chromium_jpeg_destroy
-#define jpeg_resync_to_restart chromium_jpeg_resync_to_restart
-#define jpeg_get_small chromium_jpeg_get_small
-#define jpeg_free_small chromium_jpeg_free_small
-#define jpeg_get_large chromium_jpeg_get_large
-#define jpeg_free_large chromium_jpeg_free_large
-#define jpeg_mem_available chromium_jpeg_mem_available
-#define jpeg_open_backing_store chromium_jpeg_open_backing_store
-#define jpeg_mem_init chromium_jpeg_mem_init
-#define jpeg_mem_term chromium_jpeg_mem_term
-#define jpeg_std_message_table chromium_jpeg_std_message_table
-#define jpeg_natural_order chromium_jpeg_natural_order
-
-#endif  // THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
diff --git a/jpegut.c b/jpegut.c
deleted file mode 100644
index cec0f72..0000000
--- a/jpegut.c
+++ /dev/null
@@ -1,387 +0,0 @@
-/* Copyright (C)2004 Landmark Graphics Corporation
- * Copyright (C)2005 Sun Microsystems, Inc.
- * Copyright (C)2009 D. R. Commander
- *
- * This library is free software and may be redistributed and/or modified under
- * the terms of the wxWindows Library License, Version 3.1 or (at your option)
- * any later version.  The full license is in the LICENSE.txt file included
- * with this distribution.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * wxWindows Library License for more details.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "./rrtimer.h"
-#include "./turbojpeg.h"
-
-#define _catch(f) {if((f)==-1) {printf("TJPEG: %s\n", tjGetErrorStr());  bailout();}}
-
-const char *_subnamel[NUMSUBOPT]={"4:4:4", "4:2:2", "4:2:0", "GRAY"};
-const char *_subnames[NUMSUBOPT]={"444", "422", "420", "GRAY"};
-
-int exitstatus=0;
-#define bailout() {exitstatus=-1;  goto finally;}
-
-int pixels[9][3]=
-{
-	{0, 255, 0},
-	{255, 0, 255},
-	{255, 255, 0},
-	{0, 0, 255},
-	{0, 255, 255},
-	{255, 0, 0},
-	{255, 255, 255},
-	{0, 0, 0},
-	{255, 0, 0}
-};
-
-void initbuf(unsigned char *buf, int w, int h, int ps, int flags)
-{
-	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
-		_i, j;
-	if(flags&TJ_ALPHAFIRST) {roffset++;  goffset++;  boffset++;}
-	memset(buf, 0, w*h*ps);
-	for(_i=0; _i<16; _i++)
-	{
-		if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-		for(j=0; j<w; j++)
-		{
-			buf[(w*i+j)*ps+roffset]=255;
-			if(((_i/8)+(j/8))%2==0)
-			{
-				buf[(w*i+j)*ps+goffset]=255;
-				buf[(w*i+j)*ps+boffset]=255;
-			}
-		}
-	}
-	for(_i=16; _i<h; _i++)
-	{
-		if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-		for(j=0; j<w; j++)
-		{
-			if(((_i/8)+(j/8))%2!=0)
-			{
-				buf[(w*i+j)*ps+roffset]=255;
-				buf[(w*i+j)*ps+goffset]=255;
-			}
-		}
-	}
-}
-
-void dumpbuf(unsigned char *buf, int w, int h, int ps, int flags)
-{
-	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
-		j;
-	for(i=0; i<h; i++)
-	{
-		for(j=0; j<w; j++)
-		{
-			printf("%.3d/%.3d/%.3d ", buf[(w*i+j)*ps+roffset],
-				buf[(w*i+j)*ps+roffset], buf[(w*i+j)*ps+roffset]);
-		}
-		printf("\n");
-	}
-}
-
-int checkbuf(unsigned char *buf, int w, int h, int ps, int subsamp, int flags)
-{
-	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
-		_i, j;
-	if(flags&TJ_ALPHAFIRST) {roffset++;  goffset++;  boffset++;}
-	if(subsamp==TJ_GRAYSCALE)
-	{
-		for(_i=0; _i<16; _i++)
-		{
-			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-			for(j=0; j<w; j++)
-			{
-				unsigned char r=buf[(w*i+j)*ps+roffset],
-					g=buf[(w*i+j)*ps+goffset],
-					b=buf[(w*i+j)*ps+boffset];
-				if(((_i/8)+(j/8))%2==0)
-				{
-					if(r<253 || g<253 || b<253) return 0;
-				}
-				else
-				{
-					if(r<74 || r>78 || g<74 || g>78 || b<74 || b>78) return 0;
-				}
-			}
-		}
-		for(_i=16; _i<h; _i++)
-		{
-			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-			for(j=0; j<w; j++)
-			{
-				unsigned char r=buf[(w*i+j)*ps+roffset],
-					g=buf[(w*i+j)*ps+goffset],
-					b=buf[(w*i+j)*ps+boffset];
-				if(((_i/8)+(j/8))%2==0)
-				{
-					if(r>2 || g>2 || b>2) return 0;
-				}
-				else
-				{
-					if(r<224 || r>228 || g<224 || g>228 || b<224 || b>228) return 0;
-				}
-			}
-		}
-	}
-	else
-	{
-		for(_i=0; _i<16; _i++)
-		{
-			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-			for(j=0; j<w; j++)
-			{
-				if(buf[(w*i+j)*ps+roffset]<253) return 0;
-				if(((_i/8)+(j/8))%2==0)
-				{
-					if(buf[(w*i+j)*ps+goffset]<253) return 0;
-					if(buf[(w*i+j)*ps+boffset]<253) return 0;
-				}
-				else
-				{
-					if(buf[(w*i+j)*ps+goffset]>2) return 0;
-					if(buf[(w*i+j)*ps+boffset]>2) return 0;
-				}
-			}
-		}
-		for(_i=16; _i<h; _i++)
-		{
-			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-			for(j=0; j<w; j++)
-			{
-				if(buf[(w*i+j)*ps+boffset]>2) return 0;
-				if(((_i/8)+(j/8))%2==0)
-				{
-					if(buf[(w*i+j)*ps+roffset]>2) return 0;
-					if(buf[(w*i+j)*ps+goffset]>2) return 0;
-				}
-				else
-				{
-					if(buf[(w*i+j)*ps+roffset]<253) return 0;
-					if(buf[(w*i+j)*ps+goffset]<253) return 0;
-				}
-			}
-		}
-	}
-	return 1;
-}
-
-void writejpeg(unsigned char *jpegbuf, unsigned long jpgbufsize, char *filename)
-{
-	FILE *outfile=NULL;
-	if((outfile=fopen(filename, "wb"))==NULL)
-	{
-		printf("ERROR: Could not open %s for writing.\n", filename);
-		bailout();
-	}
-	if(fwrite(jpegbuf, jpgbufsize, 1, outfile)!=1)
-	{
-		printf("ERROR: Could not write to %s.\n", filename);
-		bailout();
-	}
-
-	finally:
-	if(outfile) fclose(outfile);
-}
-
-void gentestjpeg(tjhandle hnd, unsigned char *jpegbuf, unsigned long *size,
-	int w, int h, int ps, char *basefilename, int subsamp, int qual, int flags)
-{
-	char tempstr[1024];  unsigned char *bmpbuf=NULL;
-	const char *pixformat;  double t;
-
-	if(flags&TJ_BGR)
-	{
-		if(ps==3) pixformat="BGR";
-		else {if(flags&TJ_ALPHAFIRST) pixformat="ABGR";  else pixformat="BGRA";}
-	}
-	else
-	{
-		if(ps==3) pixformat="RGB";
-		else {if(flags&TJ_ALPHAFIRST) pixformat="ARGB";  else pixformat="RGBA";}
-	}
-	printf("%s %s -> %s Q%d ... ", pixformat,
-		(flags&TJ_BOTTOMUP)?"Bottom-Up":"Top-Down ", _subnamel[subsamp], qual);
-
-	if((bmpbuf=(unsigned char *)malloc(w*h*ps+1))==NULL)
-	{
-		printf("ERROR: Could not allocate buffer\n");  bailout();
-	}
-	initbuf(bmpbuf, w, h, ps, flags);
-	memset(jpegbuf, 0, TJBUFSIZE(w, h));
-
-	t=rrtime();
-	_catch(tjCompress(hnd, bmpbuf, w, 0, h, ps, jpegbuf, size, subsamp, qual, flags));
-	t=rrtime()-t;
-
-	sprintf(tempstr, "%s_enc_%s_%s_%sQ%d.jpg", basefilename, pixformat,
-		(flags&TJ_BOTTOMUP)? "BU":"TD", _subnames[subsamp], qual);
-	writejpeg(jpegbuf, *size, tempstr);
-	printf("Done.  %f ms\n  Result in %s\n", t*1000., tempstr);
-
-	finally:
-	if(bmpbuf) free(bmpbuf);
-}
-
-void gentestbmp(tjhandle hnd, unsigned char *jpegbuf, unsigned long jpegsize,
-	int w, int h, int ps, char *basefilename, int subsamp, int qual, int flags)
-{
-	unsigned char *bmpbuf=NULL;
-	const char *pixformat;  int _w=0, _h=0;  double t;
-
-	if(flags&TJ_BGR)
-	{
-		if(ps==3) pixformat="BGR";
-		else {if(flags&TJ_ALPHAFIRST) pixformat="ABGR";  else pixformat="BGRA";}
-	}
-	else
-	{
-		if(ps==3) pixformat="RGB";
-		else {if(flags&TJ_ALPHAFIRST) pixformat="ARGB";  else pixformat="RGBA";}
-	}
-	printf("JPEG -> %s %s ... ", pixformat, (flags&TJ_BOTTOMUP)?"Bottom-Up":"Top-Down ");
-
-	_catch(tjDecompressHeader(hnd, jpegbuf, jpegsize, &_w, &_h));
-	if(_w!=w || _h!=h)
-	{
-		printf("Incorrect JPEG header\n");  bailout();
-	}
-
-	if((bmpbuf=(unsigned char *)malloc(w*h*ps+1))==NULL)
-	{
-		printf("ERROR: Could not allocate buffer\n");  bailout();
-	}
-	memset(bmpbuf, 0, w*ps*h);
-
-	t=rrtime();
-	_catch(tjDecompress(hnd, jpegbuf, jpegsize, bmpbuf, w, w*ps, h, ps, flags));
-	t=rrtime()-t;
-
-	if(checkbuf(bmpbuf, w, h, ps, subsamp, flags)) printf("Passed.");
-	else {printf("FAILED!");  dumpbuf(bmpbuf, w, h, ps, flags);}
-
-	printf("  %f ms\n\n", t*1000.);
-
-	finally:
-	if(bmpbuf) free(bmpbuf);
-}
-
-void dotest(int w, int h, int ps, int subsamp, char *basefilename)
-{
-	tjhandle hnd=NULL, dhnd=NULL;  unsigned char *jpegbuf=NULL;
-	unsigned long size;
-
-	if((jpegbuf=(unsigned char *)malloc(TJBUFSIZE(w, h))) == NULL)
-	{
-		puts("ERROR: Could not allocate buffer.");  bailout();
-	}
-
-	if((hnd=tjInitCompress())==NULL)
-		{printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr());  bailout();}
-	if((dhnd=tjInitDecompress())==NULL)
-		{printf("Error in tjInitDecompress():\n%s\n", tjGetErrorStr());  bailout();}
-
-	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, 0);
-	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, 0);
-
-	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BGR);
-	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BGR);
-
-	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BOTTOMUP);
-	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BOTTOMUP);
-
-	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BGR|TJ_BOTTOMUP);
-	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BGR|TJ_BOTTOMUP);
-
-	if(ps==4)
-	{
-		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST);
-		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST);
-
-		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR);
-		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR);
-
-		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BOTTOMUP);
-		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BOTTOMUP);
-
-		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR|TJ_BOTTOMUP);
-		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR|TJ_BOTTOMUP);
-	}
-
-	finally:
-	if(hnd) tjDestroy(hnd);
-	if(dhnd) tjDestroy(dhnd);
-
-	if(jpegbuf) free(jpegbuf);
-}
-
-#define MAXLENGTH 2048
-
-void dotest1(void)
-{
-	int i, j, i2;  unsigned char *bmpbuf=NULL, *jpgbuf=NULL;
-	tjhandle hnd=NULL;  unsigned long size;
-	if((hnd=tjInitCompress())==NULL)
-		{printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr());  bailout();}
-	printf("Buffer size regression test\n");
-	for(j=1; j<48; j++)
-	{
-		for(i=1; i<(j==1?MAXLENGTH:48); i++)
-		{
-			if(i%100==0) printf("%.4d x %.4d\b\b\b\b\b\b\b\b\b\b\b", i, j);
-			if((bmpbuf=(unsigned char *)malloc(i*j*4))==NULL
-			|| (jpgbuf=(unsigned char *)malloc(TJBUFSIZE(i, j)))==NULL)
-			{
-				printf("Memory allocation failure\n");  bailout();
-			}
-			memset(bmpbuf, 0, i*j*4);
-			for(i2=0; i2<i*j; i2++)
-			{
-				bmpbuf[i2*4]=pixels[i2%9][2];
-				bmpbuf[i2*4+1]=pixels[i2%9][1];
-				bmpbuf[i2*2+2]=pixels[i2%9][0];
-			}
-			_catch(tjCompress(hnd, bmpbuf, i, i*4, j, 4,
-				jpgbuf, &size, TJ_444, 100, TJ_BGR));
-			free(bmpbuf);  bmpbuf=NULL;  free(jpgbuf);  jpgbuf=NULL;
-
-			if((bmpbuf=(unsigned char *)malloc(j*i*4))==NULL
-			|| (jpgbuf=(unsigned char *)malloc(TJBUFSIZE(j, i)))==NULL)
-			{
-				printf("Memory allocation failure\n");  bailout();
-			}
-			for(i2=0; i2<j*i*4; i2++)
-			{
-				if(i2%2==0) bmpbuf[i2]=0xFF;
-				else bmpbuf[i2]=0;
-			}
-			_catch(tjCompress(hnd, bmpbuf, j, j*4, i, 4,
-				jpgbuf, &size, TJ_444, 100, TJ_BGR));
-			free(bmpbuf);  bmpbuf=NULL;  free(jpgbuf);  jpgbuf=NULL;
-		}
-	}
-	printf("Done.      \n");
-
-	finally:
-	if(bmpbuf) free(bmpbuf);  if(jpgbuf) free(jpgbuf);
-	if(hnd) tjDestroy(hnd);
-}
-
-int main(int argc, char *argv[])
-{
-	dotest(35, 41, 3, TJ_444, "test");
-	dotest(35, 41, 4, TJ_444, "test");
-	dotest(35, 41, 3, TJ_GRAYSCALE, "test");
-	dotest(35, 41, 4, TJ_GRAYSCALE, "test");
-	dotest1();
-
-	return exitstatus;
-}
diff --git a/jpgtest.cxx b/jpgtest.cxx
deleted file mode 100644
index b1c5e1a..0000000
--- a/jpgtest.cxx
+++ /dev/null
@@ -1,392 +0,0 @@
-/* Copyright (C)2004 Landmark Graphics Corporation
- * Copyright (C)2005, 2006 Sun Microsystems, Inc.
- * Copyright (C)2009 D. R. Commander
- *
- * This library is free software and may be redistributed and/or modified under
- * the terms of the wxWindows Library License, Version 3.1 or (at your option)
- * any later version.  The full license is in the LICENSE.txt file included
- * with this distribution.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * wxWindows Library License for more details.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "./bmp.h"
-#include "./rrutil.h"
-#include "./rrtimer.h"
-#include "./turbojpeg.h"
-
-#define _catch(f) {if((f)==-1) {printf("Error in %s:\n%s\n", #f, tjGetErrorStr());  goto bailout;}}
-
-int forcemmx=0, forcesse=0, forcesse2=0, forcesse3=0, fastupsample=0;
-const int _ps[BMPPIXELFORMATS]={3, 4, 3, 4, 4, 4};
-const int _flags[BMPPIXELFORMATS]={0, 0, TJ_BGR, TJ_BGR,
-	TJ_BGR|TJ_ALPHAFIRST, TJ_ALPHAFIRST};
-const int _rindex[BMPPIXELFORMATS]={0, 0, 2, 2, 3, 1};
-const int _gindex[BMPPIXELFORMATS]={1, 1, 1, 1, 2, 2};
-const int _bindex[BMPPIXELFORMATS]={2, 2, 0, 0, 1, 3};
-const char *_pfname[]={"RGB", "RGBA", "BGR", "BGRA", "ABGR", "ARGB"};
-const char *_subnamel[NUMSUBOPT]={"4:4:4", "4:2:2", "4:2:0", "GRAY"};
-const char *_subnames[NUMSUBOPT]={"444", "422", "420", "GRAY"};
-
-void printsigfig(double val, int figs)
-{
-	char format[80];
-	double _l=log10(val);  int l;
-	if(_l<0.)
-	{
-		l=(int)fabs(_l);
-		sprintf(format, "%%%d.%df", figs+l+2, figs+l);
-	}
-	else
-	{
-		l=(int)_l+1;
-		if(figs<=l) sprintf(format, "%%.0f");
-		else sprintf(format, "%%%d.%df", figs+1, figs-l);
-	}	
-	printf(format, val);
-}
-
-void dotest(unsigned char *srcbuf, int w, int h, BMPPIXELFORMAT pf, int bu,
-	int jpegsub, int qual, char *filename, int dotile, int useppm, int quiet)
-{
-	char tempstr[1024];
-	FILE *outfile;  tjhandle hnd;
-	unsigned char **jpegbuf=NULL, *rgbbuf=NULL;
-	rrtimer timer; double elapsed;
-	int jpgbufsize=0, i, j, tilesizex, tilesizey, numtilesx, numtilesy, ITER;
-	unsigned long *comptilesize=NULL;
-	int flags=(forcemmx?TJ_FORCEMMX:0)|(forcesse?TJ_FORCESSE:0)
-		|(forcesse2?TJ_FORCESSE2:0)|(forcesse3?TJ_FORCESSE3:0)
-		|(fastupsample?TJ_FASTUPSAMPLE:0);
-	int ps=_ps[pf];
-	int pitch=w*ps;
-
-	flags |= _flags[pf];
-	if(bu) flags |= TJ_BOTTOMUP;
-
-	if((rgbbuf=(unsigned char *)malloc(pitch*h)) == NULL)
-	{
-		puts("ERROR: Could not allocate image buffer.");
-		exit(1);
-	}
-
-	if(!quiet) printf("\n>>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", _pfname[pf],
-		bu?"Bottom-up":"Top-down", _subnamel[jpegsub], qual);
-	if(dotile) {tilesizex=tilesizey=4;}  else {tilesizex=w;  tilesizey=h;}
-
-	do
-	{
-		tilesizex*=2;  if(tilesizex>w) tilesizex=w;
-		tilesizey*=2;  if(tilesizey>h) tilesizey=h;
-		numtilesx=(w+tilesizex-1)/tilesizex;
-		numtilesy=(h+tilesizey-1)/tilesizey;
-		if((comptilesize=(unsigned long *)malloc(sizeof(unsigned long)*numtilesx*numtilesy)) == NULL
-		|| (jpegbuf=(unsigned char **)malloc(sizeof(unsigned char *)*numtilesx*numtilesy)) == NULL)
-		{
-			puts("ERROR: Could not allocate image buffers.");
-			goto bailout;
-		}
-		memset(jpegbuf, 0, sizeof(unsigned char *)*numtilesx*numtilesy);
-		for(i=0; i<numtilesx*numtilesy; i++)
-		{
-			if((jpegbuf[i]=(unsigned char *)malloc(TJBUFSIZE(tilesizex, tilesizey))) == NULL)
-			{
-				puts("ERROR: Could not allocate image buffers.");
-				goto bailout;
-			}
-		}
-
-		// Compression test
-		if(quiet) printf("%s\t%s\t%s\t%d\t",  _pfname[pf], bu?"BU":"TD",
-			_subnamel[jpegsub], qual);
-		for(i=0; i<h; i++) memcpy(&rgbbuf[pitch*i], &srcbuf[w*ps*i], w*ps);
-		if((hnd=tjInitCompress())==NULL)
-		{
-			printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr());
-			goto bailout;
-		}
-		_catch(tjCompress(hnd, rgbbuf, tilesizex, pitch, tilesizey, ps,
-			jpegbuf[0], &comptilesize[0], jpegsub, qual, flags));
-		ITER=0;
-		timer.start();
-		do
-		{
-			jpgbufsize=0;  int tilen=0;
-			for(i=0; i<h; i+=tilesizey)
-			{
-				for(j=0; j<w; j+=tilesizex)
-				{
-					int tempw=min(tilesizex, w-j), temph=min(tilesizey, h-i);
-					_catch(tjCompress(hnd, &rgbbuf[pitch*i+j*ps], tempw, pitch,
-						temph, ps, jpegbuf[tilen], &comptilesize[tilen], jpegsub, qual,
-						flags));
-					jpgbufsize+=comptilesize[tilen];
-					tilen++;
-				}
-			}
-			ITER++;
-		} while((elapsed=timer.elapsed())<5.);
-		_catch(tjDestroy(hnd));
-		if(quiet)
-		{
-			if(tilesizex==w && tilesizey==h) printf("Full     \t");
-			else printf("%-4d %-4d\t", tilesizex, tilesizey);
-			printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
-			printf("\t");
-			printsigfig((double)(w*h*ps)/(double)jpgbufsize, 4);
-			printf("\t");
-		}
-		else
-		{
-			if(tilesizex==w && tilesizey==h) printf("\nFull image\n");
-			else printf("\nTile size: %d x %d\n", tilesizex, tilesizey);
-			printf("C--> Frame rate:           %f fps\n", (double)ITER/elapsed);
-			printf("     Output image size:    %d bytes\n", jpgbufsize);
-			printf("     Compression ratio:    %f:1\n",
-				(double)(w*h*ps)/(double)jpgbufsize);
-			printf("     Source throughput:    %f Megapixels/sec\n",
-				(double)(w*h)/1000000.*(double)ITER/elapsed);
-			printf("     Output bit stream:    %f Megabits/sec\n",
-				(double)jpgbufsize*8./1000000.*(double)ITER/elapsed);
-		}
-		if(tilesizex==w && tilesizey==h)
-		{
-			sprintf(tempstr, "%s_%sQ%d.jpg", filename, _subnames[jpegsub], qual);
-			if((outfile=fopen(tempstr, "wb"))==NULL)
-			{
-				puts("ERROR: Could not open reference image");
-				exit(1);
-			}
-			if(fwrite(jpegbuf[0], jpgbufsize, 1, outfile)!=1)
-			{
-				puts("ERROR: Could not write reference image");
-				exit(1);
-			}
-			fclose(outfile);
-			if(!quiet) printf("Reference image written to %s\n", tempstr);
-		}
-
-		// Decompression test
-		memset(rgbbuf, 127, pitch*h);  // Grey image means decompressor did nothing
-		if((hnd=tjInitDecompress())==NULL)
-		{
-			printf("Error in tjInitDecompress():\n%s\n", tjGetErrorStr());
-			goto bailout;
-		}
-		_catch(tjDecompress(hnd, jpegbuf[0], jpgbufsize, rgbbuf, tilesizex, pitch,
-			tilesizey, ps, flags));
-		ITER=0;
-		timer.start();
-		do
-		{
-			int tilen=0;
-			for(i=0; i<h; i+=tilesizey)
-			{
-				for(j=0; j<w; j+=tilesizex)
-				{
-					int tempw=min(tilesizex, w-j), temph=min(tilesizey, h-i);
-					_catch(tjDecompress(hnd, jpegbuf[tilen], comptilesize[tilen],
-						&rgbbuf[pitch*i+ps*j], tempw, pitch, temph, ps, flags));
-					tilen++;
-				}
-			}
-			ITER++;
-		}	while((elapsed=timer.elapsed())<5.);
-		_catch(tjDestroy(hnd));
-		if(quiet)
-		{
-			printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
-			printf("\n");
-		}
-		else
-		{
-			printf("D--> Frame rate:           %f fps\n", (double)ITER/elapsed);
-			printf("     Dest. throughput:     %f Megapixels/sec\n",
-				(double)(w*h)/1000000.*(double)ITER/elapsed);
-		}
-		if(tilesizex==w && tilesizey==h)
-			sprintf(tempstr, "%s_%sQ%d_full.%s", filename, _subnames[jpegsub], qual,
-				useppm?"ppm":"bmp");
-		else sprintf(tempstr, "%s_%sQ%d_%dx%d.%s", filename, _subnames[jpegsub],
-			qual, tilesizex, tilesizey, useppm?"ppm":"bmp");
-		if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
-		{
-			printf("ERROR saving bitmap: %s\n", bmpgeterr());
-			goto bailout;
-		}
-		sprintf(strrchr(tempstr, '.'), "-err.%s", useppm?"ppm":"bmp");
-		if(!quiet)
-			printf("Computing compression error and saving to %s.\n", tempstr);
-		if(jpegsub==TJ_GRAYSCALE)
-		{
-			for(j=0; j<h; j++)
-			{
-				for(i=0; i<w*ps; i+=ps)
-				{
-					int y=(int)((double)srcbuf[w*ps*j+i+_rindex[pf]]*0.299
-						+ (double)srcbuf[w*ps*j+i+_gindex[pf]]*0.587
-						+ (double)srcbuf[w*ps*j+i+_bindex[pf]]*0.114 + 0.5);
-					if(y>255) y=255;  if(y<0) y=0;
-					rgbbuf[pitch*j+i+_rindex[pf]]=abs(rgbbuf[pitch*j+i+_rindex[pf]]-y);
-					rgbbuf[pitch*j+i+_gindex[pf]]=abs(rgbbuf[pitch*j+i+_gindex[pf]]-y);
-					rgbbuf[pitch*j+i+_bindex[pf]]=abs(rgbbuf[pitch*j+i+_bindex[pf]]-y);
-				}
-			}
-		}		
-		else
-		{
-			for(j=0; j<h; j++) for(i=0; i<w*ps; i++)
-				rgbbuf[pitch*j+i]=abs(rgbbuf[pitch*j+i]-srcbuf[w*ps*j+i]);
-		}
-		if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
-		{
-			printf("ERROR saving bitmap: %s\n", bmpgeterr());
-			goto bailout;
-		}
-
-		// Cleanup
-		if(jpegbuf)
-		{
-			for(i=0; i<numtilesx*numtilesy; i++)
-				{if(jpegbuf[i]) free(jpegbuf[i]);  jpegbuf[i]=NULL;}
-			free(jpegbuf);  jpegbuf=NULL;
-		}
-		if(comptilesize) {free(comptilesize);  comptilesize=NULL;}
-	} while(tilesizex<w || tilesizey<h);
-
-	if(rgbbuf) {free(rgbbuf);  rgbbuf=NULL;}
-	return;
-
-	bailout:
-	if(jpegbuf)
-	{
-		for(i=0; i<numtilesx*numtilesy; i++)
-			{if(jpegbuf[i]) free(jpegbuf[i]);  jpegbuf[i]=NULL;}
-		free(jpegbuf);  jpegbuf=NULL;
-	}
-	if(comptilesize) {free(comptilesize);  comptilesize=NULL;}
-	if(rgbbuf) {free(rgbbuf);  rgbbuf=NULL;}
-	return;
-}
-
-
-int main(int argc, char *argv[])
-{
-	unsigned char *bmpbuf=NULL;  int w, h, i, useppm=0;
-	int qual, dotile=0, quiet=0, hiqual=-1;  char *temp;
-	BMPPIXELFORMAT pf=BMP_BGR;
-	int bu=0;
-
-	printf("\n");
-
-	if(argc<3)
-	{
-		printf("USAGE: %s <Inputfile (BMP|PPM)> <%% Quality>\n\n", argv[0]);
-		printf("       [-tile]\n");
-		printf("       Test performance of the codec when the image is encoded\n");
-		printf("       as separate tiles of varying sizes.\n\n");
-		printf("       [-forcemmx] [-forcesse] [-forcesse2] [-forcesse3]\n");
-		printf("       Force MMX, SSE, or SSE2 code paths in Intel codec\n\n");
-		printf("       [-rgb | -bgr | -rgba | -bgra | -abgr | -argb]\n");
-		printf("       Test the specified color conversion path in the codec (default: BGR)\n\n");
-		printf("       [-fastupsample]\n");
-		printf("       Use fast, inaccurate upsampling code to perform 4:2:2 and 4:2:0\n");
-		printf("       YUV decoding in libjpeg decompressor\n\n");
-		printf("       [-quiet]\n");
-		printf("       Output in tabular rather than verbose format\n\n");
-		printf("       NOTE: If the quality is specified as a range, i.e. 90-100, a separate\n");
-		printf("       test will be performed for all quality values in the range.\n");
-		exit(1);
-	}
-	if((qual=atoi(argv[2]))<1 || qual>100)
-	{
-		puts("ERROR: Quality must be between 1 and 100.");
-		exit(1);
-	}
-	if((temp=strchr(argv[2], '-'))!=NULL && strlen(temp)>1
-		&& sscanf(&temp[1], "%d", &hiqual)==1 && hiqual>qual && hiqual>=1
-		&& hiqual<=100) {}
-	else hiqual=qual;
-
-	if(argc>3)
-	{
-		for(i=3; i<argc; i++)
-		{
-			if(!stricmp(argv[i], "-tile")) dotile=1;
-			if(!stricmp(argv[i], "-forcesse3"))
-			{
-				printf("Using SSE3 code\n");
-				forcesse3=1;
-			}
-			if(!stricmp(argv[i], "-forcesse2"))
-			{
-				printf("Using SSE2 code\n");
-				forcesse2=1;
-			}
-			if(!stricmp(argv[i], "-forcesse"))
-			{
-				printf("Using SSE code\n");
-				forcesse=1;
-			}
-			if(!stricmp(argv[i], "-forcemmx"))
-			{
-				printf("Using MMX code\n");
-				forcemmx=1;
-			}
-			if(!stricmp(argv[i], "-fastupsample"))
-			{
-				printf("Using fast upsampling code\n");
-				fastupsample=1;
-			}
-			if(!stricmp(argv[i], "-rgb")) pf=BMP_RGB;
-			if(!stricmp(argv[i], "-rgba")) pf=BMP_RGBA;
-			if(!stricmp(argv[i], "-bgr")) pf=BMP_BGR;
-			if(!stricmp(argv[i], "-bgra")) pf=BMP_BGRA;
-			if(!stricmp(argv[i], "-abgr")) pf=BMP_ABGR;
-			if(!stricmp(argv[i], "-argb")) pf=BMP_ARGB;
-			if(!stricmp(argv[i], "-bottomup")) bu=1;
-			if(!stricmp(argv[i], "-quiet")) quiet=1;
-		}
-	}
-
-	if(loadbmp(argv[1], &bmpbuf, &w, &h, pf, 1, bu)==-1)
-	{
-		printf("ERROR loading bitmap: %s\n", bmpgeterr());  exit(1);
-	}
-
-	temp=strrchr(argv[1], '.');
-	if(temp!=NULL)
-	{
-		if(!stricmp(temp, ".ppm")) useppm=1;
-		*temp='\0';
-	}
-
-	if(quiet)
-	{
-		printf("All performance values in Mpixels/sec\n\n");
-		printf("Bitmap\tBitmap\tJPEG\tJPEG\tTile Size\tCompr\tCompr\tDecomp\n");
-		printf("Format\tOrder\tFormat\tQual\t X    Y  \tPerf \tRatio\tPerf\n\n");
-	}
-
-	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_GRAYSCALE, i, argv[1], dotile, useppm, quiet);
-	if(quiet) printf("\n");
-	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_420, i, argv[1], dotile, useppm, quiet);
-	if(quiet) printf("\n");
-	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_422, i, argv[1], dotile, useppm, quiet);
-	if(quiet) printf("\n");
-	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_444, i, argv[1], dotile, useppm, quiet);
-
-	if(bmpbuf) free(bmpbuf);
-	return 0;
-}
diff --git a/libjpeg.gyp b/libjpeg.gyp
deleted file mode 100644
index c5d8dce..0000000
--- a/libjpeg.gyp
+++ /dev/null
@@ -1,317 +0,0 @@
-# Copyright (c) 2012 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-{
-  # This file is not used when use_system_libjpeg==1. Settings for building with
-  # the system libjpeg is in third_party/libjpeg/libjpeg.gyp.
-  'variables': {
-    'shared_generated_dir': '<(SHARED_INTERMEDIATE_DIR)/third_party/libjpeg_turbo',
-    'conditions': [
-      [ 'OS=="win"', {
-        'object_suffix': 'obj',
-      }, {
-        'object_suffix': 'o',
-      }],
-    ],
-  },
-  'targets': [
-    {
-      'target_name': 'libjpeg',
-      'type': 'static_library',
-      'include_dirs': [
-        '.',
-      ],
-      'defines': [
-        'WITH_SIMD',
-        'MOTION_JPEG_SUPPORTED',
-        'NO_GETENV',
-      ],
-      'sources': [
-        'jcapimin.c',
-        'jcapistd.c',
-        'jccoefct.c',
-        'jccolor.c',
-        'jcdctmgr.c',
-        'jchuff.c',
-        'jchuff.h',
-        'jcinit.c',
-        'jcmainct.c',
-        'jcmarker.c',
-        'jcmaster.c',
-        'jcomapi.c',
-        'jconfig.h',
-        'jcparam.c',
-        'jcphuff.c',
-        'jcprepct.c',
-        'jcsample.c',
-        'jdapimin.c',
-        'jdapistd.c',
-        'jdatadst.c',
-        'jdatasrc.c',
-        'jdcoefct.c',
-        'jdcolor.c',
-        'jdct.h',
-        'jddctmgr.c',
-        'jdhuff.c',
-        'jdhuff.h',
-        'jdinput.c',
-        'jdmainct.c',
-        'jdmarker.c',
-        'jdmaster.c',
-        'jdmerge.c',
-        'jdphuff.c',
-        'jdpostct.c',
-        'jdsample.c',
-        'jerror.c',
-        'jerror.h',
-        'jfdctflt.c',
-        'jfdctfst.c',
-        'jfdctint.c',
-        'jidctflt.c',
-        'jidctfst.c',
-        'jidctint.c',
-        'jidctred.c',
-        'jinclude.h',
-        'jmemmgr.c',
-        'jmemnobs.c',
-        'jmemsys.h',
-        'jmorecfg.h',
-        'jpegint.h',
-        'jpeglib.h',
-        'jpeglibmangler.h',
-        'jquant1.c',
-        'jquant2.c',
-        'jutils.c',
-        'jversion.h',
-      ],
-      'direct_dependent_settings': {
-        'include_dirs': [
-          '.',
-        ],
-      },
-      'msvs_disabled_warnings': [4018, 4101],
-      # VS2010 does not correctly incrementally link obj files generated
-      # from asm files. This flag disables UseLibraryDependencyInputs to
-      # avoid this problem.
-      'msvs_2010_disable_uldi_when_referenced': 1,
-      'conditions': [
-        [ 'OS!="win"', {'product_name': 'jpeg_turbo'}],
-        # Add target-specific source files.
-        [ 'target_arch=="ia32"', {
-          'sources': [
-            'simd/jsimd_i386.c',
-            'simd/jccolmmx.asm',
-            'simd/jccolss2.asm',
-            'simd/jcgrammx.asm',
-            'simd/jcgrass2.asm',
-            'simd/jcqnt3dn.asm',
-            'simd/jcqntmmx.asm',
-            'simd/jcqnts2f.asm',
-            'simd/jcqnts2i.asm',
-            'simd/jcqntsse.asm',
-            'simd/jcsammmx.asm',
-            'simd/jcsamss2.asm',
-            'simd/jdcolmmx.asm',
-            'simd/jdcolss2.asm',
-            'simd/jdmermmx.asm',
-            'simd/jdmerss2.asm',
-            'simd/jdsammmx.asm',
-            'simd/jdsamss2.asm',
-            'simd/jf3dnflt.asm',
-            'simd/jfmmxfst.asm',
-            'simd/jfmmxint.asm',
-            'simd/jfss2fst.asm',
-            'simd/jfss2int.asm',
-            'simd/jfsseflt.asm',
-            'simd/ji3dnflt.asm',
-            'simd/jimmxfst.asm',
-            'simd/jimmxint.asm',
-            'simd/jimmxred.asm',
-            'simd/jiss2flt.asm',
-            'simd/jiss2fst.asm',
-            'simd/jiss2int.asm',
-            'simd/jiss2red.asm',
-            'simd/jisseflt.asm',
-            'simd/jsimdcpu.asm',
-          ],
-        }],
-        [ 'target_arch=="x64" and msan!=1', {
-          'sources': [
-            'simd/jsimd_x86_64.c',
-            'simd/jccolss2-64.asm',
-            'simd/jcgrass2-64.asm',
-            'simd/jcqnts2f-64.asm',
-            'simd/jcqnts2i-64.asm',
-            'simd/jcsamss2-64.asm',
-            'simd/jdcolss2-64.asm',
-            'simd/jdmerss2-64.asm',
-            'simd/jdsamss2-64.asm',
-            'simd/jfss2fst-64.asm',
-            'simd/jfss2int-64.asm',
-            'simd/jfsseflt-64.asm',
-            'simd/jiss2flt-64.asm',
-            'simd/jiss2fst-64.asm',
-            'simd/jiss2int-64.asm',
-            'simd/jiss2red-64.asm',
-          ],
-        }],
-        # MemorySanitizer doesn't support assembly code, so keep it disabled in
-        # MSan builds for now.
-        [ 'msan==1', {
-          'sources': [
-            'jsimd_none.c',
-          ],
-        }],
-        # The ARM SIMD implementation can be used for devices that support
-        # the NEON instruction set. This can safely be done dynamically by
-        # probing CPU features at runtime, if you wish.
-        [ 'target_arch=="arm"', {
-          'conditions': [
-            [ 'arm_version >= 7 and (arm_neon == 1 or arm_neon_optional == 1)', {
-              'sources': [
-                'simd/jsimd_arm.c',
-                'simd/jsimd_arm_neon.S',
-              ],
-            }, {
-              'sources': [
-                'jsimd_none.c',
-              ],
-            }]
-          ],
-        }],
-        [ 'target_arch=="arm64"', {
-          'sources': [
-            'simd/jsimd_arm64.c',
-            'simd/jsimd_arm64_neon.S',
-          ],
-        }],
-        [ 'target_arch=="mipsel" or target_arch=="mips64el"', {
-          'sources': [
-            'jsimd_none.c',
-          ],
-        }],
-
-        # Build rules for an asm file.
-        # On Windows, we use the precompiled yasm binary. On Linux, we build
-        # our patched yasm and use it except when use_system_yasm is 1. On
-        # Mac, we always build our patched yasm and use it because of
-        # <http://www.tortall.net/projects/yasm/ticket/236>.
-        [ 'OS=="win"', {
-          'variables': {
-            'yasm_path': '../yasm/binaries/win/yasm<(EXECUTABLE_SUFFIX)',
-            'conditions': [
-              [ 'target_arch=="ia32"', {
-                'yasm_format': '-fwin32',
-                'yasm_flags': [
-                  '-D__x86__',
-                  '-DWIN32',
-                  '-DMSVC',
-                  '-Iwin/'
-                ],
-              }, {
-                'yasm_format': '-fwin64',
-                'yasm_flags': [
-                  '-D__x86_64__',
-                  '-DWIN64',
-                  '-DMSVC',
-                  '-Iwin/'
-                ],
-              }],
-            ],
-          },
-        }],
-        [ 'OS=="mac" or OS=="ios"', {
-          'dependencies': [
-            '../yasm/yasm.gyp:yasm#host',
-          ],
-          'variables': {
-            'yasm_path': '<(PRODUCT_DIR)/yasm',
-            'conditions': [
-              [ 'target_arch=="ia32"', {
-                'yasm_format': '-fmacho',
-                'yasm_flags': [
-                  '-D__x86__',
-                  '-DMACHO',
-                  '-Imac/'
-                ],
-              }, {
-                'yasm_format': '-fmacho64',
-                'yasm_flags': [
-                  '-D__x86_64__',
-                  '-DMACHO',
-                  '-Imac/'
-                ],
-              }],
-            ],
-          },
-        }],
-        [ 'OS=="linux" or OS=="freebsd" or (OS=="android" and (target_arch=="ia32" or target_arch=="x64"))', {
-          'conditions': [
-            [ 'use_system_yasm==0', {
-              'dependencies': [
-                '../yasm/yasm.gyp:yasm#host',
-              ],
-            }],
-          ],
-          'variables': {
-            'conditions': [
-              [ 'use_system_yasm==1', {
-                'yasm_path': '<!(which yasm)',
-              }, {
-                'yasm_path': '<(PRODUCT_DIR)/yasm',
-              }],
-              [ 'target_arch=="ia32"', {
-                'yasm_format': '-felf',
-                'yasm_flags': [
-                  '-D__x86__',
-                  '-DELF',
-                  '-Ilinux/'
-                ],
-              }, {
-                'yasm_format': '-felf64',
-                'yasm_flags': [
-                  '-D__x86_64__',
-                  '-DELF',
-                  '-Ilinux/'
-                ],
-              }],
-            ],
-          },
-        }],
-      ],
-      'rules': [
-        {
-          'rule_name': 'assemble',
-          'extension': 'asm',
-          'conditions': [
-            [ 'target_arch=="ia32" or target_arch=="x64"', {
-              'inputs': [ '<(yasm_path)', ],
-              'outputs': [
-                '<(shared_generated_dir)/<(RULE_INPUT_ROOT).<(object_suffix)',
-              ],
-              'action': [
-                '<(yasm_path)',
-                '<(yasm_format)',
-                '<@(yasm_flags)',
-                '-DRGBX_FILLER_0XFF',
-                '-DSTRICT_MEMORY_ACCESS',
-                '-Isimd/',
-                '-o', '<(shared_generated_dir)/<(RULE_INPUT_ROOT).<(object_suffix)',
-                '<(RULE_INPUT_PATH)',
-              ],
-              'process_outputs_as_sources': 1,
-              'message': 'Building <(RULE_INPUT_ROOT).<(object_suffix)',
-            }],
-          ]
-        },
-      ],
-    },
-  ],
-}
-
-# Local Variables:
-# tab-width:2
-# indent-tabs-mode:nil
-# End:
-# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/linux/jsimdcfg.inc b/linux/jsimdcfg.inc
deleted file mode 100644
index 9d4aede..0000000
--- a/linux/jsimdcfg.inc
+++ /dev/null
@@ -1,94 +0,0 @@
-;
-; Automatically generated include file from jsimdcfg.inc.h
-;
-;
-; -- jpeglib.h
-;
-%define DCTSIZE 8
-%define DCTSIZE2 64
-;
-; -- jmorecfg.h
-;
-%define RGB_RED 0
-%define RGB_GREEN 1
-%define RGB_BLUE 2
-%define RGB_PIXELSIZE 3
-%define EXT_RGB_RED 0
-%define EXT_RGB_GREEN 1
-%define EXT_RGB_BLUE 2
-%define EXT_RGB_PIXELSIZE 3
-%define EXT_RGBX_RED 0
-%define EXT_RGBX_GREEN 1
-%define EXT_RGBX_BLUE 2
-%define EXT_RGBX_PIXELSIZE 4
-%define EXT_BGR_RED 2
-%define EXT_BGR_GREEN 1
-%define EXT_BGR_BLUE 0
-%define EXT_BGR_PIXELSIZE 3
-%define EXT_BGRX_RED 2
-%define EXT_BGRX_GREEN 1
-%define EXT_BGRX_BLUE 0
-%define EXT_BGRX_PIXELSIZE 4
-%define EXT_XBGR_RED 3
-%define EXT_XBGR_GREEN 2
-%define EXT_XBGR_BLUE 1
-%define EXT_XBGR_PIXELSIZE 4
-%define EXT_XRGB_RED 1
-%define EXT_XRGB_GREEN 2
-%define EXT_XRGB_BLUE 3
-%define EXT_XRGB_PIXELSIZE 4
-%define RGBX_FILLER_0XFF 1
-; Representation of a single sample (pixel element value).
-; On this SIMD implementation, this must be 'unsigned char'.
-;
-%define JSAMPLE byte ; unsigned char
-%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
-%define CENTERJSAMPLE 128
-; Representation of a DCT frequency coefficient.
-; On this SIMD implementation, this must be 'short'.
-;
-%define JCOEF word ; short
-%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
-; Datatype used for image dimensions.
-; On this SIMD implementation, this must be 'unsigned int'.
-;
-%define JDIMENSION dword ; unsigned int
-%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
-%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
-%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
-%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
-%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
-%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
-%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
-%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
-%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
-;
-; -- jdct.h
-;
-; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
-; the DCT is to be performed in-place in that buffer.
-; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
-;
-%define DCTELEM word ; short
-%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
-%define float FP32 ; float
-%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float)
-; To maximize parallelism, Type short is changed to short.
-;
-%define ISLOW_MULT_TYPE word ; must be short
-%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
-%define IFAST_MULT_TYPE word ; must be short
-%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
-%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
-%define FLOAT_MULT_TYPE FP32 ; must be float
-%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
-;
-; -- jsimd.h
-;
-%define JSIMD_NONE 0x00
-%define JSIMD_MMX 0x01
-%define JSIMD_3DNOW 0x02
-%define JSIMD_SSE 0x04
-%define JSIMD_SSE2 0x08
-; Short forms of external names for systems with brain-damaged linkers.
-;
diff --git a/mac/jsimdcfg.inc b/mac/jsimdcfg.inc
deleted file mode 100644
index 9d4aede..0000000
--- a/mac/jsimdcfg.inc
+++ /dev/null
@@ -1,94 +0,0 @@
-;
-; Automatically generated include file from jsimdcfg.inc.h
-;
-;
-; -- jpeglib.h
-;
-%define DCTSIZE 8
-%define DCTSIZE2 64
-;
-; -- jmorecfg.h
-;
-%define RGB_RED 0
-%define RGB_GREEN 1
-%define RGB_BLUE 2
-%define RGB_PIXELSIZE 3
-%define EXT_RGB_RED 0
-%define EXT_RGB_GREEN 1
-%define EXT_RGB_BLUE 2
-%define EXT_RGB_PIXELSIZE 3
-%define EXT_RGBX_RED 0
-%define EXT_RGBX_GREEN 1
-%define EXT_RGBX_BLUE 2
-%define EXT_RGBX_PIXELSIZE 4
-%define EXT_BGR_RED 2
-%define EXT_BGR_GREEN 1
-%define EXT_BGR_BLUE 0
-%define EXT_BGR_PIXELSIZE 3
-%define EXT_BGRX_RED 2
-%define EXT_BGRX_GREEN 1
-%define EXT_BGRX_BLUE 0
-%define EXT_BGRX_PIXELSIZE 4
-%define EXT_XBGR_RED 3
-%define EXT_XBGR_GREEN 2
-%define EXT_XBGR_BLUE 1
-%define EXT_XBGR_PIXELSIZE 4
-%define EXT_XRGB_RED 1
-%define EXT_XRGB_GREEN 2
-%define EXT_XRGB_BLUE 3
-%define EXT_XRGB_PIXELSIZE 4
-%define RGBX_FILLER_0XFF 1
-; Representation of a single sample (pixel element value).
-; On this SIMD implementation, this must be 'unsigned char'.
-;
-%define JSAMPLE byte ; unsigned char
-%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
-%define CENTERJSAMPLE 128
-; Representation of a DCT frequency coefficient.
-; On this SIMD implementation, this must be 'short'.
-;
-%define JCOEF word ; short
-%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
-; Datatype used for image dimensions.
-; On this SIMD implementation, this must be 'unsigned int'.
-;
-%define JDIMENSION dword ; unsigned int
-%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
-%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
-%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
-%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
-%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
-%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
-%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
-%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
-%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
-;
-; -- jdct.h
-;
-; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
-; the DCT is to be performed in-place in that buffer.
-; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
-;
-%define DCTELEM word ; short
-%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
-%define float FP32 ; float
-%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float)
-; To maximize parallelism, Type short is changed to short.
-;
-%define ISLOW_MULT_TYPE word ; must be short
-%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
-%define IFAST_MULT_TYPE word ; must be short
-%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
-%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
-%define FLOAT_MULT_TYPE FP32 ; must be float
-%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
-;
-; -- jsimd.h
-;
-%define JSIMD_NONE 0x00
-%define JSIMD_MMX 0x01
-%define JSIMD_3DNOW 0x02
-%define JSIMD_SSE 0x04
-%define JSIMD_SSE2 0x08
-; Short forms of external names for systems with brain-damaged linkers.
-;
diff --git a/rrtimer.h b/rrtimer.h
deleted file mode 100644
index 4db5e37..0000000
--- a/rrtimer.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (C)2004 Landmark Graphics Corporation
- * Copyright (C)2005 Sun Microsystems, Inc.
- *
- * This library is free software and may be redistributed and/or modified under
- * the terms of the wxWindows Library License, Version 3.1 or (at your option)
- * any later version.  The full license is in the LICENSE.txt file included
- * with this distribution.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * wxWindows Library License for more details.
- */
-
-#ifndef __RRTIMER_H__
-#define __RRTIMER_H__
-
-#ifdef __cplusplus
-
-#ifdef _WIN32
-#include <windows.h>
-#else
-#include <sys/time.h>
-#endif
-
-class rrtimer
-{
-	public:
-
-		rrtimer(void) : t1(0.0)
-		{
-			#ifdef _WIN32
-			highres=false;  tick=0.001;
-			LARGE_INTEGER Frequency;
-			if(QueryPerformanceFrequency(&Frequency)!=0)
-			{
-				tick=(double)1.0/(double)(Frequency.QuadPart);
-				highres=true;
-			}
-			#endif
-		}
-
-		void start(void)
-		{
-			t1=time();
-		}
-
-		double time(void)
-		{
-			#ifdef _WIN32
-			if(highres)
-			{
-				LARGE_INTEGER Time;
-				QueryPerformanceCounter(&Time);
-				return((double)(Time.QuadPart)*tick);
-			}
-			else
-				return((double)GetTickCount()*tick);
-			#else
-			struct timeval __tv;
-			gettimeofday(&__tv, (struct timezone *)NULL);
-			return((double)(__tv.tv_sec)+(double)(__tv.tv_usec)*0.000001);
-			#endif
-		}
-
-		double elapsed(void)
-		{
-			return time()-t1;
-		}
-
-	private:
-
-		#ifdef _WIN32
-		bool highres;  double tick;
-		#endif
-		double t1;
-};
-
-#endif  // __cplusplus
-
-#ifdef _WIN32
-
-#include <windows.h>
-
-__inline double rrtime(void)
-{
-	LARGE_INTEGER Frequency, Time;
-	if(QueryPerformanceFrequency(&Frequency)!=0)
-	{
-		QueryPerformanceCounter(&Time);
-		return (double)Time.QuadPart/(double)Frequency.QuadPart;
-	}
-	else return (double)GetTickCount()*0.001;
-}
-
-#else
-
-#include <sys/time.h>
-
-#ifdef sun
-#define __inline inline
-#endif
-
-static __inline double rrtime(void)
-{
-	struct timeval __tv;
-	gettimeofday(&__tv, (struct timezone *)NULL);
-	return((double)__tv.tv_sec+(double)__tv.tv_usec*0.000001);
-}
-
-#endif
-
-#endif
-
diff --git a/rrutil.h b/rrutil.h
deleted file mode 100644
index 4b61dbf..0000000
--- a/rrutil.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (C)2004 Landmark Graphics Corporation
- * Copyright (C)2005 Sun Microsystems, Inc.
- * Copyright (C)2010 D. R. Commander
- *
- * This library is free software and may be redistributed and/or modified under
- * the terms of the wxWindows Library License, Version 3.1 or (at your option)
- * any later version.  The full license is in the LICENSE.txt file included
- * with this distribution.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * wxWindows Library License for more details.
- */
-
-#ifndef __RRUTIL_H__
-#define __RRUTIL_H__
-
-#ifdef _WIN32
-	#include <windows.h>
-	#define sleep(t) Sleep((t)*1000)
-	#define usleep(t) Sleep((t)/1000)
-#else
-	#include <unistd.h>
-	#define stricmp strcasecmp
-	#define strnicmp strncasecmp
-#endif
-
-#ifndef min
- #define min(a,b) ((a)<(b)?(a):(b))
-#endif
-
-#ifndef max
- #define max(a,b) ((a)>(b)?(a):(b))
-#endif
-
-#define pow2(i) (1<<(i))
-#define isPow2(x) (((x)&(x-1))==0)
-
-#ifdef sgi
-#define _SC_NPROCESSORS_CONF _SC_NPROC_CONF
-#endif
-
-#ifdef sun
-#define __inline inline
-#endif
-
-static __inline int numprocs(void)
-{
-	#ifdef _WIN32
-	DWORD_PTR ProcAff, SysAff, i;  int count=0;
-	if(!GetProcessAffinityMask(GetCurrentProcess(), &ProcAff, &SysAff)) return(1);
-	for(i=0; i<sizeof(long*)*8; i++) if(ProcAff&(1LL<<i)) count++;
-	return(count);
-	#elif defined (__APPLE__)
-	return(1);
-	#else
-	long count=1;
-	if((count=sysconf(_SC_NPROCESSORS_CONF))!=-1) return((int)count);
-	else return(1);
-	#endif
-}
-
-#define byteswap(i) ( \
-	(((i) & 0xff000000) >> 24) | \
-	(((i) & 0x00ff0000) >>  8) | \
-	(((i) & 0x0000ff00) <<  8) | \
-	(((i) & 0x000000ff) << 24) )
-
-#define byteswap16(i) ( \
-	(((i) & 0xff00) >> 8) | \
-	(((i) & 0x00ff) << 8) )
-
-static __inline int littleendian(void)
-{
-	unsigned int value=1;
-	unsigned char *ptr=(unsigned char *)(&value);
-	if(ptr[0]==1 && ptr[3]==0) return 1;
-	else return 0;
-}
-
-#endif
diff --git a/simd/jcclrmmx.asm b/simd/jcclrmmx.asm
deleted file mode 100644
index 7c93401..0000000
--- a/simd/jcclrmmx.asm
+++ /dev/null
@@ -1,477 +0,0 @@
-;
-; jcclrmmx.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
-;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                           JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION img_width
-%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
-%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
-%define output_row(b)	(b)+20		; JDIMENSION output_row
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		8
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_rgb_ycc_convert_mmx) PRIVATE
-
-EXTN(jsimd_rgb_ycc_convert_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [img_width(eax)]	; num_cols
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	esi, JSAMPIMAGE [output_buf(eax)]
-	mov	ecx, JDIMENSION [output_row(eax)]
-	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
-	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
-	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx
-
-	mov	esi, JSAMPARRAY [input_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	pushpic	eax
-	push	edx
-	push	ebx
-	push	edi
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr0
-	mov	ebx, JSAMPROW [ebx]	; outptr1
-	mov	edx, JSAMPROW [edx]	; outptr2
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	cmp	ecx, byte SIZEOF_MMWORD
-	jae	short .columnloop
-	alignx	16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-	push	eax
-	push	edx
-	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
-	test	cl, SIZEOF_BYTE
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_BYTE
-	xor	eax,eax
-	mov	al, BYTE [esi+ecx]
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_WORD
-	xor	edx,edx
-	mov	dx, WORD [esi+ecx]
-	shl	eax, WORD_BIT
-	or	eax,edx
-.column_ld4:
-	movd	mmA,eax
-	pop	edx
-	pop	eax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_DWORD
-	movd	mmG, DWORD [esi+ecx]
-	psllq	mmA, DWORD_BIT
-	por	mmA,mmG
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	movq	mmG,mmA
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	mov	ecx, SIZEOF_MMWORD
-	jmp	short .rgb_ycc_cnv
-.column_ld16:
-	test	cl, 2*SIZEOF_MMWORD
-	mov	ecx, SIZEOF_MMWORD
-	jz	short .rgb_ycc_cnv
-	movq	mmF,mmA
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-	jmp	short .rgb_ycc_cnv
-	alignx	16,7
-
-.columnloop:
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
-	; mmA=(00 10 20 01 11 21 02 12)
-	; mmG=(22 03 13 23 04 14 24 05)
-	; mmF=(15 25 06 16 26 07 17 27)
-
-	movq      mmD,mmA
-	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
-	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
-
-	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
-	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
-
-	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
-	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
-
-	movq      mmE,mmA
-	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
-	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
-
-	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
-	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
-
-	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
-	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
-
-	pxor      mmH,mmH
-
-	movq      mmC,mmA
-	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
-	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
-
-	movq      mmB,mmE
-	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
-	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
-
-	movq      mmF,mmD
-	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
-	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-	test	cl, SIZEOF_MMWORD/8
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_MMWORD/8
-	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-	test	cl, SIZEOF_MMWORD/4
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_MMWORD/4
-	movq	mmF,mmA
-	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
-	test	cl, SIZEOF_MMWORD/2
-	mov	ecx, SIZEOF_MMWORD
-	jz	short .rgb_ycc_cnv
-	movq	mmD,mmA
-	movq	mmC,mmF
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-	jmp	short .rgb_ycc_cnv
-	alignx	16,7
-
-.columnloop:
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
-	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
-	; mmA=(00 10 20 30 01 11 21 31)
-	; mmF=(02 12 22 32 03 13 23 33)
-	; mmD=(04 14 24 34 05 15 25 35)
-	; mmC=(06 16 26 36 07 17 27 37)
-
-	movq      mmB,mmA
-	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
-	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
-
-	movq      mmG,mmD
-	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
-	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
-
-	movq      mmE,mmA
-	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
-	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
-
-	movq      mmH,mmB
-	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
-	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
-
-	pxor      mmF,mmF
-
-	movq      mmC,mmA
-	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
-	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
-
-	movq      mmD,mmB
-	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
-	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
-
-	movq      mmG,mmE
-	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
-	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
-
-	punpcklbw mmF,mmH
-	punpckhbw mmH,mmH
-	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
-	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
-	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-	movq      MMWORD [wk(0)], mm0	; wk(0)=RE
-	movq      MMWORD [wk(1)], mm1	; wk(1)=RO
-	movq      MMWORD [wk(2)], mm4	; wk(2)=BE
-	movq      MMWORD [wk(3)], mm5	; wk(3)=BO
-
-	movq      mm6,mm1
-	punpcklwd mm1,mm3
-	punpckhwd mm6,mm3
-	movq      mm7,mm1
-	movq      mm4,mm6
-	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-	pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-	movq      MMWORD [wk(4)], mm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-	movq      MMWORD [wk(5)], mm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	pxor      mm1,mm1
-	pxor      mm6,mm6
-	punpcklwd mm1,mm5		; mm1=BOL
-	punpckhwd mm6,mm5		; mm6=BOH
-	psrld     mm1,1			; mm1=BOL*FIX(0.500)
-	psrld     mm6,1			; mm6=BOH*FIX(0.500)
-
-	movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
-
-	paddd     mm7,mm1
-	paddd     mm4,mm6
-	paddd     mm7,mm5
-	paddd     mm4,mm5
-	psrld     mm7,SCALEBITS		; mm7=CbOL
-	psrld     mm4,SCALEBITS		; mm4=CbOH
-	packssdw  mm7,mm4		; mm7=CbO
-
-	movq      mm1, MMWORD [wk(2)]	; mm1=BE
-
-	movq      mm6,mm0
-	punpcklwd mm0,mm2
-	punpckhwd mm6,mm2
-	movq      mm5,mm0
-	movq      mm4,mm6
-	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
-	pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-	movq      MMWORD [wk(6)], mm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movq      MMWORD [wk(7)], mm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	pxor      mm0,mm0
-	pxor      mm6,mm6
-	punpcklwd mm0,mm1		; mm0=BEL
-	punpckhwd mm6,mm1		; mm6=BEH
-	psrld     mm0,1			; mm0=BEL*FIX(0.500)
-	psrld     mm6,1			; mm6=BEH*FIX(0.500)
-
-	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
-	paddd     mm5,mm0
-	paddd     mm4,mm6
-	paddd     mm5,mm1
-	paddd     mm4,mm1
-	psrld     mm5,SCALEBITS		; mm5=CbEL
-	psrld     mm4,SCALEBITS		; mm4=CbEH
-	packssdw  mm5,mm4		; mm5=CbE
-
-	psllw     mm7,BYTE_BIT
-	por       mm5,mm7		; mm5=Cb
-	movq      MMWORD [ebx], mm5	; Save Cb
-
-	movq      mm0, MMWORD [wk(3)]	; mm0=BO
-	movq      mm6, MMWORD [wk(2)]	; mm6=BE
-	movq      mm1, MMWORD [wk(1)]	; mm1=RO
-
-	movq      mm4,mm0
-	punpcklwd mm0,mm3
-	punpckhwd mm4,mm3
-	movq      mm7,mm0
-	movq      mm5,mm4
-	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-	pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
-
-	paddd     mm0, MMWORD [wk(4)]
-	paddd     mm4, MMWORD [wk(5)]
-	paddd     mm0,mm3
-	paddd     mm4,mm3
-	psrld     mm0,SCALEBITS		; mm0=YOL
-	psrld     mm4,SCALEBITS		; mm4=YOH
-	packssdw  mm0,mm4		; mm0=YO
-
-	pxor      mm3,mm3
-	pxor      mm4,mm4
-	punpcklwd mm3,mm1		; mm3=ROL
-	punpckhwd mm4,mm1		; mm4=ROH
-	psrld     mm3,1			; mm3=ROL*FIX(0.500)
-	psrld     mm4,1			; mm4=ROH*FIX(0.500)
-
-	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
-	paddd     mm7,mm3
-	paddd     mm5,mm4
-	paddd     mm7,mm1
-	paddd     mm5,mm1
-	psrld     mm7,SCALEBITS		; mm7=CrOL
-	psrld     mm5,SCALEBITS		; mm5=CrOH
-	packssdw  mm7,mm5		; mm7=CrO
-
-	movq      mm3, MMWORD [wk(0)]	; mm3=RE
-
-	movq      mm4,mm6
-	punpcklwd mm6,mm2
-	punpckhwd mm4,mm2
-	movq      mm1,mm6
-	movq      mm5,mm4
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-	pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
-
-	paddd     mm6, MMWORD [wk(6)]
-	paddd     mm4, MMWORD [wk(7)]
-	paddd     mm6,mm2
-	paddd     mm4,mm2
-	psrld     mm6,SCALEBITS		; mm6=YEL
-	psrld     mm4,SCALEBITS		; mm4=YEH
-	packssdw  mm6,mm4		; mm6=YE
-
-	psllw     mm0,BYTE_BIT
-	por       mm6,mm0		; mm6=Y
-	movq      MMWORD [edi], mm6	; Save Y
-
-	pxor      mm2,mm2
-	pxor      mm4,mm4
-	punpcklwd mm2,mm3		; mm2=REL
-	punpckhwd mm4,mm3		; mm4=REH
-	psrld     mm2,1			; mm2=REL*FIX(0.500)
-	psrld     mm4,1			; mm4=REH*FIX(0.500)
-
-	movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
-
-	paddd     mm1,mm2
-	paddd     mm5,mm4
-	paddd     mm1,mm0
-	paddd     mm5,mm0
-	psrld     mm1,SCALEBITS		; mm1=CrEL
-	psrld     mm5,SCALEBITS		; mm5=CrEH
-	packssdw  mm1,mm5		; mm1=CrE
-
-	psllw     mm7,BYTE_BIT
-	por       mm1,mm7		; mm1=Cr
-	movq      MMWORD [edx], mm1	; Save Cr
-
-	sub	ecx, byte SIZEOF_MMWORD
-	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
-	add	edi, byte SIZEOF_MMWORD			; outptr0
-	add	ebx, byte SIZEOF_MMWORD			; outptr1
-	add	edx, byte SIZEOF_MMWORD			; outptr2
-	cmp	ecx, byte SIZEOF_MMWORD
-	jae	near .columnloop
-	test	ecx,ecx
-	jnz	near .column_ld1
-
-	pop	ecx			; col
-	pop	esi
-	pop	edi
-	pop	ebx
-	pop	edx
-	poppic	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_buf
-	add	edi, byte SIZEOF_JSAMPROW
-	add	ebx, byte SIZEOF_JSAMPROW
-	add	edx, byte SIZEOF_JSAMPROW
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcclrss2-64.asm b/simd/jcclrss2-64.asm
deleted file mode 100644
index 02ccaf3..0000000
--- a/simd/jcclrss2-64.asm
+++ /dev/null
@@ -1,485 +0,0 @@
-;
-; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2009, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		8
-
-	align	16
-
-	global	EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-	push	rbx
-
-	mov	rcx, r10
-	test	rcx,rcx
-	jz	near .return
-
-	push	rcx
-
-	mov rsi, r12
-	mov rcx, r13
-	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-	mov	rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
-	mov	rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
-	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]
-	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]
-	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
-	pop	rcx
-
-	mov rsi, r11
-	mov	eax, r14d
-	test	rax,rax
-	jle	near .return
-.rowloop:
-	push	rdx
-	push	rbx
-	push	rdi
-	push	rsi
-	push	rcx			; col
-
-	mov	rsi, JSAMPROW [rsi]	; inptr
-	mov	rdi, JSAMPROW [rdi]	; outptr0
-	mov	rbx, JSAMPROW [rbx]	; outptr1
-	mov	rdx, JSAMPROW [rdx]	; outptr2
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-	push	rax
-	push	rdx
-	lea	rcx,[rcx+rcx*2]		; imul ecx,RGB_PIXELSIZE
-	test	cl, SIZEOF_BYTE
-	jz	short .column_ld2
-	sub	rcx, byte SIZEOF_BYTE
-	movzx	rax, BYTE [rsi+rcx]
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	rcx, byte SIZEOF_WORD
-	movzx	rdx, WORD [rsi+rcx]
-	shl	rax, WORD_BIT
-	or	rax,rdx
-.column_ld4:
-	movd	xmmA,eax
-	pop	rdx
-	pop	rax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	rcx, byte SIZEOF_DWORD
-	movd	xmmF, XMM_DWORD [rsi+rcx]
-	pslldq	xmmA, SIZEOF_DWORD
-	por	xmmA,xmmF
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	sub	rcx, byte SIZEOF_MMWORD
-	movq	xmmB, XMM_MMWORD [rsi+rcx]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmB
-.column_ld16:
-	test	cl, SIZEOF_XMMWORD
-	jz	short .column_ld32
-	movdqa	xmmF,xmmA
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	mov	rcx, SIZEOF_XMMWORD
-	jmp	short .rgb_ycc_cnv
-.column_ld32:
-	test	cl, 2*SIZEOF_XMMWORD
-	mov	rcx, SIZEOF_XMMWORD
-	jz	short .rgb_ycc_cnv
-	movdqa	xmmB,xmmA
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_ycc_cnv
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	movdqa    xmmG,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-	movdqa    xmmD,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-	movdqa    xmmE,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-	pxor      xmmH,xmmH
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmB,xmmE
-	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-	movdqa    xmmF,xmmD
-	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-	test	cl, SIZEOF_XMMWORD/16
-	jz	short .column_ld2
-	sub	rcx, byte SIZEOF_XMMWORD/16
-	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
-	test	cl, SIZEOF_XMMWORD/8
-	jz	short .column_ld4
-	sub	rcx, byte SIZEOF_XMMWORD/8
-	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmE
-.column_ld4:
-	test	cl, SIZEOF_XMMWORD/4
-	jz	short .column_ld8
-	sub	rcx, byte SIZEOF_XMMWORD/4
-	movdqa	xmmE,xmmA
-	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
-	test	cl, SIZEOF_XMMWORD/2
-	mov	rcx, SIZEOF_XMMWORD
-	jz	short .rgb_ycc_cnv
-	movdqa	xmmF,xmmA
-	movdqa	xmmH,xmmE
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_ycc_cnv
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-	movdqa    xmmC,xmmF
-	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-	movdqa    xmmB,xmmA
-	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-	movdqa    xmmG,xmmD
-	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-	movdqa    xmmE,xmmA
-	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-	movdqa    xmmH,xmmB
-	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-	pxor      xmmF,xmmF
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmD,xmmB
-	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-	movdqa    xmmG,xmmE
-	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-	punpcklbw xmmF,xmmH
-	punpckhbw xmmH,xmmH
-	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
-	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
-	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
-	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
-	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
-
-	movdqa    xmm6,xmm1
-	punpcklwd xmm1,xmm3
-	punpckhwd xmm6,xmm3
-	movdqa    xmm7,xmm1
-	movdqa    xmm4,xmm6
-	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-	pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	pxor      xmm1,xmm1
-	pxor      xmm6,xmm6
-	punpcklwd xmm1,xmm5		; xmm1=BOL
-	punpckhwd xmm6,xmm5		; xmm6=BOH
-	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
-	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
-
-	movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm7,xmm1
-	paddd     xmm4,xmm6
-	paddd     xmm7,xmm5
-	paddd     xmm4,xmm5
-	psrld     xmm7,SCALEBITS	; xmm7=CbOL
-	psrld     xmm4,SCALEBITS	; xmm4=CbOH
-	packssdw  xmm7,xmm4		; xmm7=CbO
-
-	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
-
-	movdqa    xmm6,xmm0
-	punpcklwd xmm0,xmm2
-	punpckhwd xmm6,xmm2
-	movdqa    xmm5,xmm0
-	movdqa    xmm4,xmm6
-	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-	pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	pxor      xmm0,xmm0
-	pxor      xmm6,xmm6
-	punpcklwd xmm0,xmm1		; xmm0=BEL
-	punpckhwd xmm6,xmm1		; xmm6=BEH
-	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
-	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
-
-	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm5,xmm0
-	paddd     xmm4,xmm6
-	paddd     xmm5,xmm1
-	paddd     xmm4,xmm1
-	psrld     xmm5,SCALEBITS	; xmm5=CbEL
-	psrld     xmm4,SCALEBITS	; xmm4=CbEH
-	packssdw  xmm5,xmm4		; xmm5=CbE
-
-	psllw     xmm7,BYTE_BIT
-	por       xmm5,xmm7		; xmm5=Cb
-	movdqa    XMMWORD [rbx], xmm5	; Save Cb
-
-	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
-	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
-	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
-
-	movdqa    xmm4,xmm0
-	punpcklwd xmm0,xmm3
-	punpckhwd xmm4,xmm3
-	movdqa    xmm7,xmm0
-	movdqa    xmm5,xmm4
-	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-	pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]
-
-	paddd     xmm0, XMMWORD [wk(4)]
-	paddd     xmm4, XMMWORD [wk(5)]
-	paddd     xmm0,xmm3
-	paddd     xmm4,xmm3
-	psrld     xmm0,SCALEBITS	; xmm0=YOL
-	psrld     xmm4,SCALEBITS	; xmm4=YOH
-	packssdw  xmm0,xmm4		; xmm0=YO
-
-	pxor      xmm3,xmm3
-	pxor      xmm4,xmm4
-	punpcklwd xmm3,xmm1		; xmm3=ROL
-	punpckhwd xmm4,xmm1		; xmm4=ROH
-	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
-	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
-
-	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm7,xmm3
-	paddd     xmm5,xmm4
-	paddd     xmm7,xmm1
-	paddd     xmm5,xmm1
-	psrld     xmm7,SCALEBITS	; xmm7=CrOL
-	psrld     xmm5,SCALEBITS	; xmm5=CrOH
-	packssdw  xmm7,xmm5		; xmm7=CrO
-
-	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
-
-	movdqa    xmm4,xmm6
-	punpcklwd xmm6,xmm2
-	punpckhwd xmm4,xmm2
-	movdqa    xmm1,xmm6
-	movdqa    xmm5,xmm4
-	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-	pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]
-
-	paddd     xmm6, XMMWORD [wk(6)]
-	paddd     xmm4, XMMWORD [wk(7)]
-	paddd     xmm6,xmm2
-	paddd     xmm4,xmm2
-	psrld     xmm6,SCALEBITS	; xmm6=YEL
-	psrld     xmm4,SCALEBITS	; xmm4=YEH
-	packssdw  xmm6,xmm4		; xmm6=YE
-
-	psllw     xmm0,BYTE_BIT
-	por       xmm6,xmm0		; xmm6=Y
-	movdqa    XMMWORD [rdi], xmm6	; Save Y
-
-	pxor      xmm2,xmm2
-	pxor      xmm4,xmm4
-	punpcklwd xmm2,xmm3		; xmm2=REL
-	punpckhwd xmm4,xmm3		; xmm4=REH
-	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
-	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
-
-	movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm1,xmm2
-	paddd     xmm5,xmm4
-	paddd     xmm1,xmm0
-	paddd     xmm5,xmm0
-	psrld     xmm1,SCALEBITS	; xmm1=CrEL
-	psrld     xmm5,SCALEBITS	; xmm5=CrEH
-	packssdw  xmm1,xmm5		; xmm1=CrE
-
-	psllw     xmm7,BYTE_BIT
-	por       xmm1,xmm7		; xmm1=Cr
-	movdqa    XMMWORD [rdx], xmm1	; Save Cr
-
-	sub	rcx, byte SIZEOF_XMMWORD
-	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
-	add	rdi, byte SIZEOF_XMMWORD		; outptr0
-	add	rbx, byte SIZEOF_XMMWORD		; outptr1
-	add	rdx, byte SIZEOF_XMMWORD		; outptr2
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	test	rcx,rcx
-	jnz	near .column_ld1
-
-	pop	rcx			; col
-	pop	rsi
-	pop	rdi
-	pop	rbx
-	pop	rdx
-
-	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
-	add	rdi, byte SIZEOF_JSAMPROW
-	add	rbx, byte SIZEOF_JSAMPROW
-	add	rdx, byte SIZEOF_JSAMPROW
-	dec	rax				; num_rows
-	jg	near .rowloop
-
-.return:
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcclrss2.asm b/simd/jcclrss2.asm
deleted file mode 100644
index bcd51fc..0000000
--- a/simd/jcclrss2.asm
+++ /dev/null
@@ -1,503 +0,0 @@
-;
-; jcclrss2.asm - colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION img_width
-%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
-%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
-%define output_row(b)	(b)+20		; JDIMENSION output_row
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		8
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-
-	global	EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [img_width(eax)]
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	esi, JSAMPIMAGE [output_buf(eax)]
-	mov	ecx, JDIMENSION [output_row(eax)]
-	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
-	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
-	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx
-
-	mov	esi, JSAMPARRAY [input_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	pushpic	eax
-	push	edx
-	push	ebx
-	push	edi
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr0
-	mov	ebx, JSAMPROW [ebx]	; outptr1
-	mov	edx, JSAMPROW [edx]	; outptr2
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	alignx	16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-	push	eax
-	push	edx
-	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
-	test	cl, SIZEOF_BYTE
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_BYTE
-	movzx	eax, BYTE [esi+ecx]
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_WORD
-	movzx	edx, WORD [esi+ecx]
-	shl	eax, WORD_BIT
-	or	eax,edx
-.column_ld4:
-	movd	xmmA,eax
-	pop	edx
-	pop	eax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_DWORD
-	movd	xmmF, XMM_DWORD [esi+ecx]
-	pslldq	xmmA, SIZEOF_DWORD
-	por	xmmA,xmmF
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	sub	ecx, byte SIZEOF_MMWORD
-	movq	xmmB, XMM_MMWORD [esi+ecx]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmB
-.column_ld16:
-	test	cl, SIZEOF_XMMWORD
-	jz	short .column_ld32
-	movdqa	xmmF,xmmA
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	mov	ecx, SIZEOF_XMMWORD
-	jmp	short .rgb_ycc_cnv
-.column_ld32:
-	test	cl, 2*SIZEOF_XMMWORD
-	mov	ecx, SIZEOF_XMMWORD
-	jz	short .rgb_ycc_cnv
-	movdqa	xmmB,xmmA
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_ycc_cnv
-	alignx	16,7
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	movdqu	xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	movdqa    xmmG,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-	movdqa    xmmD,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-	movdqa    xmmE,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-	pxor      xmmH,xmmH
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmB,xmmE
-	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-	movdqa    xmmF,xmmD
-	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-	test	cl, SIZEOF_XMMWORD/16
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_XMMWORD/16
-	movd	xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-	test	cl, SIZEOF_XMMWORD/8
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_XMMWORD/8
-	movq	xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmE
-.column_ld4:
-	test	cl, SIZEOF_XMMWORD/4
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_XMMWORD/4
-	movdqa	xmmE,xmmA
-	movdqu	xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
-	test	cl, SIZEOF_XMMWORD/2
-	mov	ecx, SIZEOF_XMMWORD
-	jz	short .rgb_ycc_cnv
-	movdqa	xmmF,xmmA
-	movdqa	xmmH,xmmE
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_ycc_cnv
-	alignx	16,7
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
-	movdqu	xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-	movdqa    xmmC,xmmF
-	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-	movdqa    xmmB,xmmA
-	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-	movdqa    xmmG,xmmD
-	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-	movdqa    xmmE,xmmA
-	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-	movdqa    xmmH,xmmB
-	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-	pxor      xmmF,xmmF
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmD,xmmB
-	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-	movdqa    xmmG,xmmE
-	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-	punpcklbw xmmF,xmmH
-	punpckhbw xmmH,xmmH
-	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
-	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
-	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
-	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
-	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
-
-	movdqa    xmm6,xmm1
-	punpcklwd xmm1,xmm3
-	punpckhwd xmm6,xmm3
-	movdqa    xmm7,xmm1
-	movdqa    xmm4,xmm6
-	pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-	pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	pxor      xmm1,xmm1
-	pxor      xmm6,xmm6
-	punpcklwd xmm1,xmm5		; xmm1=BOL
-	punpckhwd xmm6,xmm5		; xmm6=BOH
-	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
-	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
-
-	movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm7,xmm1
-	paddd     xmm4,xmm6
-	paddd     xmm7,xmm5
-	paddd     xmm4,xmm5
-	psrld     xmm7,SCALEBITS	; xmm7=CbOL
-	psrld     xmm4,SCALEBITS	; xmm4=CbOH
-	packssdw  xmm7,xmm4		; xmm7=CbO
-
-	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
-
-	movdqa    xmm6,xmm0
-	punpcklwd xmm0,xmm2
-	punpckhwd xmm6,xmm2
-	movdqa    xmm5,xmm0
-	movdqa    xmm4,xmm6
-	pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-	pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	pxor      xmm0,xmm0
-	pxor      xmm6,xmm6
-	punpcklwd xmm0,xmm1		; xmm0=BEL
-	punpckhwd xmm6,xmm1		; xmm6=BEH
-	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
-	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
-
-	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm5,xmm0
-	paddd     xmm4,xmm6
-	paddd     xmm5,xmm1
-	paddd     xmm4,xmm1
-	psrld     xmm5,SCALEBITS	; xmm5=CbEL
-	psrld     xmm4,SCALEBITS	; xmm4=CbEH
-	packssdw  xmm5,xmm4		; xmm5=CbE
-
-	psllw     xmm7,BYTE_BIT
-	por       xmm5,xmm7		; xmm5=Cb
-	movdqa    XMMWORD [ebx], xmm5	; Save Cb
-
-	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
-	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
-	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
-
-	movdqa    xmm4,xmm0
-	punpcklwd xmm0,xmm3
-	punpckhwd xmm4,xmm3
-	movdqa    xmm7,xmm0
-	movdqa    xmm5,xmm4
-	pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-	pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-	movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)]	; xmm3=[PD_ONEHALF]
-
-	paddd     xmm0, XMMWORD [wk(4)]
-	paddd     xmm4, XMMWORD [wk(5)]
-	paddd     xmm0,xmm3
-	paddd     xmm4,xmm3
-	psrld     xmm0,SCALEBITS	; xmm0=YOL
-	psrld     xmm4,SCALEBITS	; xmm4=YOH
-	packssdw  xmm0,xmm4		; xmm0=YO
-
-	pxor      xmm3,xmm3
-	pxor      xmm4,xmm4
-	punpcklwd xmm3,xmm1		; xmm3=ROL
-	punpckhwd xmm4,xmm1		; xmm4=ROH
-	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
-	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
-
-	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm7,xmm3
-	paddd     xmm5,xmm4
-	paddd     xmm7,xmm1
-	paddd     xmm5,xmm1
-	psrld     xmm7,SCALEBITS	; xmm7=CrOL
-	psrld     xmm5,SCALEBITS	; xmm5=CrOH
-	packssdw  xmm7,xmm5		; xmm7=CrO
-
-	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
-
-	movdqa    xmm4,xmm6
-	punpcklwd xmm6,xmm2
-	punpckhwd xmm4,xmm2
-	movdqa    xmm1,xmm6
-	movdqa    xmm5,xmm4
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-	pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-	movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)]	; xmm2=[PD_ONEHALF]
-
-	paddd     xmm6, XMMWORD [wk(6)]
-	paddd     xmm4, XMMWORD [wk(7)]
-	paddd     xmm6,xmm2
-	paddd     xmm4,xmm2
-	psrld     xmm6,SCALEBITS	; xmm6=YEL
-	psrld     xmm4,SCALEBITS	; xmm4=YEH
-	packssdw  xmm6,xmm4		; xmm6=YE
-
-	psllw     xmm0,BYTE_BIT
-	por       xmm6,xmm0		; xmm6=Y
-	movdqa    XMMWORD [edi], xmm6	; Save Y
-
-	pxor      xmm2,xmm2
-	pxor      xmm4,xmm4
-	punpcklwd xmm2,xmm3		; xmm2=REL
-	punpckhwd xmm4,xmm3		; xmm4=REH
-	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
-	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
-
-	movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm1,xmm2
-	paddd     xmm5,xmm4
-	paddd     xmm1,xmm0
-	paddd     xmm5,xmm0
-	psrld     xmm1,SCALEBITS	; xmm1=CrEL
-	psrld     xmm5,SCALEBITS	; xmm5=CrEH
-	packssdw  xmm1,xmm5		; xmm1=CrE
-
-	psllw     xmm7,BYTE_BIT
-	por       xmm1,xmm7		; xmm1=Cr
-	movdqa    XMMWORD [edx], xmm1	; Save Cr
-
-	sub	ecx, byte SIZEOF_XMMWORD
-	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
-	add	edi, byte SIZEOF_XMMWORD		; outptr0
-	add	ebx, byte SIZEOF_XMMWORD		; outptr1
-	add	edx, byte SIZEOF_XMMWORD		; outptr2
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	test	ecx,ecx
-	jnz	near .column_ld1
-
-	pop	ecx			; col
-	pop	esi
-	pop	edi
-	pop	ebx
-	pop	edx
-	poppic	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_buf
-	add	edi, byte SIZEOF_JSAMPROW
-	add	ebx, byte SIZEOF_JSAMPROW
-	add	edx, byte SIZEOF_JSAMPROW
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jccolmmx.asm b/simd/jccolmmx.asm
deleted file mode 100644
index 1867abe..0000000
--- a/simd/jccolmmx.asm
+++ /dev/null
@@ -1,123 +0,0 @@
-;
-; jccolmmx.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_081	equ	 5329			; FIX(0.08131)
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_168	equ	11059			; FIX(0.16874)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_331	equ	21709			; FIX(0.33126)
-F_0_418	equ	27439			; FIX(0.41869)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_rgb_ycc_convert_mmx) PRIVATE
-
-EXTN(jconst_rgb_ycc_convert_mmx):
-
-PW_F0299_F0337	times 2 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 2 dw  F_0_114, F_0_250
-PW_MF016_MF033	times 2 dw -F_0_168,-F_0_331
-PW_MF008_MF041	times 2 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ	times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-
-%include "jcclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
-%include "jcclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
-%include "jcclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
-%include "jcclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
-%include "jcclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
-%include "jcclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
-%include "jcclrmmx.asm"
diff --git a/simd/jccolss2-64.asm b/simd/jccolss2-64.asm
deleted file mode 100644
index 6370293..0000000
--- a/simd/jccolss2-64.asm
+++ /dev/null
@@ -1,120 +0,0 @@
-;
-; jccolss2-64.asm - colorspace conversion (64-bit SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2009, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_081	equ	 5329			; FIX(0.08131)
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_168	equ	11059			; FIX(0.16874)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_331	equ	21709			; FIX(0.33126)
-F_0_418	equ	27439			; FIX(0.41869)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
-
-EXTN(jconst_rgb_ycc_convert_sse2):
-
-PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
-PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-
-%include "jcclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jcclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jcclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jcclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jcclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jcclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jcclrss2-64.asm"
diff --git a/simd/jccolss2.asm b/simd/jccolss2.asm
deleted file mode 100644
index abd6721..0000000
--- a/simd/jccolss2.asm
+++ /dev/null
@@ -1,120 +0,0 @@
-;
-; jccolss2.asm - colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2009, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_081	equ	 5329			; FIX(0.08131)
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_168	equ	11059			; FIX(0.16874)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_331	equ	21709			; FIX(0.33126)
-F_0_418	equ	27439			; FIX(0.41869)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
-
-EXTN(jconst_rgb_ycc_convert_sse2):
-
-PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
-PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-
-%include "jcclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jcclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jcclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jcclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jcclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jcclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jcclrss2.asm"
diff --git a/simd/jcgrammx.asm b/simd/jcgrammx.asm
deleted file mode 100644
index 8553b23..0000000
--- a/simd/jcgrammx.asm
+++ /dev/null
@@ -1,116 +0,0 @@
-;
-; jcgrammx.asm - grayscale colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2011 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_rgb_gray_convert_mmx) PRIVATE
-
-EXTN(jconst_rgb_gray_convert_mmx):
-
-PW_F0299_F0337	times 2 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 2 dw  F_0_114, F_0_250
-PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-
-%include "jcgrymmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
-%include "jcgrymmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
-%include "jcgrymmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
-%include "jcgrymmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
-%include "jcgrymmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
-%include "jcgrymmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
-%include "jcgrymmx.asm"
diff --git a/simd/jcgrass2-64.asm b/simd/jcgrass2-64.asm
deleted file mode 100644
index 7f025f9..0000000
--- a/simd/jcgrass2-64.asm
+++ /dev/null
@@ -1,113 +0,0 @@
-;
-; jcgrass2-64.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2011, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
-
-EXTN(jconst_rgb_gray_convert_sse2):
-
-PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
-PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-
-%include "jcgryss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryss2-64.asm"
diff --git a/simd/jcgrass2.asm b/simd/jcgrass2.asm
deleted file mode 100644
index 4a32e66..0000000
--- a/simd/jcgrass2.asm
+++ /dev/null
@@ -1,113 +0,0 @@
-;
-; jcgrass2.asm - grayscale colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2011, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
-
-EXTN(jconst_rgb_gray_convert_sse2):
-
-PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
-PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryss2.asm"
diff --git a/simd/jcgrymmx.asm b/simd/jcgrymmx.asm
deleted file mode 100644
index c85a5cb..0000000
--- a/simd/jcgrymmx.asm
+++ /dev/null
@@ -1,357 +0,0 @@
-;
-; jcgrymmx.asm - grayscale colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2011 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION img_width
-%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
-%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
-%define output_row(b)	(b)+20		; JDIMENSION output_row
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_rgb_gray_convert_mmx) PRIVATE
-
-EXTN(jsimd_rgb_gray_convert_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [img_width(eax)]	; num_cols
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	esi, JSAMPIMAGE [output_buf(eax)]
-	mov	ecx, JDIMENSION [output_row(eax)]
-	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx
-
-	mov	esi, JSAMPARRAY [input_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	pushpic	eax
-	push	edi
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr0
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	cmp	ecx, byte SIZEOF_MMWORD
-	jae	short .columnloop
-	alignx	16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-	push	eax
-	push	edx
-	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
-	test	cl, SIZEOF_BYTE
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_BYTE
-	xor	eax,eax
-	mov	al, BYTE [esi+ecx]
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_WORD
-	xor	edx,edx
-	mov	dx, WORD [esi+ecx]
-	shl	eax, WORD_BIT
-	or	eax,edx
-.column_ld4:
-	movd	mmA,eax
-	pop	edx
-	pop	eax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_DWORD
-	movd	mmG, DWORD [esi+ecx]
-	psllq	mmA, DWORD_BIT
-	por	mmA,mmG
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	movq	mmG,mmA
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	mov	ecx, SIZEOF_MMWORD
-	jmp	short .rgb_gray_cnv
-.column_ld16:
-	test	cl, 2*SIZEOF_MMWORD
-	mov	ecx, SIZEOF_MMWORD
-	jz	short .rgb_gray_cnv
-	movq	mmF,mmA
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-	jmp	short .rgb_gray_cnv
-	alignx	16,7
-
-.columnloop:
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
-	; mmA=(00 10 20 01 11 21 02 12)
-	; mmG=(22 03 13 23 04 14 24 05)
-	; mmF=(15 25 06 16 26 07 17 27)
-
-	movq      mmD,mmA
-	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
-	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
-
-	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
-	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
-
-	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
-	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
-
-	movq      mmE,mmA
-	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
-	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
-
-	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
-	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
-
-	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
-	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
-
-	pxor      mmH,mmH
-
-	movq      mmC,mmA
-	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
-	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
-
-	movq      mmB,mmE
-	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
-	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
-
-	movq      mmF,mmD
-	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
-	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-	test	cl, SIZEOF_MMWORD/8
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_MMWORD/8
-	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-	test	cl, SIZEOF_MMWORD/4
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_MMWORD/4
-	movq	mmF,mmA
-	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
-	test	cl, SIZEOF_MMWORD/2
-	mov	ecx, SIZEOF_MMWORD
-	jz	short .rgb_gray_cnv
-	movq	mmD,mmA
-	movq	mmC,mmF
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-	jmp	short .rgb_gray_cnv
-	alignx	16,7
-
-.columnloop:
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
-	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
-	; mmA=(00 10 20 30 01 11 21 31)
-	; mmF=(02 12 22 32 03 13 23 33)
-	; mmD=(04 14 24 34 05 15 25 35)
-	; mmC=(06 16 26 36 07 17 27 37)
-
-	movq      mmB,mmA
-	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
-	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
-
-	movq      mmG,mmD
-	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
-	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
-
-	movq      mmE,mmA
-	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
-	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
-
-	movq      mmH,mmB
-	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
-	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
-
-	pxor      mmF,mmF
-
-	movq      mmC,mmA
-	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
-	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
-
-	movq      mmD,mmB
-	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
-	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
-
-	movq      mmG,mmE
-	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
-	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
-
-	punpcklbw mmF,mmH
-	punpckhbw mmH,mmH
-	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
-	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
-	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-	movq      mm6,mm1
-	punpcklwd mm1,mm3
-	punpckhwd mm6,mm3
-	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movq      mm7, mm6	; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movq      mm6,mm0
-	punpcklwd mm0,mm2
-	punpckhwd mm6,mm2
-	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movq      MMWORD [wk(0)], mm0	; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movq      MMWORD [wk(1)], mm6	; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movq      mm0, mm5	; mm0=BO
-	movq      mm6, mm4	; mm6=BE
-
-	movq      mm4,mm0
-	punpcklwd mm0,mm3
-	punpckhwd mm4,mm3
-	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
-
-	paddd     mm0, mm1
-	paddd     mm4, mm7
-	paddd     mm0,mm3
-	paddd     mm4,mm3
-	psrld     mm0,SCALEBITS		; mm0=YOL
-	psrld     mm4,SCALEBITS		; mm4=YOH
-	packssdw  mm0,mm4		; mm0=YO
-
-	movq      mm4,mm6
-	punpcklwd mm6,mm2
-	punpckhwd mm4,mm2
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
-
-	paddd     mm6, MMWORD [wk(0)]
-	paddd     mm4, MMWORD [wk(1)]
-	paddd     mm6,mm2
-	paddd     mm4,mm2
-	psrld     mm6,SCALEBITS		; mm6=YEL
-	psrld     mm4,SCALEBITS		; mm4=YEH
-	packssdw  mm6,mm4		; mm6=YE
-
-	psllw     mm0,BYTE_BIT
-	por       mm6,mm0		; mm6=Y
-	movq      MMWORD [edi], mm6	; Save Y
-
-	sub	ecx, byte SIZEOF_MMWORD
-	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
-	add	edi, byte SIZEOF_MMWORD			; outptr0
-	cmp	ecx, byte SIZEOF_MMWORD
-	jae	near .columnloop
-	test	ecx,ecx
-	jnz	near .column_ld1
-
-	pop	ecx			; col
-	pop	esi
-	pop	edi
-	poppic	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_buf
-	add	edi, byte SIZEOF_JSAMPROW
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcgryss2-64.asm b/simd/jcgryss2-64.asm
deleted file mode 100644
index b4cee92..0000000
--- a/simd/jcgryss2-64.asm
+++ /dev/null
@@ -1,364 +0,0 @@
-;
-; jcgryss2-64.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2011, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                              JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-
-	global	EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
-
-EXTN(jsimd_rgb_gray_convert_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-	push	rbx
-
-	mov	rcx, r10
-	test	rcx,rcx
-	jz	near .return
-
-	push	rcx
-
-	mov rsi, r12
-	mov rcx, r13
-	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]
-
-	pop	rcx
-
-	mov rsi, r11
-	mov	eax, r14d
-	test	rax,rax
-	jle	near .return
-.rowloop:
-	push	rdi
-	push	rsi
-	push	rcx			; col
-
-	mov	rsi, JSAMPROW [rsi]	; inptr
-	mov	rdi, JSAMPROW [rdi]	; outptr0
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-	push	rax
-	push	rdx
-	lea	rcx,[rcx+rcx*2]		; imul ecx,RGB_PIXELSIZE
-	test	cl, SIZEOF_BYTE
-	jz	short .column_ld2
-	sub	rcx, byte SIZEOF_BYTE
-	movzx	rax, BYTE [rsi+rcx]
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	rcx, byte SIZEOF_WORD
-	movzx	rdx, WORD [rsi+rcx]
-	shl	rax, WORD_BIT
-	or	rax,rdx
-.column_ld4:
-	movd	xmmA,eax
-	pop	rdx
-	pop	rax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	rcx, byte SIZEOF_DWORD
-	movd	xmmF, XMM_DWORD [rsi+rcx]
-	pslldq	xmmA, SIZEOF_DWORD
-	por	xmmA,xmmF
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	sub	rcx, byte SIZEOF_MMWORD
-	movq	xmmB, XMM_MMWORD [rsi+rcx]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmB
-.column_ld16:
-	test	cl, SIZEOF_XMMWORD
-	jz	short .column_ld32
-	movdqa	xmmF,xmmA
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	mov	rcx, SIZEOF_XMMWORD
-	jmp	short .rgb_gray_cnv
-.column_ld32:
-	test	cl, 2*SIZEOF_XMMWORD
-	mov	rcx, SIZEOF_XMMWORD
-	jz	short .rgb_gray_cnv
-	movdqa	xmmB,xmmA
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_gray_cnv
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	movdqa    xmmG,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-	movdqa    xmmD,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-	movdqa    xmmE,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-	pxor      xmmH,xmmH
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmB,xmmE
-	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-	movdqa    xmmF,xmmD
-	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-	test	cl, SIZEOF_XMMWORD/16
-	jz	short .column_ld2
-	sub	rcx, byte SIZEOF_XMMWORD/16
-	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
-	test	cl, SIZEOF_XMMWORD/8
-	jz	short .column_ld4
-	sub	rcx, byte SIZEOF_XMMWORD/8
-	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmE
-.column_ld4:
-	test	cl, SIZEOF_XMMWORD/4
-	jz	short .column_ld8
-	sub	rcx, byte SIZEOF_XMMWORD/4
-	movdqa	xmmE,xmmA
-	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
-	test	cl, SIZEOF_XMMWORD/2
-	mov	rcx, SIZEOF_XMMWORD
-	jz	short .rgb_gray_cnv
-	movdqa	xmmF,xmmA
-	movdqa	xmmH,xmmE
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_gray_cnv
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-	movdqa    xmmC,xmmF
-	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-	movdqa    xmmB,xmmA
-	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-	movdqa    xmmG,xmmD
-	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-	movdqa    xmmE,xmmA
-	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-	movdqa    xmmH,xmmB
-	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-	pxor      xmmF,xmmF
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmD,xmmB
-	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-	movdqa    xmmG,xmmE
-	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-	punpcklbw xmmF,xmmH
-	punpckhbw xmmH,xmmH
-	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
-	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-	movdqa    xmm6,xmm1
-	punpcklwd xmm1,xmm3
-	punpckhwd xmm6,xmm3
-	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movdqa    xmm7, xmm6	; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movdqa    xmm6,xmm0
-	punpcklwd xmm0,xmm2
-	punpckhwd xmm6,xmm2
-	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movdqa    XMMWORD [wk(1)], xmm6	; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movdqa    xmm0, xmm5	; xmm0=BO
-	movdqa    xmm6, xmm4	; xmm6=BE
-
-	movdqa    xmm4,xmm0
-	punpcklwd xmm0,xmm3
-	punpckhwd xmm4,xmm3
-	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]
-
-	paddd     xmm0, xmm1
-	paddd     xmm4, xmm7
-	paddd     xmm0,xmm3
-	paddd     xmm4,xmm3
-	psrld     xmm0,SCALEBITS	; xmm0=YOL
-	psrld     xmm4,SCALEBITS	; xmm4=YOH
-	packssdw  xmm0,xmm4		; xmm0=YO
-
-	movdqa    xmm4,xmm6
-	punpcklwd xmm6,xmm2
-	punpckhwd xmm4,xmm2
-	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]
-
-	paddd     xmm6, XMMWORD [wk(0)]
-	paddd     xmm4, XMMWORD [wk(1)]
-	paddd     xmm6,xmm2
-	paddd     xmm4,xmm2
-	psrld     xmm6,SCALEBITS	; xmm6=YEL
-	psrld     xmm4,SCALEBITS	; xmm4=YEH
-	packssdw  xmm6,xmm4		; xmm6=YE
-
-	psllw     xmm0,BYTE_BIT
-	por       xmm6,xmm0		; xmm6=Y
-	movdqa    XMMWORD [rdi], xmm6	; Save Y
-
-	sub	rcx, byte SIZEOF_XMMWORD
-	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
-	add	rdi, byte SIZEOF_XMMWORD		; outptr0
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	test	rcx,rcx
-	jnz	near .column_ld1
-
-	pop	rcx			; col
-	pop	rsi
-	pop	rdi
-
-	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
-	add	rdi, byte SIZEOF_JSAMPROW
-	dec	rax				; num_rows
-	jg	near .rowloop
-
-.return:
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcgryss2.asm b/simd/jcgryss2.asm
deleted file mode 100644
index 53d5f94..0000000
--- a/simd/jcgryss2.asm
+++ /dev/null
@@ -1,383 +0,0 @@
-;
-; jcgryss2.asm - grayscale colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2011, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                              JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION img_width
-%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
-%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
-%define output_row(b)	(b)+20		; JDIMENSION output_row
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-
-	global	EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
-
-EXTN(jsimd_rgb_gray_convert_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [img_width(eax)]
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	esi, JSAMPIMAGE [output_buf(eax)]
-	mov	ecx, JDIMENSION [output_row(eax)]
-	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx
-
-	mov	esi, JSAMPARRAY [input_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	pushpic	eax
-	push	edi
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr0
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	alignx	16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-	push	eax
-	push	edx
-	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
-	test	cl, SIZEOF_BYTE
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_BYTE
-	movzx	eax, BYTE [esi+ecx]
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_WORD
-	movzx	edx, WORD [esi+ecx]
-	shl	eax, WORD_BIT
-	or	eax,edx
-.column_ld4:
-	movd	xmmA,eax
-	pop	edx
-	pop	eax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_DWORD
-	movd	xmmF, XMM_DWORD [esi+ecx]
-	pslldq	xmmA, SIZEOF_DWORD
-	por	xmmA,xmmF
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	sub	ecx, byte SIZEOF_MMWORD
-	movq	xmmB, XMM_MMWORD [esi+ecx]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmB
-.column_ld16:
-	test	cl, SIZEOF_XMMWORD
-	jz	short .column_ld32
-	movdqa	xmmF,xmmA
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	mov	ecx, SIZEOF_XMMWORD
-	jmp	short .rgb_gray_cnv
-.column_ld32:
-	test	cl, 2*SIZEOF_XMMWORD
-	mov	ecx, SIZEOF_XMMWORD
-	jz	short .rgb_gray_cnv
-	movdqa	xmmB,xmmA
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_gray_cnv
-	alignx	16,7
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	movdqu	xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	movdqa    xmmG,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-	movdqa    xmmD,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-	movdqa    xmmE,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-	pxor      xmmH,xmmH
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmB,xmmE
-	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-	movdqa    xmmF,xmmD
-	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-	test	cl, SIZEOF_XMMWORD/16
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_XMMWORD/16
-	movd	xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-	test	cl, SIZEOF_XMMWORD/8
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_XMMWORD/8
-	movq	xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmE
-.column_ld4:
-	test	cl, SIZEOF_XMMWORD/4
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_XMMWORD/4
-	movdqa	xmmE,xmmA
-	movdqu	xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
-	test	cl, SIZEOF_XMMWORD/2
-	mov	ecx, SIZEOF_XMMWORD
-	jz	short .rgb_gray_cnv
-	movdqa	xmmF,xmmA
-	movdqa	xmmH,xmmE
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_gray_cnv
-	alignx	16,7
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
-	movdqu	xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-	movdqa    xmmC,xmmF
-	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-	movdqa    xmmB,xmmA
-	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-	movdqa    xmmG,xmmD
-	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-	movdqa    xmmE,xmmA
-	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-	movdqa    xmmH,xmmB
-	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-	pxor      xmmF,xmmF
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmD,xmmB
-	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-	movdqa    xmmG,xmmE
-	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-	punpcklbw xmmF,xmmH
-	punpckhbw xmmH,xmmH
-	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
-	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-	movdqa    xmm6,xmm1
-	punpcklwd xmm1,xmm3
-	punpckhwd xmm6,xmm3
-	pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movdqa    xmm7, xmm6	; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movdqa    xmm6,xmm0
-	punpcklwd xmm0,xmm2
-	punpckhwd xmm6,xmm2
-	pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movdqa    XMMWORD [wk(1)], xmm6	; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movdqa    xmm0, xmm5	; xmm0=BO
-	movdqa    xmm6, xmm4	; xmm6=BE
-
-	movdqa    xmm4,xmm0
-	punpcklwd xmm0,xmm3
-	punpckhwd xmm4,xmm3
-	pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-	movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)]	; xmm3=[PD_ONEHALF]
-
-	paddd     xmm0, xmm1
-	paddd     xmm4, xmm7
-	paddd     xmm0,xmm3
-	paddd     xmm4,xmm3
-	psrld     xmm0,SCALEBITS	; xmm0=YOL
-	psrld     xmm4,SCALEBITS	; xmm4=YOH
-	packssdw  xmm0,xmm4		; xmm0=YO
-
-	movdqa    xmm4,xmm6
-	punpcklwd xmm6,xmm2
-	punpckhwd xmm4,xmm2
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-	movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)]	; xmm2=[PD_ONEHALF]
-
-	paddd     xmm6, XMMWORD [wk(0)]
-	paddd     xmm4, XMMWORD [wk(1)]
-	paddd     xmm6,xmm2
-	paddd     xmm4,xmm2
-	psrld     xmm6,SCALEBITS	; xmm6=YEL
-	psrld     xmm4,SCALEBITS	; xmm4=YEH
-	packssdw  xmm6,xmm4		; xmm6=YE
-
-	psllw     xmm0,BYTE_BIT
-	por       xmm6,xmm0		; xmm6=Y
-	movdqa    XMMWORD [edi], xmm6	; Save Y
-
-	sub	ecx, byte SIZEOF_XMMWORD
-	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
-	add	edi, byte SIZEOF_XMMWORD		; outptr0
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	test	ecx,ecx
-	jnz	near .column_ld1
-
-	pop	ecx			; col
-	pop	esi
-	pop	edi
-	poppic	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_buf
-	add	edi, byte SIZEOF_JSAMPROW
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqnt3dn.asm b/simd/jcqnt3dn.asm
deleted file mode 100644
index 480777d..0000000
--- a/simd/jcqnt3dn.asm
+++ /dev/null
@@ -1,233 +0,0 @@
-;
-; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                             FAST_FLOAT * workspace);
-;
-
-%define sample_data	ebp+8		; JSAMPARRAY sample_data
-%define start_col	ebp+12		; JDIMENSION start_col
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_float_3dnow) PRIVATE
-
-EXTN(jsimd_convsamp_float_3dnow):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	pcmpeqw  mm7,mm7
-	psllw    mm7,7
-	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
-	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [start_col]
-	mov	edi, POINTER [workspace]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/2
-	alignx	16,7
-.convloop:
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-	psubb	mm0,mm7				; mm0=(01234567)
-	psubb	mm1,mm7				; mm1=(89ABCDEF)
-
-	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
-	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
-	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
-	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
-
-	punpcklwd mm4,mm2			; mm4=(***0***1)
-	punpckhwd mm2,mm2			; mm2=(***2***3)
-	punpcklwd mm5,mm0			; mm5=(***4***5)
-	punpckhwd mm0,mm0			; mm0=(***6***7)
-
-	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
-	psrad	mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
-	pi2fd	mm4,mm4
-	pi2fd	mm2,mm2
-	psrad	mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
-	psrad	mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
-	pi2fd	mm5,mm5
-	pi2fd	mm0,mm0
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
-	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
-	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-
-	punpcklwd mm6,mm3			; mm6=(***8***9)
-	punpckhwd mm3,mm3			; mm3=(***A***B)
-	punpcklwd mm4,mm1			; mm4=(***C***D)
-	punpckhwd mm1,mm1			; mm1=(***E***F)
-
-	psrad	mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
-	psrad	mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
-	pi2fd	mm6,mm6
-	pi2fd	mm3,mm3
-	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
-	psrad	mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
-	pi2fd	mm4,mm4
-	pi2fd	mm1,mm1
-
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
-	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
-
-	add	esi, byte 2*SIZEOF_JSAMPROW
-	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .convloop
-
-	femms		; empty MMX/3DNow! state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                             FAST_FLOAT * workspace);
-;
-
-%define coef_block	ebp+8		; JCOEFPTR coef_block
-%define divisors	ebp+12		; FAST_FLOAT * divisors
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_float_3dnow) PRIVATE
-
-EXTN(jsimd_quantize_float_3dnow):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov       eax, 0x4B400000	; (float)0x00C00000 (rndint_magic)
-	movd      mm7,eax
-	punpckldq mm7,mm7		; mm7={12582912.0F 12582912.0F}
-
-	mov	esi, POINTER [workspace]
-	mov	edx, POINTER [divisors]
-	mov	edi, JCOEFPTR [coef_block]
-	mov	eax, DCTSIZE2/16
-	alignx	16,7
-.quantloop:
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-	pfmul	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	pfmul	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
-	pfmul	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
-	pfmul	mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
-
-	pfadd	mm0,mm7			; mm0=(00 ** 01 **)
-	pfadd	mm1,mm7			; mm1=(02 ** 03 **)
-	pfadd	mm2,mm7			; mm0=(04 ** 05 **)
-	pfadd	mm3,mm7			; mm1=(06 ** 07 **)
-
-	movq      mm4,mm0
-	punpcklwd mm0,mm1		; mm0=(00 02 ** **)
-	punpckhwd mm4,mm1		; mm4=(01 03 ** **)
-	movq      mm5,mm2
-	punpcklwd mm2,mm3		; mm2=(04 06 ** **)
-	punpckhwd mm5,mm3		; mm5=(05 07 ** **)
-
-	punpcklwd mm0,mm4		; mm0=(00 01 02 03)
-	punpcklwd mm2,mm5		; mm2=(04 05 06 07)
-
-	movq	mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-	pfmul	mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	pfmul	mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
-	pfmul	mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
-	pfmul	mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
-	pfadd	mm6,mm7			; mm0=(10 ** 11 **)
-	pfadd	mm1,mm7			; mm4=(12 ** 13 **)
-	pfadd	mm3,mm7			; mm0=(14 ** 15 **)
-	pfadd	mm4,mm7			; mm4=(16 ** 17 **)
-
-	movq      mm5,mm6
-	punpcklwd mm6,mm1		; mm6=(10 12 ** **)
-	punpckhwd mm5,mm1		; mm5=(11 13 ** **)
-	movq      mm1,mm3
-	punpcklwd mm3,mm4		; mm3=(14 16 ** **)
-	punpckhwd mm1,mm4		; mm1=(15 17 ** **)
-
-	punpcklwd mm6,mm5		; mm6=(10 11 12 13)
-	punpcklwd mm3,mm1		; mm3=(14 15 16 17)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
-	add	esi, byte 16*SIZEOF_FAST_FLOAT
-	add	edx, byte 16*SIZEOF_FAST_FLOAT
-	add	edi, byte 16*SIZEOF_JCOEF
-	dec	eax
-	jnz	near .quantloop
-
-	femms		; empty MMX/3DNow! state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqntmmx.asm b/simd/jcqntmmx.asm
deleted file mode 100644
index 62e00b6..0000000
--- a/simd/jcqntmmx.asm
+++ /dev/null
@@ -1,274 +0,0 @@
-;
-; jcqntmmx.asm - sample data conversion and quantization (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                     DCTELEM * workspace);
-;
-
-%define sample_data	ebp+8		; JSAMPARRAY sample_data
-%define start_col	ebp+12		; JDIMENSION start_col
-%define workspace	ebp+16		; DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_mmx) PRIVATE
-
-EXTN(jsimd_convsamp_mmx):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	pxor	mm6,mm6			; mm6=(all 0's)
-	pcmpeqw	mm7,mm7
-	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
-
-	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [start_col]
-	mov	edi, POINTER [workspace]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.convloop:
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm0=(01234567)
-	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm1=(89ABCDEF)
-
-	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm2=(GHIJKLMN)
-	movq	mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm3=(OPQRSTUV)
-
-	movq      mm4,mm0
-	punpcklbw mm0,mm6		; mm0=(0123)
-	punpckhbw mm4,mm6		; mm4=(4567)
-	movq      mm5,mm1
-	punpcklbw mm1,mm6		; mm1=(89AB)
-	punpckhbw mm5,mm6		; mm5=(CDEF)
-
-	paddw	mm0,mm7
-	paddw	mm4,mm7
-	paddw	mm1,mm7
-	paddw	mm5,mm7
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
-
-	movq      mm0,mm2
-	punpcklbw mm2,mm6		; mm2=(GHIJ)
-	punpckhbw mm0,mm6		; mm0=(KLMN)
-	movq      mm4,mm3
-	punpcklbw mm3,mm6		; mm3=(OPQR)
-	punpckhbw mm4,mm6		; mm4=(STUV)
-
-	paddw	mm2,mm7
-	paddw	mm0,mm7
-	paddw	mm3,mm7
-	paddw	mm4,mm7
-
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
-	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
-	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
-
-	add	esi, byte 4*SIZEOF_JSAMPROW
-	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	short .convloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
-;                     DCTELEM * workspace);
-;
-
-%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block	ebp+8		; JCOEFPTR coef_block
-%define divisors	ebp+12		; DCTELEM * divisors
-%define workspace	ebp+16		; DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_mmx) PRIVATE
-
-EXTN(jsimd_quantize_mmx):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	esi, POINTER [workspace]
-	mov	edx, POINTER [divisors]
-	mov	edi, JCOEFPTR [coef_block]
-	mov	ah, 2
-	alignx	16,7
-.quantloop1:
-	mov	al, DCTSIZE2/8/2
-	alignx	16,7
-.quantloop2:
-	movq	mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
-
-	movq	mm0,mm2
-	movq	mm1,mm3
-
-	psraw	mm2,(WORD_BIT-1)  ; -1 if value < 0, 0 otherwise
-	psraw	mm3,(WORD_BIT-1)
-
-	pxor	mm0,mm2   ; val = -val
-	pxor	mm1,mm3
-	psubw	mm0,mm2
-	psubw	mm1,mm3
-
-	;
-	; MMX is an annoyingly crappy instruction set. It has two
-	; misfeatures that are causing problems here:
-	;
-	; - All multiplications are signed.
-	;
-	; - The second operand for the shifts is not treated as packed.
-	;
-	;
-	; We work around the first problem by implementing this algorithm:
-	;
-	; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
-	; {
-	;   enum { SHORT_BIT = 16 };
-	;   signed short sx = (signed short) x;
-	;   signed short sy = (signed short) y;
-	;   signed long sz;
-	; 
-	;   sz = (long) sx * (long) sy;     /* signed multiply */
-	; 
-	;   if (sx < 0) sz += (long) sy << SHORT_BIT;
-	;   if (sy < 0) sz += (long) sx << SHORT_BIT;
-	; 
-	;   return (unsigned long) sz;
-	; }
-	;
-	; (note that a negative sx adds _sy_ and vice versa)
-	;
-	; For the second problem, we replace the shift by a multiplication.
-	; Unfortunately that means we have to deal with the signed issue again.
-	;
-
-	paddw	mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
-	paddw	mm1, MMWORD [CORRECTION(0,1,edx)]
-
-	movq	mm4,mm0   ; store current value for later
-	movq	mm5,mm1
-	pmulhw	mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
-	pmulhw	mm1, MMWORD [RECIPROCAL(0,1,edx)]
-	paddw	mm0,mm4		; reciprocal is always negative (MSB=1),
-	paddw	mm1,mm5   ; so we always need to add the initial value
-	                ; (input value is never negative as we
-	                ; inverted it at the start of this routine)
-
-	; here it gets a bit tricky as both scale
-	; and mm0/mm1 can be negative
-	movq	mm6, MMWORD [SCALE(0,0,edx)]	; scale
-	movq	mm7, MMWORD [SCALE(0,1,edx)]
-	movq	mm4,mm0
-	movq	mm5,mm1
-	pmulhw	mm0,mm6
-	pmulhw	mm1,mm7
-
-	psraw	mm6,(WORD_BIT-1)    ; determine if scale is negative
-	psraw	mm7,(WORD_BIT-1)
-
-	pand	mm6,mm4             ; and add input if it is
-	pand	mm7,mm5
-	paddw	mm0,mm6
-	paddw	mm1,mm7
-
-	psraw	mm4,(WORD_BIT-1)    ; then check if negative input 
-	psraw	mm5,(WORD_BIT-1)
-
-	pand	mm4, MMWORD [SCALE(0,0,edx)]	; and add scale if it is
-	pand	mm5, MMWORD [SCALE(0,1,edx)]
-	paddw	mm0,mm4
-	paddw	mm1,mm5
-
-	pxor	mm0,mm2   ; val = -val
-	pxor	mm1,mm3
-	psubw	mm0,mm2
-	psubw	mm1,mm3
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
-
-	add	esi, byte 8*SIZEOF_DCTELEM
-	add	edx, byte 8*SIZEOF_DCTELEM
-	add	edi, byte 8*SIZEOF_JCOEF
-	dec	al
-	jnz	near .quantloop2
-	dec	ah
-	jnz	near .quantloop1	; to avoid branch misprediction
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqnts2f-64.asm b/simd/jcqnts2f-64.asm
deleted file mode 100644
index 5ee98b3..0000000
--- a/simd/jcqnts2f-64.asm
+++ /dev/null
@@ -1,158 +0,0 @@
-;
-; jcqnts2f-64.asm - sample data conversion and quantization (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                            FAST_FLOAT * workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_float_sse2) PRIVATE
-
-EXTN(jsimd_convsamp_float_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-	push	rbx
-
-	pcmpeqw  xmm7,xmm7
-	psllw    xmm7,7
-	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
-	mov rsi, r10
-	mov	rax, r11
-	mov rdi, r12
-	mov	rcx, DCTSIZE/2
-.convloop:
-	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
-	movq	xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
-
-	psubb	xmm0,xmm7			; xmm0=(01234567)
-	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
-
-	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
-	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
-
-	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
-	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
-	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
-	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
-
-	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
-	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
-	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
-	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
-	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
-	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
-	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
-	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
-
-	add	rsi, byte 2*SIZEOF_JSAMPROW
-	add	rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	rcx
-	jnz	short .convloop
-
-	pop	rbx
-	uncollect_args
-	pop	rbp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                         FAST_FLOAT * workspace);
-;
-
-; r10 = JCOEFPTR coef_block
-; r11 = FAST_FLOAT * divisors
-; r12 = FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_float_sse2) PRIVATE
-
-EXTN(jsimd_quantize_float_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-
-	mov rsi, r12
-	mov rdx, r11
-	mov rdi, r10
-	mov	rax, DCTSIZE2/16
-.quantloop:
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
-	cvtps2dq xmm0,xmm0
-	cvtps2dq xmm1,xmm1
-	cvtps2dq xmm2,xmm2
-	cvtps2dq xmm3,xmm3
-
-	packssdw xmm0,xmm1
-	packssdw xmm2,xmm3
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
-
-	add	rsi, byte 16*SIZEOF_FAST_FLOAT
-	add	rdx, byte 16*SIZEOF_FAST_FLOAT
-	add	rdi, byte 16*SIZEOF_JCOEF
-	dec	rax
-	jnz	short .quantloop
-
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqnts2f.asm b/simd/jcqnts2f.asm
deleted file mode 100644
index e5f5793..0000000
--- a/simd/jcqnts2f.asm
+++ /dev/null
@@ -1,171 +0,0 @@
-;
-; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                            FAST_FLOAT * workspace);
-;
-
-%define sample_data	ebp+8		; JSAMPARRAY sample_data
-%define start_col	ebp+12		; JDIMENSION start_col
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_float_sse2) PRIVATE
-
-EXTN(jsimd_convsamp_float_sse2):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	pcmpeqw  xmm7,xmm7
-	psllw    xmm7,7
-	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
-	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [start_col]
-	mov	edi, POINTER [workspace]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/2
-	alignx	16,7
-.convloop:
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-	psubb	xmm0,xmm7			; xmm0=(01234567)
-	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
-
-	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
-	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
-
-	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
-	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
-	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
-	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
-
-	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
-	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
-	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
-	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
-	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
-	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
-	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
-	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-
-	add	esi, byte 2*SIZEOF_JSAMPROW
-	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	short .convloop
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                         FAST_FLOAT * workspace);
-;
-
-%define coef_block	ebp+8		; JCOEFPTR coef_block
-%define divisors	ebp+12		; FAST_FLOAT * divisors
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_float_sse2) PRIVATE
-
-EXTN(jsimd_quantize_float_sse2):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	esi, POINTER [workspace]
-	mov	edx, POINTER [divisors]
-	mov	edi, JCOEFPTR [coef_block]
-	mov	eax, DCTSIZE2/16
-	alignx	16,7
-.quantloop:
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-	cvtps2dq xmm0,xmm0
-	cvtps2dq xmm1,xmm1
-	cvtps2dq xmm2,xmm2
-	cvtps2dq xmm3,xmm3
-
-	packssdw xmm0,xmm1
-	packssdw xmm2,xmm3
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
-
-	add	esi, byte 16*SIZEOF_FAST_FLOAT
-	add	edx, byte 16*SIZEOF_FAST_FLOAT
-	add	edi, byte 16*SIZEOF_JCOEF
-	dec	eax
-	jnz	short .quantloop
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqnts2i-64.asm b/simd/jcqnts2i-64.asm
deleted file mode 100644
index c3e3bff..0000000
--- a/simd/jcqnts2i-64.asm
+++ /dev/null
@@ -1,187 +0,0 @@
-;
-; jcqnts2i-64.asm - sample data conversion and quantization (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                      DCTELEM * workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_sse2) PRIVATE
-
-EXTN(jsimd_convsamp_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-	push	rbx
-
-	pxor	xmm6,xmm6		; xmm6=(all 0's)
-	pcmpeqw	xmm7,xmm7
-	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-	mov rsi, r10
-	mov rax, r11
-	mov rdi, r12
-	mov	rcx, DCTSIZE/4
-.convloop:
-	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
-	movq	xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
-
-	mov	rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
-	movq	xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
-
-	punpcklbw xmm0,xmm6		; xmm0=(01234567)
-	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
-	paddw     xmm0,xmm7
-	paddw     xmm1,xmm7
-	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
-	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
-	paddw     xmm2,xmm7
-	paddw     xmm3,xmm7
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
-	movdqa	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
-	add	rsi, byte 4*SIZEOF_JSAMPROW
-	add	rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-	dec	rcx
-	jnz	short .convloop
-
-	pop	rbx
-	uncollect_args
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
-;                      DCTELEM * workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-; r10 = JCOEFPTR coef_block
-; r11 = DCTELEM * divisors
-; r12 = DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_sse2) PRIVATE
-
-EXTN(jsimd_quantize_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-
-	mov rsi, r12
-	mov rdx, r11
-	mov rdi, r10
-	mov	rax, DCTSIZE2/32
-.quantloop:
-	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
-	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
-	movdqa	xmm0,xmm4
-	movdqa	xmm1,xmm5
-	movdqa	xmm2,xmm6
-	movdqa	xmm3,xmm7
-	psraw	xmm4,(WORD_BIT-1)
-	psraw	xmm5,(WORD_BIT-1)
-	psraw	xmm6,(WORD_BIT-1)
-	psraw	xmm7,(WORD_BIT-1)
-	pxor	xmm0,xmm4
-	pxor	xmm1,xmm5
-	pxor	xmm2,xmm6
-	pxor	xmm3,xmm7
-	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
-	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
-	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
-	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
-
-	paddw	xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
-	paddw	xmm1, XMMWORD [CORRECTION(1,0,rdx)]
-	paddw	xmm2, XMMWORD [CORRECTION(2,0,rdx)]
-	paddw	xmm3, XMMWORD [CORRECTION(3,0,rdx)]
-	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
-	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
-	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
-	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
-	pmulhuw	xmm0, XMMWORD [SCALE(0,0,rdx)]	; scale
-	pmulhuw	xmm1, XMMWORD [SCALE(1,0,rdx)]
-	pmulhuw	xmm2, XMMWORD [SCALE(2,0,rdx)]
-	pmulhuw	xmm3, XMMWORD [SCALE(3,0,rdx)]
-
-	pxor	xmm0,xmm4
-	pxor	xmm1,xmm5
-	pxor	xmm2,xmm6
-	pxor	xmm3,xmm7
-	psubw	xmm0,xmm4
-	psubw	xmm1,xmm5
-	psubw	xmm2,xmm6
-	psubw	xmm3,xmm7
-	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
-	movdqa	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
-	add	rsi, byte 32*SIZEOF_DCTELEM
-	add	rdx, byte 32*SIZEOF_DCTELEM
-	add	rdi, byte 32*SIZEOF_JCOEF
-	dec	rax
-	jnz	near .quantloop
-
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqnts2i.asm b/simd/jcqnts2i.asm
deleted file mode 100644
index 412032b..0000000
--- a/simd/jcqnts2i.asm
+++ /dev/null
@@ -1,200 +0,0 @@
-;
-; jcqnts2i.asm - sample data conversion and quantization (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                      DCTELEM * workspace);
-;
-
-%define sample_data	ebp+8		; JSAMPARRAY sample_data
-%define start_col	ebp+12		; JDIMENSION start_col
-%define workspace	ebp+16		; DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_sse2) PRIVATE
-
-EXTN(jsimd_convsamp_sse2):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	pxor	xmm6,xmm6		; xmm6=(all 0's)
-	pcmpeqw	xmm7,xmm7
-	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [start_col]
-	mov	edi, POINTER [workspace]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.convloop:
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
-	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
-
-	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
-	movq	xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
-
-	punpcklbw xmm0,xmm6		; xmm0=(01234567)
-	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
-	paddw     xmm0,xmm7
-	paddw     xmm1,xmm7
-	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
-	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
-	paddw     xmm2,xmm7
-	paddw     xmm3,xmm7
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
-	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
-	add	esi, byte 4*SIZEOF_JSAMPROW
-	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	short .convloop
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
-;                      DCTELEM * workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block	ebp+8		; JCOEFPTR coef_block
-%define divisors	ebp+12		; DCTELEM * divisors
-%define workspace	ebp+16		; DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_sse2) PRIVATE
-
-EXTN(jsimd_quantize_sse2):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	esi, POINTER [workspace]
-	mov	edx, POINTER [divisors]
-	mov	edi, JCOEFPTR [coef_block]
-	mov	eax, DCTSIZE2/32
-	alignx	16,7
-.quantloop:
-	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
-	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
-	movdqa	xmm0,xmm4
-	movdqa	xmm1,xmm5
-	movdqa	xmm2,xmm6
-	movdqa	xmm3,xmm7
-	psraw	xmm4,(WORD_BIT-1)
-	psraw	xmm5,(WORD_BIT-1)
-	psraw	xmm6,(WORD_BIT-1)
-	psraw	xmm7,(WORD_BIT-1)
-	pxor	xmm0,xmm4
-	pxor	xmm1,xmm5
-	pxor	xmm2,xmm6
-	pxor	xmm3,xmm7
-	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
-	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
-	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
-	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
-
-	paddw	xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
-	paddw	xmm1, XMMWORD [CORRECTION(1,0,edx)]
-	paddw	xmm2, XMMWORD [CORRECTION(2,0,edx)]
-	paddw	xmm3, XMMWORD [CORRECTION(3,0,edx)]
-	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
-	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
-	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
-	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
-	pmulhuw	xmm0, XMMWORD [SCALE(0,0,edx)]	; scale
-	pmulhuw	xmm1, XMMWORD [SCALE(1,0,edx)]
-	pmulhuw	xmm2, XMMWORD [SCALE(2,0,edx)]
-	pmulhuw	xmm3, XMMWORD [SCALE(3,0,edx)]
-
-	pxor	xmm0,xmm4
-	pxor	xmm1,xmm5
-	pxor	xmm2,xmm6
-	pxor	xmm3,xmm7
-	psubw	xmm0,xmm4
-	psubw	xmm1,xmm5
-	psubw	xmm2,xmm6
-	psubw	xmm3,xmm7
-	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
-	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
-	add	esi, byte 32*SIZEOF_DCTELEM
-	add	edx, byte 32*SIZEOF_DCTELEM
-	add	edi, byte 32*SIZEOF_JCOEF
-	dec	eax
-	jnz	near .quantloop
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqntsse.asm b/simd/jcqntsse.asm
deleted file mode 100644
index df7243e..0000000
--- a/simd/jcqntsse.asm
+++ /dev/null
@@ -1,211 +0,0 @@
-;
-; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Load DCTSIZE rows of 8 samples into workspace as FAST_FLOATs, applying
-; unsigned->signed conversion (0x80 is subtracted from every sample byte).
-; GLOBAL(void)
-; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                           FAST_FLOAT * workspace);
-; Clobbers eax, ecx, edx, mm0-mm7 and xmm0-xmm7; ends with emms.
-
-%define sample_data	ebp+8		; JSAMPARRAY sample_data
-%define start_col	ebp+12		; JDIMENSION start_col
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_float_sse) PRIVATE
-
-EXTN(jsimd_convsamp_float_sse):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	pcmpeqw  mm7,mm7		; every word = 0xFFFF
-	psllw    mm7,7			; every word = 0xFF80
-	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
-	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [start_col]
-	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
-	mov	ecx, DCTSIZE/2			; pass counter: two rows per pass
-	alignx	16,7
-.convloop:
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; first row of this pass (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; second row of this pass (JSAMPLE *)
-
-	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; 8 samples of the first row at start_col
-	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; 8 samples of the second row at start_col
-
-	psubb	mm0,mm7				; mm0=(01234567)
-	psubb	mm1,mm7				; mm1=(89ABCDEF)
-
-	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
-	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
-	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
-	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
-
-	punpcklwd mm4,mm2			; mm4=(***0***1)
-	punpckhwd mm2,mm2			; mm2=(***2***3)
-	punpcklwd mm5,mm0			; mm5=(***4***5)
-	punpckhwd mm0,mm0			; mm0=(***6***7)
-
-	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
-	psrad     mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
-	cvtpi2ps  xmm0,mm4			; xmm0=(01**)
-	cvtpi2ps  xmm1,mm2			; xmm1=(23**)
-	psrad     mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
-	psrad     mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
-	cvtpi2ps  xmm2,mm5			; xmm2=(45**)
-	cvtpi2ps  xmm3,mm0			; xmm3=(67**)
-
-	punpcklwd mm6,mm3			; mm6=(***8***9)
-	punpckhwd mm3,mm3			; mm3=(***A***B)
-	punpcklwd mm4,mm1			; mm4=(***C***D)
-	punpckhwd mm1,mm1			; mm1=(***E***F)
-
-	psrad     mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
-	psrad     mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
-	cvtpi2ps  xmm4,mm6			; xmm4=(89**)
-	cvtpi2ps  xmm5,mm3			; xmm5=(AB**)
-	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
-	psrad     mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
-	cvtpi2ps  xmm6,mm4			; xmm6=(CD**)
-	cvtpi2ps  xmm7,mm1			; xmm7=(EF**)
-
-	movlhps   xmm0,xmm1			; xmm0=(0123)
-	movlhps   xmm2,xmm3			; xmm2=(4567)
-	movlhps   xmm4,xmm5			; xmm4=(89AB)
-	movlhps   xmm6,xmm7			; xmm6=(CDEF)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0	; first row, floats 0-3
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2	; first row, floats 4-7
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4	; second row, floats 0-3
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6	; second row, floats 4-7
-
-	add	esi, byte 2*SIZEOF_JSAMPROW		; skip the two row pointers just used
-	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; skip the two output rows just written
-	dec	ecx
-	jnz	near .convloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-; (workspace[i] * divisors[i], converted to int32, then saturated to int16).
-; GLOBAL(void)
-; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                           FAST_FLOAT * workspace);
-; Clobbers eax, edx, mm0-mm7 and xmm0-xmm7; ends with emms.
-
-%define coef_block	ebp+8		; JCOEFPTR coef_block
-%define divisors	ebp+12		; FAST_FLOAT * divisors
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_float_sse) PRIVATE
-
-EXTN(jsimd_quantize_float_sse):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	esi, POINTER [workspace]
-	mov	edx, POINTER [divisors]
-	mov	edi, JCOEFPTR [coef_block]
-	mov	eax, DCTSIZE2/16		; pass counter: 16 coefficients per pass
-	alignx	16,7
-.quantloop:
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]	; scale by the matching divisors[] entries
-	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-	movhlps  xmm4,xmm0		; low half of xmm4 = upper two floats of xmm0
-	movhlps  xmm5,xmm1		; low half of xmm5 = upper two floats of xmm1
-
-	cvtps2pi mm0,xmm0		; lower two floats of xmm0 -> two int32
-	cvtps2pi mm1,xmm1
-	cvtps2pi mm4,xmm4		; upper two floats of xmm0 -> two int32
-	cvtps2pi mm5,xmm5
-
-	movhlps  xmm6,xmm2		; same split for the second pair of vectors
-	movhlps  xmm7,xmm3
-
-	cvtps2pi mm2,xmm2
-	cvtps2pi mm3,xmm3
-	cvtps2pi mm6,xmm6
-	cvtps2pi mm7,xmm7
-
-	packssdw mm0,mm4		; four int32 -> four signed-saturated int16
-	packssdw mm1,mm5
-	packssdw mm2,mm6
-	packssdw mm3,mm7
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0	; four coefficients per store
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
-	add	esi, byte 16*SIZEOF_FAST_FLOAT	; next 16 workspace values
-	add	edx, byte 16*SIZEOF_FAST_FLOAT	; next 16 divisors
-	add	edi, byte 16*SIZEOF_JCOEF	; next 16 output coefficients
-	dec	eax
-	jnz	short .quantloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcsammmx.asm b/simd/jcsammmx.asm
deleted file mode 100644
index e5e2d23..0000000
--- a/simd/jcsammmx.asm
+++ /dev/null
@@ -1,324 +0,0 @@
-;
-; jcsammmx.asm - downsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-; Each output sample is (left + right + bias) >> 1, with bias alternating 0,1.
-; GLOBAL(void)
-; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
-; max_v_samp_factor input rows are first right-padded to 2*output_cols samples.
-
-%define img_width(b)	(b)+8			; JDIMENSION image_width
-%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
-%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
-%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
-%define input_data(b)	(b)+24		; JSAMPARRAY input_data
-%define output_data(b)	(b)+28	; JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v1_downsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v1_downsample_mmx):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	ecx, JDIMENSION [width_blks(ebp)]
-	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
-	jz	near .return		; nothing to do for zero output columns
-
-	mov	edx, JDIMENSION [img_width(ebp)]
-
-	; -- expand_right_edge
-
-	push	ecx				; keep output_cols for the downsample stage
-	shl	ecx,1				; output_cols * 2
-	sub	ecx,edx				; ecx = number of padding samples per row
-	jle	short .expand_end		; rows are already wide enough
-
-	mov	eax, INT [max_v_samp(ebp)]	; number of rows to pad
-	test	eax,eax
-	jle	short .expand_end
-
-	cld					; make stosb advance edi upwards
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	alignx	16,7
-.expandloop:
-	push	eax				; al is reused for the fill value
-	push	ecx				; rep stosb counts ecx down to zero
-
-	mov	edi, JSAMPROW [esi]
-	add	edi,edx				; edi = just past the last real sample
-	mov	al, JSAMPLE [edi-1]		; al = last real sample of the row
-
-	rep stosb				; replicate it ecx times
-
-	pop	ecx
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; next row pointer
-	dec	eax
-	jg	short .expandloop
-
-.expand_end:
-	pop	ecx				; output_cols
-
-	; -- h2v1_downsample
-
-	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
-	test	eax,eax
-	jle	near .return
-
-	mov       edx, 0x00010000	; bias pattern
-	movd      mm7,edx
-	pcmpeqw   mm6,mm6
-	punpckldq mm7,mm7		; mm7={0, 1, 0, 1}
-	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
-	alignx	16,7
-.rowloop:
-	push	ecx				; output_cols is consumed by the column loop
-	push	edi				; output row-pointer cursor
-	push	esi				; input row-pointer cursor
-
-	mov	esi, JSAMPROW [esi]		; inptr
-	mov	edi, JSAMPROW [edi]		; outptr
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]	; input samples 0-7
-	movq	mm1, MMWORD [esi+1*SIZEOF_MMWORD]	; input samples 8-15
-	movq	mm2,mm0
-	movq	mm3,mm1
-
-	pand	mm0,mm6			; even-numbered samples, one per word
-	psrlw	mm2,BYTE_BIT		; odd-numbered samples, one per word
-	pand	mm1,mm6
-	psrlw	mm3,BYTE_BIT
-
-	paddw	mm0,mm2			; sum of each horizontal pair
-	paddw	mm1,mm3
-	paddw	mm0,mm7			; add the alternating 0/1 bias
-	paddw	mm1,mm7
-	psrlw	mm0,1			; divide by 2
-	psrlw	mm1,1
-
-	packuswb mm0,mm1		; 8 words + 8 words -> 8 output bytes
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
-	add	esi, byte 2*SIZEOF_MMWORD	; inptr
-	add	edi, byte 1*SIZEOF_MMWORD	; outptr
-	sub	ecx, byte SIZEOF_MMWORD		; outcol
-	jnz	short .columnloop
-
-	pop	esi
-	pop	edi
-	pop	ecx
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	eax				; rowctr
-	jg	short .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-; Each output sample is (sum of a 2x2 input group + bias) >> 2, bias = 1,2,1,2..
-; GLOBAL(void)
-; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
-; max_v_samp_factor input rows are first right-padded to 2*output_cols samples.
-
-%define img_width(b)	(b)+8			; JDIMENSION image_width
-%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
-%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
-%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
-%define input_data(b)	(b)+24		; JSAMPARRAY input_data
-%define output_data(b)	(b)+28	; JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v2_downsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v2_downsample_mmx):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	ecx, JDIMENSION [width_blks(ebp)]
-	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
-	jz	near .return		; nothing to do for zero output columns
-
-	mov	edx, JDIMENSION [img_width(ebp)]
-
-	; -- expand_right_edge
-
-	push	ecx				; keep output_cols for the downsample stage
-	shl	ecx,1				; output_cols * 2
-	sub	ecx,edx				; ecx = number of padding samples per row
-	jle	short .expand_end		; rows are already wide enough
-
-	mov	eax, INT [max_v_samp(ebp)]	; number of rows to pad
-	test	eax,eax
-	jle	short .expand_end
-
-	cld					; make stosb advance edi upwards
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	alignx	16,7
-.expandloop:
-	push	eax				; al is reused for the fill value
-	push	ecx				; rep stosb counts ecx down to zero
-
-	mov	edi, JSAMPROW [esi]
-	add	edi,edx				; edi = just past the last real sample
-	mov	al, JSAMPLE [edi-1]		; al = last real sample of the row
-
-	rep stosb				; replicate it ecx times
-
-	pop	ecx
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; next row pointer
-	dec	eax
-	jg	short .expandloop
-
-.expand_end:
-	pop	ecx				; output_cols
-
-	; -- h2v2_downsample
-
-	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
-	test	eax,eax
-	jle	near .return
-
-	mov       edx, 0x00020001	; bias pattern
-	movd      mm7,edx
-	pcmpeqw   mm6,mm6
-	punpckldq mm7,mm7		; mm7={1, 2, 1, 2}
-	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
-	alignx	16,7
-.rowloop:
-	push	ecx				; output_cols is consumed by the column loop
-	push	edi				; output row-pointer cursor
-	push	esi				; input row-pointer cursor
-
-	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
-	mov	edi, JSAMPROW [edi]			; outptr
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [edx+0*SIZEOF_MMWORD]	; upper row, samples 0-7
-	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]	; lower row, samples 0-7
-	movq	mm2, MMWORD [edx+1*SIZEOF_MMWORD]	; upper row, samples 8-15
-	movq	mm3, MMWORD [esi+1*SIZEOF_MMWORD]	; lower row, samples 8-15
-
-	movq	mm4,mm0
-	movq	mm5,mm1
-	pand	mm0,mm6			; even-numbered samples, one per word
-	psrlw	mm4,BYTE_BIT		; odd-numbered samples, one per word
-	pand	mm1,mm6
-	psrlw	mm5,BYTE_BIT
-	paddw	mm0,mm4			; horizontal pair sums, upper row
-	paddw	mm1,mm5			; horizontal pair sums, lower row
-
-	movq	mm4,mm2
-	movq	mm5,mm3
-	pand	mm2,mm6
-	psrlw	mm4,BYTE_BIT
-	pand	mm3,mm6
-	psrlw	mm5,BYTE_BIT
-	paddw	mm2,mm4
-	paddw	mm3,mm5
-
-	paddw	mm0,mm1			; add the two rows: 2x2 group sums
-	paddw	mm2,mm3
-	paddw	mm0,mm7			; add the alternating 1/2 bias
-	paddw	mm2,mm7
-	psrlw	mm0,2			; divide by 4
-	psrlw	mm2,2
-
-	packuswb mm0,mm2		; 8 words + 8 words -> 8 output bytes
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
-	add	edx, byte 2*SIZEOF_MMWORD	; inptr0
-	add	esi, byte 2*SIZEOF_MMWORD	; inptr1
-	add	edi, byte 1*SIZEOF_MMWORD	; outptr
-	sub	ecx, byte SIZEOF_MMWORD		; outcol
-	jnz	near .columnloop
-
-	pop	esi
-	pop	edi
-	pop	ecx
-
-	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
-	dec	eax				; rowctr
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcsamss2-64.asm b/simd/jcsamss2-64.asm
deleted file mode 100644
index e20084e..0000000
--- a/simd/jcsamss2-64.asm
+++ /dev/null
@@ -1,330 +0,0 @@
-;
-; jcsamss2-64.asm - downsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-; Each output sample is (left + right + bias) >> 1, with bias alternating 0,1.
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-; max_v_samp_factor input rows are first right-padded to 2*output_cols samples.
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_downsample_sse2):
-	push	rbp
-	mov	rax,rsp			; NOTE(review): presumably read by collect_args (macro defined elsewhere) - confirm
-	mov	rbp,rsp
-	collect_args
-
-	mov rcx, r13
-	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
-	jz	near .return		; nothing to do for zero output columns
-
-	mov rdx, r10
-
-	; -- expand_right_edge
-
-	push	rcx				; keep output_cols for the downsample stage
-	shl	rcx,1				; output_cols * 2
-	sub	rcx,rdx				; rcx = number of padding samples per row
-	jle	short .expand_end		; rows are already wide enough
-
-	mov	rax, r11			; number of rows to pad
-	test	rax,rax
-	jle	short .expand_end
-
-	cld					; make stosb advance rdi upwards
-	mov	rsi, r14	; input_data
-.expandloop:
-	push	rax				; al is reused for the fill value
-	push	rcx				; rep stosb counts rcx down to zero
-
-	mov	rdi, JSAMPROW [rsi]
-	add	rdi,rdx				; rdi = just past the last real sample
-	mov	al, JSAMPLE [rdi-1]		; al = last real sample of the row
-
-	rep stosb				; replicate it rcx times
-
-	pop	rcx
-	pop	rax
-
-	add	rsi, byte SIZEOF_JSAMPROW	; next row pointer
-	dec	rax
-	jg	short .expandloop
-
-.expand_end:
-	pop	rcx				; output_cols
-
-	; -- h2v1_downsample
-
-	mov	rax, r12	; rowctr
-	test	eax,eax		; NOTE(review): tests only the low 32 bits, but the loop below decrements all of rax - confirm r12's upper half is clean
-	jle	near .return
-
-	mov	rdx, 0x00010000		; bias pattern
-	movd	xmm7,edx
-	pcmpeqw	xmm6,xmm6
-	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	rsi, r14	; input_data
-	mov	rdi, r15	; output_data
-.rowloop:
-	push	rcx				; output_cols is consumed by the column loop
-	push	rdi				; output row-pointer cursor
-	push	rsi				; input row-pointer cursor
-
-	mov	rsi, JSAMPROW [rsi]		; inptr
-	mov rdi, JSAMPROW [rdi]		; outptr
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	short .columnloop		; at least 16 output columns remain
-
-.columnloop_r8:					; fewer than 16 output columns remain
-	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	pxor	xmm1,xmm1			; second input vector is treated as zeros
-	mov	rcx, SIZEOF_XMMWORD		; so the sub below leaves rcx = 0
-	jmp	short .downsample
-
-.columnloop:
-	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; input samples 0-15
-	movdqa	xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; input samples 16-31
-
-.downsample:
-	movdqa	xmm2,xmm0
-	movdqa	xmm3,xmm1
-
-	pand	xmm0,xmm6		; even-numbered samples, one per word
-	psrlw	xmm2,BYTE_BIT		; odd-numbered samples, one per word
-	pand	xmm1,xmm6
-	psrlw	xmm3,BYTE_BIT
-
-	paddw	xmm0,xmm2		; sum of each horizontal pair
-	paddw	xmm1,xmm3
-	paddw	xmm0,xmm7		; add the alternating 0/1 bias
-	paddw	xmm1,xmm7
-	psrlw	xmm0,1			; divide by 2
-	psrlw	xmm1,1
-
-	packuswb xmm0,xmm1		; 8 words + 8 words -> 16 output bytes
-
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
-	sub	rcx, byte SIZEOF_XMMWORD	; outcol
-	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	short .columnloop
-	test	rcx,rcx
-	jnz	short .columnloop_r8		; handle the short remainder
-
-	pop	rsi
-	pop	rdi
-	pop	rcx
-
-	add	rsi, byte SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte SIZEOF_JSAMPROW	; output_data
-	dec	rax				; rowctr
-	jg	near .rowloop
-
-.return:
-	uncollect_args
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-; Each output sample is (sum of a 2x2 input group + bias) >> 2, bias = 1,2,1,2..
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-; max_v_samp_factor input rows are first right-padded to 2*output_cols samples.
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_downsample_sse2):
-	push	rbp
-	mov	rax,rsp			; NOTE(review): presumably read by collect_args (macro defined elsewhere) - confirm
-	mov	rbp,rsp
-	collect_args
-
-	mov	rcx, r13
-	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
-	jz	near .return		; nothing to do for zero output columns
-
-	mov	rdx, r10
-
-	; -- expand_right_edge
-
-	push	rcx				; keep output_cols for the downsample stage
-	shl	rcx,1				; output_cols * 2
-	sub	rcx,rdx				; rcx = number of padding samples per row
-	jle	short .expand_end		; rows are already wide enough
-
-	mov	rax, r11			; number of rows to pad
-	test	rax,rax
-	jle	short .expand_end
-
-	cld					; make stosb advance rdi upwards
-	mov	rsi, r14	; input_data
-.expandloop:
-	push	rax				; al is reused for the fill value
-	push	rcx				; rep stosb counts rcx down to zero
-
-	mov	rdi, JSAMPROW [rsi]
-	add	rdi,rdx				; rdi = just past the last real sample
-	mov	al, JSAMPLE [rdi-1]		; al = last real sample of the row
-
-	rep stosb				; replicate it rcx times
-
-	pop	rcx
-	pop	rax
-
-	add	rsi, byte SIZEOF_JSAMPROW	; next row pointer
-	dec	rax
-	jg	short .expandloop
-
-.expand_end:
-	pop	rcx				; output_cols
-
-	; -- h2v2_downsample
-
-	mov	rax, r12	; rowctr
-	test	rax,rax
-	jle	near .return
-
-	mov	rdx, 0x00020001		; bias pattern
-	movd	xmm7,edx
-	pcmpeqw	xmm6,xmm6
-	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	rsi, r14	; input_data
-	mov	rdi, r15	; output_data
-.rowloop:
-	push	rcx				; output_cols is consumed by the column loop
-	push	rdi				; output row-pointer cursor
-	push	rsi				; input row-pointer cursor
-
-	mov	rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1
-	mov	rdi, JSAMPROW [rdi]			; outptr
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	short .columnloop		; at least 16 output columns remain
-
-.columnloop_r8:					; fewer than 16 output columns remain
-	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	pxor	xmm2,xmm2			; second pair of input vectors is treated as zeros
-	pxor	xmm3,xmm3
-	mov	rcx, SIZEOF_XMMWORD		; so the sub below leaves rcx = 0
-	jmp	short .downsample
-
-.columnloop:
-	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]	; upper row, samples 0-15
-	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; lower row, samples 0-15
-	movdqa	xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]	; upper row, samples 16-31
-	movdqa	xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; lower row, samples 16-31
-
-.downsample:
-	movdqa	xmm4,xmm0
-	movdqa	xmm5,xmm1
-	pand	xmm0,xmm6		; even-numbered samples, one per word
-	psrlw	xmm4,BYTE_BIT		; odd-numbered samples, one per word
-	pand	xmm1,xmm6
-	psrlw	xmm5,BYTE_BIT
-	paddw	xmm0,xmm4		; horizontal pair sums, upper row
-	paddw	xmm1,xmm5		; horizontal pair sums, lower row
-
-	movdqa	xmm4,xmm2
-	movdqa	xmm5,xmm3
-	pand	xmm2,xmm6
-	psrlw	xmm4,BYTE_BIT
-	pand	xmm3,xmm6
-	psrlw	xmm5,BYTE_BIT
-	paddw	xmm2,xmm4
-	paddw	xmm3,xmm5
-
-	paddw	xmm0,xmm1		; add the two rows: 2x2 group sums
-	paddw	xmm2,xmm3
-	paddw	xmm0,xmm7		; add the alternating 1/2 bias
-	paddw	xmm2,xmm7
-	psrlw	xmm0,2			; divide by 4
-	psrlw	xmm2,2
-
-	packuswb xmm0,xmm2		; 8 words + 8 words -> 16 output bytes
-
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
-	sub	rcx, byte SIZEOF_XMMWORD	; outcol
-	add	rdx, byte 2*SIZEOF_XMMWORD	; inptr0
-	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr1
-	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	test	rcx,rcx
-	jnz	near .columnloop_r8		; handle the short remainder
-
-	pop	rsi
-	pop	rdi
-	pop	rcx
-
-	add	rsi, byte 2*SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte 1*SIZEOF_JSAMPROW	; output_data
-	dec	rax				; rowctr
-	jg	near .rowloop
-
-.return:
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcsamss2.asm b/simd/jcsamss2.asm
deleted file mode 100644
index feb979d..0000000
--- a/simd/jcsamss2.asm
+++ /dev/null
@@ -1,351 +0,0 @@
-;
-; jcsamss2.asm - downsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-; Each output sample is (left + right + bias) >> 1, with bias alternating 0,1.
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-; max_v_samp_factor input rows are first right-padded to 2*output_cols samples.
-
-%define img_width(b)	(b)+8			; JDIMENSION image_width
-%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
-%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
-%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
-%define input_data(b)	(b)+24		; JSAMPARRAY input_data
-%define output_data(b)	(b)+28		; JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_downsample_sse2):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	ecx, JDIMENSION [width_blks(ebp)]
-	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
-	jz	near .return		; nothing to do for zero output columns
-
-	mov	edx, JDIMENSION [img_width(ebp)]
-
-	; -- expand_right_edge
-
-	push	ecx				; keep output_cols for the downsample stage
-	shl	ecx,1				; output_cols * 2
-	sub	ecx,edx				; ecx = number of padding samples per row
-	jle	short .expand_end		; rows are already wide enough
-
-	mov	eax, INT [max_v_samp(ebp)]	; number of rows to pad
-	test	eax,eax
-	jle	short .expand_end
-
-	cld					; make stosb advance edi upwards
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	alignx	16,7
-.expandloop:
-	push	eax				; al is reused for the fill value
-	push	ecx				; rep stosb counts ecx down to zero
-
-	mov	edi, JSAMPROW [esi]
-	add	edi,edx				; edi = just past the last real sample
-	mov	al, JSAMPLE [edi-1]		; al = last real sample of the row
-
-	rep stosb				; replicate it ecx times
-
-	pop	ecx
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; next row pointer
-	dec	eax
-	jg	short .expandloop
-
-.expand_end:
-	pop	ecx				; output_cols
-
-	; -- h2v1_downsample
-
-	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
-	test	eax,eax
-	jle	near .return
-
-	mov	edx, 0x00010000		; bias pattern
-	movd	xmm7,edx
-	pcmpeqw	xmm6,xmm6
-	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
-	alignx	16,7
-.rowloop:
-	push	ecx				; output_cols is consumed by the column loop
-	push	edi				; output row-pointer cursor
-	push	esi				; input row-pointer cursor
-
-	mov	esi, JSAMPROW [esi]		; inptr
-	mov	edi, JSAMPROW [edi]		; outptr
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	short .columnloop		; at least 16 output columns remain
-	alignx	16,7
-
-.columnloop_r8:					; fewer than 16 output columns remain
-	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	pxor	xmm1,xmm1			; second input vector is treated as zeros
-	mov	ecx, SIZEOF_XMMWORD		; so the sub below leaves ecx = 0
-	jmp	short .downsample
-	alignx	16,7
-
-.columnloop:
-	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]	; input samples 0-15
-	movdqa	xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]	; input samples 16-31
-
-.downsample:
-	movdqa	xmm2,xmm0
-	movdqa	xmm3,xmm1
-
-	pand	xmm0,xmm6		; even-numbered samples, one per word
-	psrlw	xmm2,BYTE_BIT		; odd-numbered samples, one per word
-	pand	xmm1,xmm6
-	psrlw	xmm3,BYTE_BIT
-
-	paddw	xmm0,xmm2		; sum of each horizontal pair
-	paddw	xmm1,xmm3
-	paddw	xmm0,xmm7		; add the alternating 0/1 bias
-	paddw	xmm1,xmm7
-	psrlw	xmm0,1			; divide by 2
-	psrlw	xmm1,1
-
-	packuswb xmm0,xmm1		; 8 words + 8 words -> 16 output bytes
-
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
-	sub	ecx, byte SIZEOF_XMMWORD	; outcol
-	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	short .columnloop
-	test	ecx,ecx
-	jnz	short .columnloop_r8		; handle the short remainder
-
-	pop	esi
-	pop	edi
-	pop	ecx
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	eax				; rowctr
-	jg	near .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-; Each output sample is (sum of a 2x2 input group + bias) >> 2, bias = 1,2,1,2..
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-; max_v_samp_factor input rows are first right-padded to 2*output_cols samples.
-
-%define img_width(b)	(b)+8			; JDIMENSION image_width
-%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
-%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
-%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
-%define input_data(b)	(b)+24		; JSAMPARRAY input_data
-%define output_data(b)	(b)+28	; JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_downsample_sse2):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	ecx, JDIMENSION [width_blks(ebp)]
-	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
-	jz	near .return		; nothing to do for zero output columns
-
-	mov	edx, JDIMENSION [img_width(ebp)]
-
-	; -- expand_right_edge
-
-	push	ecx				; keep output_cols for the downsample stage
-	shl	ecx,1				; output_cols * 2
-	sub	ecx,edx				; ecx = number of padding samples per row
-	jle	short .expand_end		; rows are already wide enough
-
-	mov	eax, INT [max_v_samp(ebp)]	; number of rows to pad
-	test	eax,eax
-	jle	short .expand_end
-
-	cld					; make stosb advance edi upwards
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	alignx	16,7
-.expandloop:
-	push	eax				; al is reused for the fill value
-	push	ecx				; rep stosb counts ecx down to zero
-
-	mov	edi, JSAMPROW [esi]
-	add	edi,edx				; edi = just past the last real sample
-	mov	al, JSAMPLE [edi-1]		; al = last real sample of the row
-
-	rep stosb				; replicate it ecx times
-
-	pop	ecx
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; next row pointer
-	dec	eax
-	jg	short .expandloop
-
-.expand_end:
-	pop	ecx				; output_cols
-
-	; -- h2v2_downsample
-
-	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
-	test	eax,eax
-	jle	near .return
-
-	mov	edx, 0x00020001		; bias pattern
-	movd	xmm7,edx
-	pcmpeqw	xmm6,xmm6
-	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
-	alignx	16,7
-.rowloop:
-	push	ecx				; output_cols is consumed by the column loop
-	push	edi				; output row-pointer cursor
-	push	esi				; input row-pointer cursor
-
-	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
-	mov	edi, JSAMPROW [edi]			; outptr
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	short .columnloop		; at least 16 output columns remain
-	alignx	16,7
-
-.columnloop_r8:					; fewer than 16 output columns remain
-	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
-	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	pxor	xmm2,xmm2			; second pair of input vectors is treated as zeros
-	pxor	xmm3,xmm3
-	mov	ecx, SIZEOF_XMMWORD		; so the sub below leaves ecx = 0
-	jmp	short .downsample
-	alignx	16,7
-
-.columnloop:
-	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]	; upper row, samples 0-15
-	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]	; lower row, samples 0-15
-	movdqa	xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]	; upper row, samples 16-31
-	movdqa	xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]	; lower row, samples 16-31
-
-.downsample:
-	movdqa	xmm4,xmm0
-	movdqa	xmm5,xmm1
-	pand	xmm0,xmm6		; even-numbered samples, one per word
-	psrlw	xmm4,BYTE_BIT		; odd-numbered samples, one per word
-	pand	xmm1,xmm6
-	psrlw	xmm5,BYTE_BIT
-	paddw	xmm0,xmm4		; horizontal pair sums, upper row
-	paddw	xmm1,xmm5		; horizontal pair sums, lower row
-
-	movdqa	xmm4,xmm2
-	movdqa	xmm5,xmm3
-	pand	xmm2,xmm6
-	psrlw	xmm4,BYTE_BIT
-	pand	xmm3,xmm6
-	psrlw	xmm5,BYTE_BIT
-	paddw	xmm2,xmm4
-	paddw	xmm3,xmm5
-
-	paddw	xmm0,xmm1		; add the two rows: 2x2 group sums
-	paddw	xmm2,xmm3
-	paddw	xmm0,xmm7		; add the alternating 1/2 bias
-	paddw	xmm2,xmm7
-	psrlw	xmm0,2			; divide by 4
-	psrlw	xmm2,2
-
-	packuswb xmm0,xmm2		; 8 words + 8 words -> 16 output bytes
-
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
-	sub	ecx, byte SIZEOF_XMMWORD	; outcol
-	add	edx, byte 2*SIZEOF_XMMWORD	; inptr0
-	add	esi, byte 2*SIZEOF_XMMWORD	; inptr1
-	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	test	ecx,ecx
-	jnz	near .columnloop_r8		; handle the short remainder
-
-	pop	esi
-	pop	edi
-	pop	ecx
-
-	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
-	dec	eax				; rowctr
-	jg	near .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdclrmmx.asm b/simd/jdclrmmx.asm
deleted file mode 100644
index d2aa165..0000000
--- a/simd/jdclrmmx.asm
+++ /dev/null
@@ -1,405 +0,0 @@
-;
-; jdclrmmx.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
-;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                            JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b)	(b)+8			; JDIMENSION out_width
-%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
-%define input_row(b)	(b)+16		; JDIMENSION input_row
-%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_ycc_rgb_convert_mmx) PRIVATE
-
-EXTN(jsimd_ycc_rgb_convert_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	edi, JSAMPIMAGE [input_buf(eax)]
-	mov	ecx, JDIMENSION [input_row(eax)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
-	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx
-
-	mov	edi, JSAMPARRAY [output_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	push	eax
-	push	edi
-	push	edx
-	push	ebx
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr0
-	mov	ebx, JSAMPROW [ebx]	; inptr1
-	mov	edx, JSAMPROW [edx]	; inptr2
-	mov	edi, JSAMPROW [edi]	; outptr
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-	alignx	16,7
-.columnloop:
-
-	movq	mm5, MMWORD [ebx]	; mm5=Cb(01234567)
-	movq	mm1, MMWORD [edx]	; mm1=Cr(01234567)
-
-	pcmpeqw	mm4,mm4
-	pcmpeqw	mm7,mm7
-	psrlw	mm4,BYTE_BIT
-	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
-	movq	mm0,mm4			; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
-
-	pand	mm4,mm5			; mm4=Cb(0246)=CbE
-	psrlw	mm5,BYTE_BIT		; mm5=Cb(1357)=CbO
-	pand	mm0,mm1			; mm0=Cr(0246)=CrE
-	psrlw	mm1,BYTE_BIT		; mm1=Cr(1357)=CrO
-
-	paddw	mm4,mm7
-	paddw	mm5,mm7
-	paddw	mm0,mm7
-	paddw	mm1,mm7
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movq	mm2,mm4			; mm2=CbE
-	movq	mm3,mm5			; mm3=CbO
-	paddw	mm4,mm4			; mm4=2*CbE
-	paddw	mm5,mm5			; mm5=2*CbO
-	movq	mm6,mm0			; mm6=CrE
-	movq	mm7,mm1			; mm7=CrO
-	paddw	mm0,mm0			; mm0=2*CrE
-	paddw	mm1,mm1			; mm1=2*CrO
-
-	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbE * -FIX(0.22800))
-	pmulhw	mm5,[GOTOFF(eax,PW_MF0228)]	; mm5=(2*CbO * -FIX(0.22800))
-	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrE * FIX(0.40200))
-	pmulhw	mm1,[GOTOFF(eax,PW_F0402)]	; mm1=(2*CrO * FIX(0.40200))
-
-	paddw	mm4,[GOTOFF(eax,PW_ONE)]
-	paddw	mm5,[GOTOFF(eax,PW_ONE)]
-	psraw	mm4,1			; mm4=(CbE * -FIX(0.22800))
-	psraw	mm5,1			; mm5=(CbO * -FIX(0.22800))
-	paddw	mm0,[GOTOFF(eax,PW_ONE)]
-	paddw	mm1,[GOTOFF(eax,PW_ONE)]
-	psraw	mm0,1			; mm0=(CrE * FIX(0.40200))
-	psraw	mm1,1			; mm1=(CrO * FIX(0.40200))
-
-	paddw	mm4,mm2
-	paddw	mm5,mm3
-	paddw	mm4,mm2			; mm4=(CbE * FIX(1.77200))=(B-Y)E
-	paddw	mm5,mm3			; mm5=(CbO * FIX(1.77200))=(B-Y)O
-	paddw	mm0,mm6			; mm0=(CrE * FIX(1.40200))=(R-Y)E
-	paddw	mm1,mm7			; mm1=(CrO * FIX(1.40200))=(R-Y)O
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=(B-Y)E
-	movq	MMWORD [wk(1)], mm5	; wk(1)=(B-Y)O
-
-	movq      mm4,mm2
-	movq      mm5,mm3
-	punpcklwd mm2,mm6
-	punpckhwd mm4,mm6
-	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
-	punpcklwd mm3,mm7
-	punpckhwd mm5,mm7
-	pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
-	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     mm2,SCALEBITS
-	psrad     mm4,SCALEBITS
-	paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     mm3,SCALEBITS
-	psrad     mm5,SCALEBITS
-
-	packssdw  mm2,mm4	; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-	packssdw  mm3,mm5	; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-	psubw     mm2,mm6	; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-	psubw     mm3,mm7	; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-	movq      mm5, MMWORD [esi]	; mm5=Y(01234567)
-
-	pcmpeqw   mm4,mm4
-	psrlw     mm4,BYTE_BIT		; mm4={0xFF 0x00 0xFF 0x00 ..}
-	pand      mm4,mm5		; mm4=Y(0246)=YE
-	psrlw     mm5,BYTE_BIT		; mm5=Y(1357)=YO
-
-	paddw     mm0,mm4		; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
-	paddw     mm1,mm5		; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
-	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
-	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
-
-	paddw     mm2,mm4		; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
-	paddw     mm3,mm5		; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
-	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
-	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
-
-	paddw     mm4, MMWORD [wk(0)]	; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
-	paddw     mm5, MMWORD [wk(1)]	; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
-	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
-	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
-	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
-	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
-	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
-
-	movq      mmG,mmA
-	movq      mmH,mmA
-	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
-	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
-
-	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
-	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
-
-	movq      mmC,mmD
-	movq      mmB,mmD
-	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
-	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
-
-	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
-
-	movq      mmF,mmE
-	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
-	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
-
-	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
-	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
-	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
-
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st16
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
-	sub	ecx, byte SIZEOF_MMWORD
-	jz	short .nextrow
-
-	add	esi, byte SIZEOF_MMWORD			; inptr0
-	add	ebx, byte SIZEOF_MMWORD			; inptr1
-	add	edx, byte SIZEOF_MMWORD			; inptr2
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st16:
-	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
-	cmp	ecx, byte 2*SIZEOF_MMWORD
-	jb	short .column_st8
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
-	movq	mmA,mmC
-	sub	ecx, byte 2*SIZEOF_MMWORD
-	add	edi, byte 2*SIZEOF_MMWORD
-	jmp	short .column_st4
-.column_st8:
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st4
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	mmA,mmE
-	sub	ecx, byte SIZEOF_MMWORD
-	add	edi, byte SIZEOF_MMWORD
-.column_st4:
-	movd	eax,mmA
-	cmp	ecx, byte SIZEOF_DWORD
-	jb	short .column_st2
-	mov	DWORD [edi+0*SIZEOF_DWORD], eax
-	psrlq	mmA,DWORD_BIT
-	movd	eax,mmA
-	sub	ecx, byte SIZEOF_DWORD
-	add	edi, byte SIZEOF_DWORD
-.column_st2:
-	cmp	ecx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [edi+0*SIZEOF_WORD], ax
-	shr	eax,WORD_BIT
-	sub	ecx, byte SIZEOF_WORD
-	add	edi, byte SIZEOF_WORD
-.column_st1:
-	cmp	ecx, byte SIZEOF_BYTE
-	jb	short .nextrow
-	mov	BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
-	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
-	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
-	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
-	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
-	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
-	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
-	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
-	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
-
-	movq      mmC,mmA
-	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
-	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
-	movq      mmG,mmB
-	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
-	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
-
-	movq      mmD,mmA
-	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
-	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
-	movq      mmH,mmC
-	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
-	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
-
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st16
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
-	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
-	sub	ecx, byte SIZEOF_MMWORD
-	jz	short .nextrow
-
-	add	esi, byte SIZEOF_MMWORD			; inptr0
-	add	ebx, byte SIZEOF_MMWORD			; inptr1
-	add	edx, byte SIZEOF_MMWORD			; inptr2
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st16:
-	cmp	ecx, byte SIZEOF_MMWORD/2
-	jb	short .column_st8
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
-	movq	mmA,mmC
-	movq	mmD,mmH
-	sub	ecx, byte SIZEOF_MMWORD/2
-	add	edi, byte 2*SIZEOF_MMWORD
-.column_st8:
-	cmp	ecx, byte SIZEOF_MMWORD/4
-	jb	short .column_st4
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	mmA,mmD
-	sub	ecx, byte SIZEOF_MMWORD/4
-	add	edi, byte 1*SIZEOF_MMWORD
-.column_st4:
-	cmp	ecx, byte SIZEOF_MMWORD/8
-	jb	short .nextrow
-	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	alignx	16,7
-
-.nextrow:
-	pop	ecx
-	pop	esi
-	pop	ebx
-	pop	edx
-	pop	edi
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW
-	add	ebx, byte SIZEOF_JSAMPROW
-	add	edx, byte SIZEOF_JSAMPROW
-	add	edi, byte SIZEOF_JSAMPROW	; output_buf
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm
deleted file mode 100644
index a5ae01b..0000000
--- a/simd/jdclrss2-64.asm
+++ /dev/null
@@ -1,441 +0,0 @@
-;
-; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009, 2012 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-				
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                             JSAMPARRAY output_buf, int num_rows)
-;
-
-; r10 = JDIMENSION out_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION input_row
-; r13 = JSAMPARRAY output_buf
-; r14 = int num_rows
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-	push	rbx
-
-	mov	rcx, r10	; num_cols
-	test	rcx,rcx
-	jz	near .return
-
-	push	rcx
-
-	mov	rdi, r11
-	mov	rcx, r12
-	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-	lea	rsi, [rsi+rcx*SIZEOF_JSAMPROW]
-	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]
-	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
-	pop	rcx
-
-	mov	rdi, r13
-	mov	eax, r14d
-	test	rax,rax
-	jle	near .return
-.rowloop:
-	push	rax
-	push	rdi
-	push	rdx
-	push	rbx
-	push	rsi
-	push	rcx			; col
-
-	mov	rsi, JSAMPROW [rsi]	; inptr0
-	mov	rbx, JSAMPROW [rbx]	; inptr1
-	mov	rdx, JSAMPROW [rdx]	; inptr2
-	mov	rdi, JSAMPROW [rdi]	; outptr
-.columnloop:
-
-	movdqa	xmm5, XMMWORD [rbx]	; xmm5=Cb(0123456789ABCDEF)
-	movdqa	xmm1, XMMWORD [rdx]	; xmm1=Cr(0123456789ABCDEF)
-
-	pcmpeqw	xmm4,xmm4
-	pcmpeqw	xmm7,xmm7
-	psrlw	xmm4,BYTE_BIT
-	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
-	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
-	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
-	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
-	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
-
-	paddw	xmm4,xmm7
-	paddw	xmm5,xmm7
-	paddw	xmm0,xmm7
-	paddw	xmm1,xmm7
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movdqa	xmm2,xmm4		; xmm2=CbE
-	movdqa	xmm3,xmm5		; xmm3=CbO
-	paddw	xmm4,xmm4		; xmm4=2*CbE
-	paddw	xmm5,xmm5		; xmm5=2*CbO
-	movdqa	xmm6,xmm0		; xmm6=CrE
-	movdqa	xmm7,xmm1		; xmm7=CrO
-	paddw	xmm0,xmm0		; xmm0=2*CrE
-	paddw	xmm1,xmm1		; xmm1=2*CrO
-
-	pmulhw	xmm4,[rel PW_MF0228]	; xmm4=(2*CbE * -FIX(0.22800))
-	pmulhw	xmm5,[rel PW_MF0228]	; xmm5=(2*CbO * -FIX(0.22800))
-	pmulhw	xmm0,[rel PW_F0402]	; xmm0=(2*CrE * FIX(0.40200))
-	pmulhw	xmm1,[rel PW_F0402]	; xmm1=(2*CrO * FIX(0.40200))
-
-	paddw	xmm4,[rel PW_ONE]
-	paddw	xmm5,[rel PW_ONE]
-	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
-	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
-	paddw	xmm0,[rel PW_ONE]
-	paddw	xmm1,[rel PW_ONE]
-	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
-	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
-
-	paddw	xmm4,xmm2
-	paddw	xmm5,xmm3
-	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
-	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
-	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
-	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
-
-	movdqa    xmm4,xmm2
-	movdqa    xmm5,xmm3
-	punpcklwd xmm2,xmm6
-	punpckhwd xmm4,xmm6
-	pmaddwd   xmm2,[rel PW_MF0344_F0285]
-	pmaddwd   xmm4,[rel PW_MF0344_F0285]
-	punpcklwd xmm3,xmm7
-	punpckhwd xmm5,xmm7
-	pmaddwd   xmm3,[rel PW_MF0344_F0285]
-	pmaddwd   xmm5,[rel PW_MF0344_F0285]
-
-	paddd     xmm2,[rel PD_ONEHALF]
-	paddd     xmm4,[rel PD_ONEHALF]
-	psrad     xmm2,SCALEBITS
-	psrad     xmm4,SCALEBITS
-	paddd     xmm3,[rel PD_ONEHALF]
-	paddd     xmm5,[rel PD_ONEHALF]
-	psrad     xmm3,SCALEBITS
-	psrad     xmm5,SCALEBITS
-
-	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-	movdqa    xmm5, XMMWORD [rsi]	; xmm5=Y(0123456789ABCDEF)
-
-	pcmpeqw   xmm4,xmm4
-	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
-	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
-	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
-
-	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
-	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
-	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
-	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
-
-	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
-	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
-	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
-	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
-
-	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
-	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
-	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
-	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-	movdqa    xmmG,xmmA
-	movdqa    xmmH,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-	movdqa    xmmC,xmmD
-	movdqa    xmmB,xmmD
-	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-	movdqa    xmmF,xmmE
-	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-	movdqa    xmmB,xmmE
-	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-	movdqa    xmmB,xmmF
-	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	rdi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	rcx, byte SIZEOF_XMMWORD
-	jz	near .nextrow
-
-	add	rsi, byte SIZEOF_XMMWORD	; inptr0
-	add	rbx, byte SIZEOF_XMMWORD	; inptr1
-	add	rdx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-
-.column_st32:
-	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
-	cmp	rcx, byte 2*SIZEOF_XMMWORD
-	jb	short .column_st16
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmF
-	sub	rcx, byte 2*SIZEOF_XMMWORD
-	jmp	short .column_st15
-.column_st16:
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jb	short .column_st15
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	rcx, byte SIZEOF_XMMWORD
-.column_st15:
-	; Store the lower 8 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_MMWORD
-	jb	short .column_st7
-	movq	XMM_MMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_MMWORD
-	sub	rcx, byte SIZEOF_MMWORD
-	psrldq	xmmA, SIZEOF_MMWORD
-.column_st7:
-	; Store the lower 4 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_DWORD
-	jb	short .column_st3
-	movd	XMM_DWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_DWORD
-	sub	rcx, byte SIZEOF_DWORD
-	psrldq	xmmA, SIZEOF_DWORD
-.column_st3:
-	; Store the lower 2 bytes of rax to the output when it has enough
-	; space.
-	movd	eax, xmmA
-	cmp	rcx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [rdi], ax
-	add	rdi, byte SIZEOF_WORD
-	sub	rcx, byte SIZEOF_WORD
-	shr	rax, 16
-.column_st1:
-	; Store the lower 1 byte of rax to the output when it has enough
-	; space.
-	test	rcx, rcx
-	jz	short .nextrow
-	mov	BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%else
-	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%endif
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-	movdqa    xmmC,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-	movdqa    xmmG,xmmB
-	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	movdqa    xmmH,xmmC
-	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	rdi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-	movdqu	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	rcx, byte SIZEOF_XMMWORD
-	jz	near .nextrow
-
-	add	rsi, byte SIZEOF_XMMWORD	; inptr0
-	add	rbx, byte SIZEOF_XMMWORD	; inptr1
-	add	rdx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-
-.column_st32:
-	cmp	rcx, byte SIZEOF_XMMWORD/2
-	jb	short .column_st16
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmC
-	movdqa	xmmD,xmmH
-	sub	rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
-	cmp	rcx, byte SIZEOF_XMMWORD/4
-	jb	short .column_st15
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
-	; Store two pixels (8 bytes) of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_XMMWORD/8
-	jb	short .column_st7
-	movq	MMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD/8*4
-	sub	rcx, byte SIZEOF_XMMWORD/8
-	psrldq	xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-	; Store one pixel (4 bytes) of xmmA to the output when it has enough
-	; space.
-	test	rcx, rcx
-	jz	short .nextrow
-	movd	XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.nextrow:
-	pop	rcx
-	pop	rsi
-	pop	rbx
-	pop	rdx
-	pop	rdi
-	pop	rax
-
-	add	rsi, byte SIZEOF_JSAMPROW
-	add	rbx, byte SIZEOF_JSAMPROW
-	add	rdx, byte SIZEOF_JSAMPROW
-	add	rdi, byte SIZEOF_JSAMPROW	; output_buf
-	dec	rax				; num_rows
-	jg	near .rowloop
-
-	sfence		; flush the write buffer
-
-.return:
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm
deleted file mode 100644
index 98402c6..0000000
--- a/simd/jdclrss2.asm
+++ /dev/null
@@ -1,460 +0,0 @@
-;
-; jdclrss2.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2012 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-				
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                             JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b)	(b)+8			; JDIMENSION out_width
-%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
-%define input_row(b)	(b)+16		; JDIMENSION input_row
-%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	edi, JSAMPIMAGE [input_buf(eax)]
-	mov	ecx, JDIMENSION [input_row(eax)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
-	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx
-
-	mov	edi, JSAMPARRAY [output_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	push	eax
-	push	edi
-	push	edx
-	push	ebx
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr0
-	mov	ebx, JSAMPROW [ebx]	; inptr1
-	mov	edx, JSAMPROW [edx]	; inptr2
-	mov	edi, JSAMPROW [edi]	; outptr
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-	alignx	16,7
-.columnloop:
-
-	movdqa	xmm5, XMMWORD [ebx]	; xmm5=Cb(0123456789ABCDEF)
-	movdqa	xmm1, XMMWORD [edx]	; xmm1=Cr(0123456789ABCDEF)
-
-	pcmpeqw	xmm4,xmm4
-	pcmpeqw	xmm7,xmm7
-	psrlw	xmm4,BYTE_BIT
-	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
-	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
-	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
-	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
-	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
-
-	paddw	xmm4,xmm7
-	paddw	xmm5,xmm7
-	paddw	xmm0,xmm7
-	paddw	xmm1,xmm7
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movdqa	xmm2,xmm4		; xmm2=CbE
-	movdqa	xmm3,xmm5		; xmm3=CbO
-	paddw	xmm4,xmm4		; xmm4=2*CbE
-	paddw	xmm5,xmm5		; xmm5=2*CbO
-	movdqa	xmm6,xmm0		; xmm6=CrE
-	movdqa	xmm7,xmm1		; xmm7=CrO
-	paddw	xmm0,xmm0		; xmm0=2*CrE
-	paddw	xmm1,xmm1		; xmm1=2*CrO
-
-	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbE * -FIX(0.22800))
-	pmulhw	xmm5,[GOTOFF(eax,PW_MF0228)]	; xmm5=(2*CbO * -FIX(0.22800))
-	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrE * FIX(0.40200))
-	pmulhw	xmm1,[GOTOFF(eax,PW_F0402)]	; xmm1=(2*CrO * FIX(0.40200))
-
-	paddw	xmm4,[GOTOFF(eax,PW_ONE)]
-	paddw	xmm5,[GOTOFF(eax,PW_ONE)]
-	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
-	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
-	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
-	paddw	xmm1,[GOTOFF(eax,PW_ONE)]
-	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
-	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
-
-	paddw	xmm4,xmm2
-	paddw	xmm5,xmm3
-	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
-	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
-	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
-	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
-
-	movdqa    xmm4,xmm2
-	movdqa    xmm5,xmm3
-	punpcklwd xmm2,xmm6
-	punpckhwd xmm4,xmm6
-	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
-	punpcklwd xmm3,xmm7
-	punpckhwd xmm5,xmm7
-	pmaddwd   xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
-	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     xmm4,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     xmm2,SCALEBITS
-	psrad     xmm4,SCALEBITS
-	paddd     xmm3,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     xmm3,SCALEBITS
-	psrad     xmm5,SCALEBITS
-
-	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-	movdqa    xmm5, XMMWORD [esi]	; xmm5=Y(0123456789ABCDEF)
-
-	pcmpeqw   xmm4,xmm4
-	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
-	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
-	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
-
-	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
-	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
-	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
-	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
-
-	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
-	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
-	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
-	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
-
-	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
-	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
-	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
-	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-	movdqa    xmmG,xmmA
-	movdqa    xmmH,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-	movdqa    xmmC,xmmD
-	movdqa    xmmB,xmmD
-	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-	movdqa    xmmF,xmmE
-	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-	movdqa    xmmB,xmmE
-	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-	movdqa    xmmB,xmmF
-	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	edi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	ecx, byte SIZEOF_XMMWORD
-	jz	near .nextrow
-
-	add	esi, byte SIZEOF_XMMWORD	; inptr0
-	add	ebx, byte SIZEOF_XMMWORD	; inptr1
-	add	edx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st32:
-	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
-	cmp	ecx, byte 2*SIZEOF_XMMWORD
-	jb	short .column_st16
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmF
-	sub	ecx, byte 2*SIZEOF_XMMWORD
-	jmp	short .column_st15
-.column_st16:
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st15
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	ecx, byte SIZEOF_XMMWORD
-.column_st15:
-	; Store the lower 8 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st7
-	movq	XMM_MMWORD [edi], xmmA
-	add	edi, byte SIZEOF_MMWORD
-	sub	ecx, byte SIZEOF_MMWORD
-	psrldq	xmmA, SIZEOF_MMWORD
-.column_st7:
-	; Store the lower 4 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_DWORD
-	jb	short .column_st3
-	movd	XMM_DWORD [edi], xmmA
-	add	edi, byte SIZEOF_DWORD
-	sub	ecx, byte SIZEOF_DWORD
-	psrldq	xmmA, SIZEOF_DWORD
-.column_st3:
-	; Store the lower 2 bytes of eax to the output when it has enough
-	; space.
-	movd	eax, xmmA
-	cmp	ecx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [edi], ax
-	add	edi, byte SIZEOF_WORD
-	sub	ecx, byte SIZEOF_WORD
-	shr	eax, 16
-.column_st1:
-	; Store the lower 1 byte of eax to the output when it has enough
-	; space.
-	test	ecx, ecx
-	jz	short .nextrow
-	mov	BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%else
-	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%endif
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-	movdqa    xmmC,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-	movdqa    xmmG,xmmB
-	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	movdqa    xmmH,xmmC
-	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	edi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-	movdqu	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	ecx, byte SIZEOF_XMMWORD
-	jz	near .nextrow
-
-	add	esi, byte SIZEOF_XMMWORD	; inptr0
-	add	ebx, byte SIZEOF_XMMWORD	; inptr1
-	add	edx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st32:
-	cmp	ecx, byte SIZEOF_XMMWORD/2
-	jb	short .column_st16
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmC
-	movdqa	xmmD,xmmH
-	sub	ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
-	cmp	ecx, byte SIZEOF_XMMWORD/4
-	jb	short .column_st15
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-	; Store two pixels (8 bytes) of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_XMMWORD/8
-	jb	short .column_st7
-	movq	XMM_MMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD/8*4
-	sub	ecx, byte SIZEOF_XMMWORD/8
-	psrldq	xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-	; Store one pixel (4 bytes) of xmmA to the output when it has enough
-	; space.
-	test	ecx, ecx
-	jz	short .nextrow
-	movd	XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	alignx	16,7
-
-.nextrow:
-	pop	ecx
-	pop	esi
-	pop	ebx
-	pop	edx
-	pop	edi
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW
-	add	ebx, byte SIZEOF_JSAMPROW
-	add	edx, byte SIZEOF_JSAMPROW
-	add	edi, byte SIZEOF_JSAMPROW	; output_buf
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-	sfence		; flush the write buffer
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdcolmmx.asm b/simd/jdcolmmx.asm
deleted file mode 100644
index 21ca32a..0000000
--- a/simd/jdcolmmx.asm
+++ /dev/null
@@ -1,120 +0,0 @@
-;
-; jdcolmmx.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_ycc_rgb_convert_mmx) PRIVATE
-
-EXTN(jconst_ycc_rgb_convert_mmx):
-
-PW_F0402	times 4 dw  F_0_402
-PW_MF0228	times 4 dw -F_0_228
-PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
-PW_ONE		times 4 dw  1
-PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-
-%include "jdclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
-%include "jdclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
-%include "jdclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
-%include "jdclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
-%include "jdclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
-%include "jdclrmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
-%include "jdclrmmx.asm"
diff --git a/simd/jdcolss2-64.asm b/simd/jdcolss2-64.asm
deleted file mode 100644
index 443734f..0000000
--- a/simd/jdcolss2-64.asm
+++ /dev/null
@@ -1,120 +0,0 @@
-;
-; jdcolss2-64.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
-
-EXTN(jconst_ycc_rgb_convert_sse2):
-
-PW_F0402	times 8 dw  F_0_402
-PW_MF0228	times 8 dw -F_0_228
-PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
-PW_ONE		times 8 dw  1
-PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-
-%include "jdclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdclrss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdclrss2-64.asm"
diff --git a/simd/jdcolss2.asm b/simd/jdcolss2.asm
deleted file mode 100644
index f968cf8..0000000
--- a/simd/jdcolss2.asm
+++ /dev/null
@@ -1,120 +0,0 @@
-;
-; jdcolss2.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
-
-EXTN(jconst_ycc_rgb_convert_sse2):
-
-PW_F0402	times 8 dw  F_0_402
-PW_MF0228	times 8 dw -F_0_228
-PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
-PW_ONE		times 8 dw  1
-PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdclrss2.asm"
diff --git a/simd/jdmermmx.asm b/simd/jdmermmx.asm
deleted file mode 100644
index 76f2f5b..0000000
--- a/simd/jdmermmx.asm
+++ /dev/null
@@ -1,126 +0,0 @@
-;
-; jdmermmx.asm - merged upsampling/color conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_merged_upsample_mmx) PRIVATE
-
-EXTN(jconst_merged_upsample_mmx):
-
-PW_F0402	times 4 dw  F_0_402
-PW_MF0228	times 4 dw -F_0_228
-PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
-PW_ONE		times 4 dw  1
-PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-
-%include "jdmrgmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
-%include "jdmrgmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
-%include "jdmrgmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
-%include "jdmrgmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
-%include "jdmrgmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
-%include "jdmrgmmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
-%include "jdmrgmmx.asm"
diff --git a/simd/jdmerss2-64.asm b/simd/jdmerss2-64.asm
deleted file mode 100644
index 02dd6da..0000000
--- a/simd/jdmerss2-64.asm
+++ /dev/null
@@ -1,126 +0,0 @@
-;
-; jdmerss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_merged_upsample_sse2) PRIVATE
-
-EXTN(jconst_merged_upsample_sse2):
-
-PW_F0402	times 8 dw  F_0_402
-PW_MF0228	times 8 dw -F_0_228
-PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
-PW_ONE		times 8 dw  1
-PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
diff --git a/simd/jdmerss2.asm b/simd/jdmerss2.asm
deleted file mode 100644
index 4fa6f7f..0000000
--- a/simd/jdmerss2.asm
+++ /dev/null
@@ -1,126 +0,0 @@
-;
-; jdmerss2.asm - merged upsampling/color conversion (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_merged_upsample_sse2) PRIVATE
-
-EXTN(jconst_merged_upsample_sse2):
-
-PW_F0402	times 8 dw  F_0_402
-PW_MF0228	times 8 dw -F_0_228
-PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
-PW_ONE		times 8 dw  1
-PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-
-%include "jdmrgss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgss2.asm"
diff --git a/simd/jdmrgmmx.asm b/simd/jdmrgmmx.asm
deleted file mode 100644
index bfa4c86..0000000
--- a/simd/jdmrgmmx.asm
+++ /dev/null
@@ -1,464 +0,0 @@
-;
-; jdmrgmmx.asm - merged upsampling/color conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
-;                                 JSAMPIMAGE input_buf,
-;                                 JDIMENSION in_row_group_ctr,
-;                                 JSAMPARRAY output_buf);
-;
-
-%define output_width(b)	(b)+8			; JDIMENSION output_width
-%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
-%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		3
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_merged_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v1_merged_upsample_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [output_width(eax)]	; col
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	edi, JSAMPIMAGE [input_buf(eax)]
-	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	mov	edi, JSAMPARRAY [output_buf(eax)]
-	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
-	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
-	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
-	mov	edi, JSAMPROW [edi]				; outptr
-
-	pop	ecx			; col
-
-	alignx	16,7
-.columnloop:
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	movq      mm6, MMWORD [ebx]	; mm6=Cb(01234567)
-	movq      mm7, MMWORD [edx]	; mm7=Cr(01234567)
-
-	pxor      mm1,mm1		; mm1=(all 0's)
-	pcmpeqw   mm3,mm3
-	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
-
-	movq      mm4,mm6
-	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
-	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
-	movq      mm0,mm7
-	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
-	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL
-
-	paddw     mm6,mm3
-	paddw     mm4,mm3
-	paddw     mm7,mm3
-	paddw     mm0,mm3
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movq	mm5,mm6			; mm5=CbH
-	movq	mm2,mm4			; mm2=CbL
-	paddw	mm6,mm6			; mm6=2*CbH
-	paddw	mm4,mm4			; mm4=2*CbL
-	movq	mm1,mm7			; mm1=CrH
-	movq	mm3,mm0			; mm3=CrL
-	paddw	mm7,mm7			; mm7=2*CrH
-	paddw	mm0,mm0			; mm0=2*CrL
-
-	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
-	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
-	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
-	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))
-
-	paddw	mm6,[GOTOFF(eax,PW_ONE)]
-	paddw	mm4,[GOTOFF(eax,PW_ONE)]
-	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
-	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
-	paddw	mm7,[GOTOFF(eax,PW_ONE)]
-	paddw	mm0,[GOTOFF(eax,PW_ONE)]
-	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
-	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))
-
-	paddw	mm6,mm5
-	paddw	mm4,mm2
-	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
-	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
-	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
-	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L
-
-	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
-	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H
-
-	movq      mm6,mm5
-	movq      mm7,mm2
-	punpcklwd mm5,mm1
-	punpckhwd mm6,mm1
-	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
-	punpcklwd mm2,mm3
-	punpckhwd mm7,mm3
-	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
-	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     mm5,SCALEBITS
-	psrad     mm6,SCALEBITS
-	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     mm2,SCALEBITS
-	psrad     mm7,SCALEBITS
-
-	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H
-
-	mov	al,2			; Yctr
-	jmp	short .Yloop_1st
-	alignx	16,7
-
-.Yloop_2nd:
-	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
-	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
-	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
-	alignx	16,7
-
-.Yloop_1st:
-	movq	mm7, MMWORD [esi]	; mm7=Y(01234567)
-
-	pcmpeqw	mm6,mm6
-	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
-	pand	mm6,mm7			; mm6=Y(0246)=YE
-	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO
-
-	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
-	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
-	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)
-
-	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
-	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
-	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
-	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
-
-	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
-	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
-	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
-	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
-
-	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
-	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
-	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
-	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
-	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
-	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
-	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
-
-	movq      mmG,mmA
-	movq      mmH,mmA
-	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
-	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
-
-	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
-	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
-
-	movq      mmC,mmD
-	movq      mmB,mmD
-	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
-	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
-
-	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
-
-	movq      mmF,mmE
-	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
-	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
-
-	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
-	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
-	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
-
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st16
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
-	sub	ecx, byte SIZEOF_MMWORD
-	jz	near .endcolumn
-
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
-	add	esi, byte SIZEOF_MMWORD			; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	ebx, byte SIZEOF_MMWORD			; inptr1
-	add	edx, byte SIZEOF_MMWORD			; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st16:
-	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
-	cmp	ecx, byte 2*SIZEOF_MMWORD
-	jb	short .column_st8
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
-	movq	mmA,mmC
-	sub	ecx, byte 2*SIZEOF_MMWORD
-	add	edi, byte 2*SIZEOF_MMWORD
-	jmp	short .column_st4
-.column_st8:
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st4
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	mmA,mmE
-	sub	ecx, byte SIZEOF_MMWORD
-	add	edi, byte SIZEOF_MMWORD
-.column_st4:
-	movd	eax,mmA
-	cmp	ecx, byte SIZEOF_DWORD
-	jb	short .column_st2
-	mov	DWORD [edi+0*SIZEOF_DWORD], eax
-	psrlq	mmA,DWORD_BIT
-	movd	eax,mmA
-	sub	ecx, byte SIZEOF_DWORD
-	add	edi, byte SIZEOF_DWORD
-.column_st2:
-	cmp	ecx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [edi+0*SIZEOF_WORD], ax
-	shr	eax,WORD_BIT
-	sub	ecx, byte SIZEOF_WORD
-	add	edi, byte SIZEOF_WORD
-.column_st1:
-	cmp	ecx, byte SIZEOF_BYTE
-	jb	short .endcolumn
-	mov	BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
-	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
-	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
-	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
-	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
-	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
-	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
-	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
-	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
-
-	movq      mmC,mmA
-	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
-	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
-	movq      mmG,mmB
-	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
-	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
-
-	movq      mmD,mmA
-	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
-	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
-	movq      mmH,mmC
-	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
-	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
-
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st16
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
-	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
-	sub	ecx, byte SIZEOF_MMWORD
-	jz	short .endcolumn
-
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
-	add	esi, byte SIZEOF_MMWORD			; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	ebx, byte SIZEOF_MMWORD			; inptr1
-	add	edx, byte SIZEOF_MMWORD			; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st16:
-	cmp	ecx, byte SIZEOF_MMWORD/2
-	jb	short .column_st8
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
-	movq	mmA,mmC
-	movq	mmD,mmH
-	sub	ecx, byte SIZEOF_MMWORD/2
-	add	edi, byte 2*SIZEOF_MMWORD
-.column_st8:
-	cmp	ecx, byte SIZEOF_MMWORD/4
-	jb	short .column_st4
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	mmA,mmD
-	sub	ecx, byte SIZEOF_MMWORD/4
-	add	edi, byte 1*SIZEOF_MMWORD
-.column_st4:
-	cmp	ecx, byte SIZEOF_MMWORD/8
-	jb	short .endcolumn
-	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
-;                                 JSAMPIMAGE input_buf,
-;                                 JDIMENSION in_row_group_ctr,
-;                                 JSAMPARRAY output_buf);
-;
-
-%define output_width(b)	(b)+8			; JDIMENSION output_width
-%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
-%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
-
-	align	16
-	global	EXTN(jsimd_h2v2_merged_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v2_merged_upsample_mmx):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	eax, JDIMENSION [output_width(ebp)]
-
-	mov	edi, JSAMPIMAGE [input_buf(ebp)]
-	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	mov	edi, JSAMPARRAY [output_buf(ebp)]
-	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
-
-	push	edx			; inptr2
-	push	ebx			; inptr1
-	push	esi			; inptr00
-	mov	ebx,esp
-
-	push	edi			; output_buf (outptr0)
-	push	ecx			; in_row_group_ctr
-	push	ebx			; input_buf
-	push	eax			; output_width
-
-	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-	add	esi, byte SIZEOF_JSAMPROW	; inptr01
-	add	edi, byte SIZEOF_JSAMPROW	; outptr1
-	mov	POINTER [ebx+0*SIZEOF_POINTER], esi
-	mov	POINTER [ebx-1*SIZEOF_POINTER], edi
-
-	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-	add	esp, byte 7*SIZEOF_DWORD
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm
deleted file mode 100644
index 8c98a62..0000000
--- a/simd/jdmrgss2-64.asm
+++ /dev/null
@@ -1,538 +0,0 @@
-;
-; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009, 2012 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-				
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		3
-
-	align	16
-	global	EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-	push	rbx
-
-	mov	rcx, r10	; col
-	test	rcx,rcx
-	jz	near .return
-
-	push	rcx
-
-	mov	rdi, r11
-	mov	rcx, r12
-	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-	mov	rdi, r13
-	mov	rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]		; inptr0
-	mov	rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]		; inptr1
-	mov	rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]		; inptr2
-	mov	rdi, JSAMPROW [rdi]				; outptr
-
-	pop	rcx			; col
-
-.columnloop:
-
-	movdqa    xmm6, XMMWORD [rbx]	; xmm6=Cb(0123456789ABCDEF)
-	movdqa    xmm7, XMMWORD [rdx]	; xmm7=Cr(0123456789ABCDEF)
-
-	pxor      xmm1,xmm1		; xmm1=(all 0's)
-	pcmpeqw   xmm3,xmm3
-	psllw     xmm3,7		; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-	movdqa    xmm4,xmm6
-	punpckhbw xmm6,xmm1		; xmm6=Cb(89ABCDEF)=CbH
-	punpcklbw xmm4,xmm1		; xmm4=Cb(01234567)=CbL
-	movdqa    xmm0,xmm7
-	punpckhbw xmm7,xmm1		; xmm7=Cr(89ABCDEF)=CrH
-	punpcklbw xmm0,xmm1		; xmm0=Cr(01234567)=CrL
-
-	paddw     xmm6,xmm3
-	paddw     xmm4,xmm3
-	paddw     xmm7,xmm3
-	paddw     xmm0,xmm3
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movdqa	xmm5,xmm6		; xmm5=CbH
-	movdqa	xmm2,xmm4		; xmm2=CbL
-	paddw	xmm6,xmm6		; xmm6=2*CbH
-	paddw	xmm4,xmm4		; xmm4=2*CbL
-	movdqa	xmm1,xmm7		; xmm1=CrH
-	movdqa	xmm3,xmm0		; xmm3=CrL
-	paddw	xmm7,xmm7		; xmm7=2*CrH
-	paddw	xmm0,xmm0		; xmm0=2*CrL
-
-	pmulhw	xmm6,[rel PW_MF0228]	; xmm6=(2*CbH * -FIX(0.22800))
-	pmulhw	xmm4,[rel PW_MF0228]	; xmm4=(2*CbL * -FIX(0.22800))
-	pmulhw	xmm7,[rel PW_F0402]	; xmm7=(2*CrH * FIX(0.40200))
-	pmulhw	xmm0,[rel PW_F0402]	; xmm0=(2*CrL * FIX(0.40200))
-
-	paddw	xmm6,[rel PW_ONE]
-	paddw	xmm4,[rel PW_ONE]
-	psraw	xmm6,1			; xmm6=(CbH * -FIX(0.22800))
-	psraw	xmm4,1			; xmm4=(CbL * -FIX(0.22800))
-	paddw	xmm7,[rel PW_ONE]
-	paddw	xmm0,[rel PW_ONE]
-	psraw	xmm7,1			; xmm7=(CrH * FIX(0.40200))
-	psraw	xmm0,1			; xmm0=(CrL * FIX(0.40200))
-
-	paddw	xmm6,xmm5
-	paddw	xmm4,xmm2
-	paddw	xmm6,xmm5		; xmm6=(CbH * FIX(1.77200))=(B-Y)H
-	paddw	xmm4,xmm2		; xmm4=(CbL * FIX(1.77200))=(B-Y)L
-	paddw	xmm7,xmm1		; xmm7=(CrH * FIX(1.40200))=(R-Y)H
-	paddw	xmm0,xmm3		; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
-	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=(B-Y)H
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(R-Y)H
-
-	movdqa    xmm6,xmm5
-	movdqa    xmm7,xmm2
-	punpcklwd xmm5,xmm1
-	punpckhwd xmm6,xmm1
-	pmaddwd   xmm5,[rel PW_MF0344_F0285]
-	pmaddwd   xmm6,[rel PW_MF0344_F0285]
-	punpcklwd xmm2,xmm3
-	punpckhwd xmm7,xmm3
-	pmaddwd   xmm2,[rel PW_MF0344_F0285]
-	pmaddwd   xmm7,[rel PW_MF0344_F0285]
-
-	paddd     xmm5,[rel PD_ONEHALF]
-	paddd     xmm6,[rel PD_ONEHALF]
-	psrad     xmm5,SCALEBITS
-	psrad     xmm6,SCALEBITS
-	paddd     xmm2,[rel PD_ONEHALF]
-	paddd     xmm7,[rel PD_ONEHALF]
-	psrad     xmm2,SCALEBITS
-	psrad     xmm7,SCALEBITS
-
-	packssdw  xmm5,xmm6	; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-	packssdw  xmm2,xmm7	; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-	psubw     xmm5,xmm1	; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-	psubw     xmm2,xmm3	; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-	movdqa	XMMWORD [wk(2)], xmm5	; wk(2)=(G-Y)H
-
-	mov	al,2			; Yctr
-	jmp	short .Yloop_1st
-
-.Yloop_2nd:
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(R-Y)H
-	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(G-Y)H
-	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(B-Y)H
-
-.Yloop_1st:
-	movdqa	xmm7, XMMWORD [rsi]	; xmm7=Y(0123456789ABCDEF)
-
-	pcmpeqw	xmm6,xmm6
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-	pand	xmm6,xmm7		; xmm6=Y(02468ACE)=YE
-	psrlw	xmm7,BYTE_BIT		; xmm7=Y(13579BDF)=YO
-
-	movdqa	xmm1,xmm0		; xmm1=xmm0=(R-Y)(L/H)
-	movdqa	xmm3,xmm2		; xmm3=xmm2=(G-Y)(L/H)
-	movdqa	xmm5,xmm4		; xmm5=xmm4=(B-Y)(L/H)
-
-	paddw     xmm0,xmm6		; xmm0=((R-Y)+YE)=RE=R(02468ACE)
-	paddw     xmm1,xmm7		; xmm1=((R-Y)+YO)=RO=R(13579BDF)
-	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
-	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
-
-	paddw     xmm2,xmm6		; xmm2=((G-Y)+YE)=GE=G(02468ACE)
-	paddw     xmm3,xmm7		; xmm3=((G-Y)+YO)=GO=G(13579BDF)
-	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
-	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
-
-	paddw     xmm4,xmm6		; xmm4=((B-Y)+YE)=BE=B(02468ACE)
-	paddw     xmm5,xmm7		; xmm5=((B-Y)+YO)=BO=B(13579BDF)
-	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
-	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-	movdqa    xmmG,xmmA
-	movdqa    xmmH,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-	movdqa    xmmC,xmmD
-	movdqa    xmmB,xmmD
-	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-	movdqa    xmmF,xmmE
-	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-	movdqa    xmmB,xmmE
-	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-	movdqa    xmmB,xmmF
-	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	rdi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	rcx, byte SIZEOF_XMMWORD
-	jz	near .endcolumn
-
-	add	rsi, byte SIZEOF_XMMWORD	; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	rbx, byte SIZEOF_XMMWORD	; inptr1
-	add	rdx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-
-.column_st32:
-	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
-	cmp	rcx, byte 2*SIZEOF_XMMWORD
-	jb	short .column_st16
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmF
-	sub	rcx, byte 2*SIZEOF_XMMWORD
-	jmp	short .column_st15
-.column_st16:
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jb	short .column_st15
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	rcx, byte SIZEOF_XMMWORD
-.column_st15:
-	; Store the lower 8 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_MMWORD
-	jb	short .column_st7
-	movq	XMM_MMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_MMWORD
-	sub	rcx, byte SIZEOF_MMWORD
-	psrldq	xmmA, SIZEOF_MMWORD
-.column_st7:
-	; Store the lower 4 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_DWORD
-	jb	short .column_st3
-	movd	XMM_DWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_DWORD
-	sub	rcx, byte SIZEOF_DWORD
-	psrldq	xmmA, SIZEOF_DWORD
-.column_st3:
-	; Store the lower 2 bytes of rax to the output when it has enough
-	; space.
-	movd	eax, xmmA
-	cmp	rcx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [rdi], ax
-	add	rdi, byte SIZEOF_WORD
-	sub	rcx, byte SIZEOF_WORD
-	shr	rax, 16
-.column_st1:
-	; Store the lower 1 byte of rax to the output when it has enough
-	; space.
-	test	rcx, rcx
-	jz	short .endcolumn
-	mov	BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%else
-	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%endif
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-	movdqa    xmmC,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-	movdqa    xmmG,xmmB
-	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	movdqa    xmmH,xmmC
-	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	rdi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-	movdqu	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	rcx, byte SIZEOF_XMMWORD
-	jz	near .endcolumn
-
-	add	rsi, byte SIZEOF_XMMWORD	; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	rbx, byte SIZEOF_XMMWORD	; inptr1
-	add	rdx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-
-.column_st32:
-	cmp	rcx, byte SIZEOF_XMMWORD/2
-	jb	short .column_st16
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmC
-	movdqa	xmmD,xmmH
-	sub	rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
-	cmp	rcx, byte SIZEOF_XMMWORD/4
-	jb	short .column_st15
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
-	; Store two pixels (8 bytes) of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_XMMWORD/8
-	jb	short .column_st7
-	movq	XMM_MMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD/8*4
-	sub	rcx, byte SIZEOF_XMMWORD/8
-	psrldq	xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-	; Store one pixel (4 bytes) of xmmA to the output when it has enough
-	; space.
-	test	rcx, rcx
-	jz	short .endcolumn
-	movd	XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-	sfence		; flush the write buffer
-
-.return:
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
-	align	16
-	global	EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-	push	rbx
-
-	mov	rax, r10
-
-	mov	rdi, r11
-	mov	rcx, r12
-	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-	mov	rdi, r13
-	lea	rsi, [rsi+rcx*SIZEOF_JSAMPROW]
-
-	push	rdx			; inptr2
-	push	rbx			; inptr1
-	push	rsi			; inptr00
-	mov	rbx,rsp
-
-	push	rdi
-	push	rcx
-	push	rax
-
-	%ifdef WIN64
-	mov r8, rcx
-	mov r9, rdi
-	mov rcx, rax
-	mov rdx, rbx
-	%else
-	mov rdx, rcx
-	mov rcx, rdi
-	mov	rdi, rax
-	mov rsi, rbx
-	%endif
-
-	call	EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-	pop rax
-	pop rcx
-	pop rdi
-	pop rsi
-	pop rbx
-	pop rdx
-
-	add	rdi, byte SIZEOF_JSAMPROW	; outptr1
-	add	rsi, byte SIZEOF_JSAMPROW	; inptr01
-
-	push	rdx			; inptr2
-	push	rbx			; inptr1
-	push	rsi			; inptr00
-	mov	rbx,rsp
-
-	push	rdi
-	push	rcx
-	push	rax
-
-	%ifdef WIN64
-	mov r8, rcx
-	mov r9, rdi
-	mov rcx, rax
-	mov rdx, rbx
-	%else
-	mov rdx, rcx
-	mov rcx, rdi
-	mov	rdi, rax
-	mov rsi, rbx
-	%endif
-
-	call	EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-	pop rax
-	pop rcx
-	pop rdi
-	pop rsi
-	pop rbx
-	pop rdx
-
-	pop	rbx
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
deleted file mode 100644
index 1fd15ba..0000000
--- a/simd/jdmrgss2.asm
+++ /dev/null
@@ -1,519 +0,0 @@
-;
-; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2012 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-				
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-%define output_width(b)	(b)+8			; JDIMENSION output_width
-%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
-%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		3
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [output_width(eax)]	; col
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	edi, JSAMPIMAGE [input_buf(eax)]
-	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	mov	edi, JSAMPARRAY [output_buf(eax)]
-	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
-	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
-	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
-	mov	edi, JSAMPROW [edi]				; outptr
-
-	pop	ecx			; col
-
-	alignx	16,7
-.columnloop:
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	movdqa    xmm6, XMMWORD [ebx]	; xmm6=Cb(0123456789ABCDEF)
-	movdqa    xmm7, XMMWORD [edx]	; xmm7=Cr(0123456789ABCDEF)
-
-	pxor      xmm1,xmm1		; xmm1=(all 0's)
-	pcmpeqw   xmm3,xmm3
-	psllw     xmm3,7		; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-	movdqa    xmm4,xmm6
-	punpckhbw xmm6,xmm1		; xmm6=Cb(89ABCDEF)=CbH
-	punpcklbw xmm4,xmm1		; xmm4=Cb(01234567)=CbL
-	movdqa    xmm0,xmm7
-	punpckhbw xmm7,xmm1		; xmm7=Cr(89ABCDEF)=CrH
-	punpcklbw xmm0,xmm1		; xmm0=Cr(01234567)=CrL
-
-	paddw     xmm6,xmm3
-	paddw     xmm4,xmm3
-	paddw     xmm7,xmm3
-	paddw     xmm0,xmm3
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movdqa	xmm5,xmm6		; xmm5=CbH
-	movdqa	xmm2,xmm4		; xmm2=CbL
-	paddw	xmm6,xmm6		; xmm6=2*CbH
-	paddw	xmm4,xmm4		; xmm4=2*CbL
-	movdqa	xmm1,xmm7		; xmm1=CrH
-	movdqa	xmm3,xmm0		; xmm3=CrL
-	paddw	xmm7,xmm7		; xmm7=2*CrH
-	paddw	xmm0,xmm0		; xmm0=2*CrL
-
-	pmulhw	xmm6,[GOTOFF(eax,PW_MF0228)]	; xmm6=(2*CbH * -FIX(0.22800))
-	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbL * -FIX(0.22800))
-	pmulhw	xmm7,[GOTOFF(eax,PW_F0402)]	; xmm7=(2*CrH * FIX(0.40200))
-	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrL * FIX(0.40200))
-
-	paddw	xmm6,[GOTOFF(eax,PW_ONE)]
-	paddw	xmm4,[GOTOFF(eax,PW_ONE)]
-	psraw	xmm6,1			; xmm6=(CbH * -FIX(0.22800))
-	psraw	xmm4,1			; xmm4=(CbL * -FIX(0.22800))
-	paddw	xmm7,[GOTOFF(eax,PW_ONE)]
-	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
-	psraw	xmm7,1			; xmm7=(CrH * FIX(0.40200))
-	psraw	xmm0,1			; xmm0=(CrL * FIX(0.40200))
-
-	paddw	xmm6,xmm5
-	paddw	xmm4,xmm2
-	paddw	xmm6,xmm5		; xmm6=(CbH * FIX(1.77200))=(B-Y)H
-	paddw	xmm4,xmm2		; xmm4=(CbL * FIX(1.77200))=(B-Y)L
-	paddw	xmm7,xmm1		; xmm7=(CrH * FIX(1.40200))=(R-Y)H
-	paddw	xmm0,xmm3		; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
-	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=(B-Y)H
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(R-Y)H
-
-	movdqa    xmm6,xmm5
-	movdqa    xmm7,xmm2
-	punpcklwd xmm5,xmm1
-	punpckhwd xmm6,xmm1
-	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
-	punpcklwd xmm2,xmm3
-	punpckhwd xmm7,xmm3
-	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
-	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     xmm5,SCALEBITS
-	psrad     xmm6,SCALEBITS
-	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     xmm2,SCALEBITS
-	psrad     xmm7,SCALEBITS
-
-	packssdw  xmm5,xmm6	; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-	packssdw  xmm2,xmm7	; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-	psubw     xmm5,xmm1	; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-	psubw     xmm2,xmm3	; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-	movdqa	XMMWORD [wk(2)], xmm5	; wk(2)=(G-Y)H
-
-	mov	al,2			; Yctr
-	jmp	short .Yloop_1st
-	alignx	16,7
-
-.Yloop_2nd:
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(R-Y)H
-	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(G-Y)H
-	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(B-Y)H
-	alignx	16,7
-
-.Yloop_1st:
-	movdqa	xmm7, XMMWORD [esi]	; xmm7=Y(0123456789ABCDEF)
-
-	pcmpeqw	xmm6,xmm6
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-	pand	xmm6,xmm7		; xmm6=Y(02468ACE)=YE
-	psrlw	xmm7,BYTE_BIT		; xmm7=Y(13579BDF)=YO
-
-	movdqa	xmm1,xmm0		; xmm1=xmm0=(R-Y)(L/H)
-	movdqa	xmm3,xmm2		; xmm3=xmm2=(G-Y)(L/H)
-	movdqa	xmm5,xmm4		; xmm5=xmm4=(B-Y)(L/H)
-
-	paddw     xmm0,xmm6		; xmm0=((R-Y)+YE)=RE=R(02468ACE)
-	paddw     xmm1,xmm7		; xmm1=((R-Y)+YO)=RO=R(13579BDF)
-	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
-	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
-
-	paddw     xmm2,xmm6		; xmm2=((G-Y)+YE)=GE=G(02468ACE)
-	paddw     xmm3,xmm7		; xmm3=((G-Y)+YO)=GO=G(13579BDF)
-	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
-	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
-
-	paddw     xmm4,xmm6		; xmm4=((B-Y)+YE)=BE=B(02468ACE)
-	paddw     xmm5,xmm7		; xmm5=((B-Y)+YO)=BO=B(13579BDF)
-	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
-	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-	movdqa    xmmG,xmmA
-	movdqa    xmmH,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-	movdqa    xmmC,xmmD
-	movdqa    xmmB,xmmD
-	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-	movdqa    xmmF,xmmE
-	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-	movdqa    xmmB,xmmE
-	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-	movdqa    xmmB,xmmF
-	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	edi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	ecx, byte SIZEOF_XMMWORD
-	jz	near .endcolumn
-
-	add	esi, byte SIZEOF_XMMWORD	; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	ebx, byte SIZEOF_XMMWORD	; inptr1
-	add	edx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st32:
-	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
-	cmp	ecx, byte 2*SIZEOF_XMMWORD
-	jb	short .column_st16
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmF
-	sub	ecx, byte 2*SIZEOF_XMMWORD
-	jmp	short .column_st15
-.column_st16:
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st15
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	ecx, byte SIZEOF_XMMWORD
-.column_st15:
-	; Store the lower 8 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st7
-	movq	XMM_MMWORD [edi], xmmA
-	add	edi, byte SIZEOF_MMWORD
-	sub	ecx, byte SIZEOF_MMWORD
-	psrldq	xmmA, SIZEOF_MMWORD
-.column_st7:
-	; Store the lower 4 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_DWORD
-	jb	short .column_st3
-	movd	XMM_DWORD [edi], xmmA
-	add	edi, byte SIZEOF_DWORD
-	sub	ecx, byte SIZEOF_DWORD
-	psrldq	xmmA, SIZEOF_DWORD
-.column_st3:
-	; Store the lower 2 bytes of eax to the output when it has enough
-	; space.
-	movd	eax, xmmA
-	cmp	ecx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [edi], ax
-	add	edi, byte SIZEOF_WORD
-	sub	ecx, byte SIZEOF_WORD
-	shr	eax, 16
-.column_st1:
-	; Store the lower 1 byte of eax to the output when it has enough
-	; space.
-	test	ecx, ecx
-	jz	short .endcolumn
-	mov	BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%else
-	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%endif
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-	movdqa    xmmC,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-	movdqa    xmmG,xmmB
-	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	movdqa    xmmH,xmmC
-	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	edi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-	movdqu	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	ecx, byte SIZEOF_XMMWORD
-	jz	near .endcolumn
-
-	add	esi, byte SIZEOF_XMMWORD	; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	ebx, byte SIZEOF_XMMWORD	; inptr1
-	add	edx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st32:
-	cmp	ecx, byte SIZEOF_XMMWORD/2
-	jb	short .column_st16
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmC
-	movdqa	xmmD,xmmH
-	sub	ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
-	cmp	ecx, byte SIZEOF_XMMWORD/4
-	jb	short .column_st15
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-	; Store two pixels (8 bytes) of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_XMMWORD/8
-	jb	short .column_st7
-	movq	XMM_MMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD/8*4
-	sub	ecx, byte SIZEOF_XMMWORD/8
-	psrldq	xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-	; Store one pixel (4 bytes) of xmmA to the output when it has enough
-	; space.
-	test	ecx, ecx
-	jz	short .endcolumn
-	movd	XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-	sfence		; flush the write buffer
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-%define output_width(b)	(b)+8			; JDIMENSION output_width
-%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
-%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
-
-	align	16
-	global	EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	eax, POINTER [output_width(ebp)]
-
-	mov	edi, JSAMPIMAGE [input_buf(ebp)]
-	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	mov	edi, JSAMPARRAY [output_buf(ebp)]
-	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
-
-	push	edx			; inptr2
-	push	ebx			; inptr1
-	push	esi			; inptr00
-	mov	ebx,esp
-
-	push	edi			; output_buf (outptr0)
-	push	ecx			; in_row_group_ctr
-	push	ebx			; input_buf
-	push	eax			; output_width
-
-	call	near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-	add	esi, byte SIZEOF_JSAMPROW	; inptr01
-	add	edi, byte SIZEOF_JSAMPROW	; outptr1
-	mov	POINTER [ebx+0*SIZEOF_POINTER], esi
-	mov	POINTER [ebx-1*SIZEOF_POINTER], edi
-
-	call	near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-	add	esp, byte 7*SIZEOF_DWORD
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdsammmx.asm b/simd/jdsammmx.asm
deleted file mode 100644
index d92a8c9..0000000
--- a/simd/jdsammmx.asm
+++ /dev/null
@@ -1,737 +0,0 @@
-;
-; jdsammmx.asm - upsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fancy_upsample_mmx) PRIVATE
-
-EXTN(jconst_fancy_upsample_mmx):
-
-PW_ONE		times 4 dw  1
-PW_TWO		times 4 dw  2
-PW_THREE	times 4 dw  3
-PW_SEVEN	times 4 dw  7
-PW_EIGHT	times 4 dw  8
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
-;                                JDIMENSION downsampled_width,
-;                                JSAMPARRAY input_data,
-;                                JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v1_fancy_upsample_mmx):
-	push	ebp
-	mov	ebp,esp
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
-	test	eax,eax
-	jz	near .return
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	near .return
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	eax			; colctr
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr
-
-	test	eax, SIZEOF_MMWORD-1
-	jz	short .skip
-	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-.skip:
-	pxor	mm0,mm0			; mm0=(all 0's)
-	pcmpeqb	mm7,mm7
-	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
-	pand	mm7, MMWORD [esi+0*SIZEOF_MMWORD]
-
-	add	eax, byte SIZEOF_MMWORD-1
-	and	eax, byte -SIZEOF_MMWORD
-	cmp	eax, byte SIZEOF_MMWORD
-	ja	short .columnloop
-	alignx	16,7
-
-.columnloop_last:
-	pcmpeqb	mm6,mm6
-	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
-	pand	mm6, MMWORD [esi+0*SIZEOF_MMWORD]
-	jmp	short .upsample
-	alignx	16,7
-
-.columnloop:
-	movq	mm6, MMWORD [esi+1*SIZEOF_MMWORD]
-	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
-
-.upsample:
-	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mm2,mm1
-	movq	mm3,mm1			; mm1=( 0 1 2 3 4 5 6 7)
-	psllq	mm2,BYTE_BIT		; mm2=( - 0 1 2 3 4 5 6)
-	psrlq	mm3,BYTE_BIT		; mm3=( 1 2 3 4 5 6 7 -)
-
-	por	mm2,mm7			; mm2=(-1 0 1 2 3 4 5 6)
-	por	mm3,mm6			; mm3=( 1 2 3 4 5 6 7 8)
-
-	movq	mm7,mm1
-	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=( 7 - - - - - - -)
-
-	movq      mm4,mm1
-	punpcklbw mm1,mm0		; mm1=( 0 1 2 3)
-	punpckhbw mm4,mm0		; mm4=( 4 5 6 7)
-	movq      mm5,mm2
-	punpcklbw mm2,mm0		; mm2=(-1 0 1 2)
-	punpckhbw mm5,mm0		; mm5=( 3 4 5 6)
-	movq      mm6,mm3
-	punpcklbw mm3,mm0		; mm3=( 1 2 3 4)
-	punpckhbw mm6,mm0		; mm6=( 5 6 7 8)
-
-	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]
-	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
-	paddw	mm2,[GOTOFF(ebx,PW_ONE)]
-	paddw	mm5,[GOTOFF(ebx,PW_ONE)]
-	paddw	mm3,[GOTOFF(ebx,PW_TWO)]
-	paddw	mm6,[GOTOFF(ebx,PW_TWO)]
-
-	paddw	mm2,mm1
-	paddw	mm5,mm4
-	psrlw	mm2,2			; mm2=OutLE=( 0  2  4  6)
-	psrlw	mm5,2			; mm5=OutHE=( 8 10 12 14)
-	paddw	mm3,mm1
-	paddw	mm6,mm4
-	psrlw	mm3,2			; mm3=OutLO=( 1  3  5  7)
-	psrlw	mm6,2			; mm6=OutHO=( 9 11 13 15)
-
-	psllw	mm3,BYTE_BIT
-	psllw	mm6,BYTE_BIT
-	por	mm2,mm3			; mm2=OutL=( 0  1  2  3  4  5  6  7)
-	por	mm5,mm6			; mm5=OutH=( 8  9 10 11 12 13 14 15)
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm5
-
-	sub	eax, byte SIZEOF_MMWORD
-	add	esi, byte 1*SIZEOF_MMWORD	; inptr
-	add	edi, byte 2*SIZEOF_MMWORD	; outptr
-	cmp	eax, byte SIZEOF_MMWORD
-	ja	near .columnloop
-	test	eax,eax
-	jnz	near .columnloop_last
-
-	pop	esi
-	pop	edi
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	ecx				; rowctr
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
-;                                JDIMENSION downsampled_width,
-;                                JSAMPARRAY input_data,
-;                                JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		4
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v2_fancy_upsample_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	edx,eax				; edx = original ebp
-	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
-	test	eax,eax
-	jz	near .return
-
-	mov	ecx, INT [max_v_samp(edx)]	; rowctr
-	test	ecx,ecx
-	jz	near .return
-
-	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
-	mov	edi, POINTER [output_data_ptr(edx)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	eax					; colctr
-	push	ecx
-	push	edi
-	push	esi
-
-	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
-
-	test	eax, SIZEOF_MMWORD-1
-	jz	short .skip
-	push	edx
-	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-	pop	edx
-.skip:
-	; -- process the first column block
-
-	movq	mm0, MMWORD [ebx+0*SIZEOF_MMWORD]	; mm0=row[ 0][0]
-	movq	mm1, MMWORD [ecx+0*SIZEOF_MMWORD]	; mm1=row[-1][0]
-	movq	mm2, MMWORD [esi+0*SIZEOF_MMWORD]	; mm2=row[+1][0]
-
-	pushpic	ebx
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pxor      mm3,mm3		; mm3=(all 0's)
-	movq      mm4,mm0
-	punpcklbw mm0,mm3		; mm0=row[ 0][0]( 0 1 2 3)
-	punpckhbw mm4,mm3		; mm4=row[ 0][0]( 4 5 6 7)
-	movq      mm5,mm1
-	punpcklbw mm1,mm3		; mm1=row[-1][0]( 0 1 2 3)
-	punpckhbw mm5,mm3		; mm5=row[-1][0]( 4 5 6 7)
-	movq      mm6,mm2
-	punpcklbw mm2,mm3		; mm2=row[+1][0]( 0 1 2 3)
-	punpckhbw mm6,mm3		; mm6=row[+1][0]( 4 5 6 7)
-
-	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
-	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
-
-	pcmpeqb	mm7,mm7
-	psrlq	mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
-
-	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
-	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
-	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
-	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
-
-	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; temporarily save
-	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5	; the intermediate data
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm6
-
-	pand	mm1,mm7			; mm1=( 0 - - -)
-	pand	mm2,mm7			; mm2=( 0 - - -)
-
-	movq	MMWORD [wk(0)], mm1
-	movq	MMWORD [wk(1)], mm2
-
-	poppic	ebx
-
-	add	eax, byte SIZEOF_MMWORD-1
-	and	eax, byte -SIZEOF_MMWORD
-	cmp	eax, byte SIZEOF_MMWORD
-	ja	short .columnloop
-	alignx	16,7
-
-.columnloop_last:
-	; -- process the last column block
-
-	pushpic	ebx
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pcmpeqb	mm1,mm1
-	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
-	movq	mm2,mm1
-
-	pand	mm1, MMWORD [edx+1*SIZEOF_MMWORD]	; mm1=( - - - 7)
-	pand	mm2, MMWORD [edi+1*SIZEOF_MMWORD]	; mm2=( - - - 7)
-
-	movq	MMWORD [wk(2)], mm1
-	movq	MMWORD [wk(3)], mm2
-
-	jmp	short .upsample
-	alignx	16,7
-
-.columnloop:
-	; -- process the next column block
-
-	movq	mm0, MMWORD [ebx+1*SIZEOF_MMWORD]	; mm0=row[ 0][1]
-	movq	mm1, MMWORD [ecx+1*SIZEOF_MMWORD]	; mm1=row[-1][1]
-	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; mm2=row[+1][1]
-
-	pushpic	ebx
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pxor      mm3,mm3		; mm3=(all 0's)
-	movq      mm4,mm0
-	punpcklbw mm0,mm3		; mm0=row[ 0][1]( 0 1 2 3)
-	punpckhbw mm4,mm3		; mm4=row[ 0][1]( 4 5 6 7)
-	movq      mm5,mm1
-	punpcklbw mm1,mm3		; mm1=row[-1][1]( 0 1 2 3)
-	punpckhbw mm5,mm3		; mm5=row[-1][1]( 4 5 6 7)
-	movq      mm6,mm2
-	punpcklbw mm2,mm3		; mm2=row[+1][1]( 0 1 2 3)
-	punpckhbw mm6,mm3		; mm6=row[+1][1]( 4 5 6 7)
-
-	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
-	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
-
-	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
-	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
-	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
-	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
-
-	movq	MMWORD [edx+2*SIZEOF_MMWORD], mm1	; temporarily save
-	movq	MMWORD [edx+3*SIZEOF_MMWORD], mm5	; the intermediate data
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
-	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm6
-
-	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=( - - - 0)
-	psllq	mm2,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm2=( - - - 0)
-
-	movq	MMWORD [wk(2)], mm1
-	movq	MMWORD [wk(3)], mm2
-
-.upsample:
-	; -- process the upper row
-
-	movq	mm7, MMWORD [edx+0*SIZEOF_MMWORD]	; mm7=Int0L=( 0 1 2 3)
-	movq	mm3, MMWORD [edx+1*SIZEOF_MMWORD]	; mm3=Int0H=( 4 5 6 7)
-
-	movq	mm0,mm7
-	movq	mm4,mm3
-	psrlq	mm0,2*BYTE_BIT			; mm0=( 1 2 3 -)
-	psllq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( - - - 4)
-	movq	mm5,mm7
-	movq	mm6,mm3
-	psrlq	mm5,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm5=( 3 - - -)
-	psllq	mm6,2*BYTE_BIT			; mm6=( - 4 5 6)
-
-	por	mm0,mm4				; mm0=( 1 2 3 4)
-	por	mm5,mm6				; mm5=( 3 4 5 6)
-
-	movq	mm1,mm7
-	movq	mm2,mm3
-	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
-	psrlq	mm2,2*BYTE_BIT			; mm2=( 5 6 7 -)
-	movq	mm4,mm3
-	psrlq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( 7 - - -)
-
-	por	mm1, MMWORD [wk(0)]		; mm1=(-1 0 1 2)
-	por	mm2, MMWORD [wk(2)]		; mm2=( 5 6 7 8)
-
-	movq	MMWORD [wk(0)], mm4
-
-	pmullw	mm7,[GOTOFF(ebx,PW_THREE)]
-	pmullw	mm3,[GOTOFF(ebx,PW_THREE)]
-	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	mm5,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	mm0,[GOTOFF(ebx,PW_SEVEN)]
-	paddw	mm2,[GOTOFF(ebx,PW_SEVEN)]
-
-	paddw	mm1,mm7
-	paddw	mm5,mm3
-	psrlw	mm1,4			; mm1=Out0LE=( 0  2  4  6)
-	psrlw	mm5,4			; mm5=Out0HE=( 8 10 12 14)
-	paddw	mm0,mm7
-	paddw	mm2,mm3
-	psrlw	mm0,4			; mm0=Out0LO=( 1  3  5  7)
-	psrlw	mm2,4			; mm2=Out0HO=( 9 11 13 15)
-
-	psllw	mm0,BYTE_BIT
-	psllw	mm2,BYTE_BIT
-	por	mm1,mm0			; mm1=Out0L=( 0  1  2  3  4  5  6  7)
-	por	mm5,mm2			; mm5=Out0H=( 8  9 10 11 12 13 14 15)
-
-	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1
-	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5
-
-	; -- process the lower row
-
-	movq	mm6, MMWORD [edi+0*SIZEOF_MMWORD]	; mm6=Int1L=( 0 1 2 3)
-	movq	mm4, MMWORD [edi+1*SIZEOF_MMWORD]	; mm4=Int1H=( 4 5 6 7)
-
-	movq	mm7,mm6
-	movq	mm3,mm4
-	psrlq	mm7,2*BYTE_BIT			; mm7=( 1 2 3 -)
-	psllq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( - - - 4)
-	movq	mm0,mm6
-	movq	mm2,mm4
-	psrlq	mm0,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm0=( 3 - - -)
-	psllq	mm2,2*BYTE_BIT			; mm2=( - 4 5 6)
-
-	por	mm7,mm3				; mm7=( 1 2 3 4)
-	por	mm0,mm2				; mm0=( 3 4 5 6)
-
-	movq	mm1,mm6
-	movq	mm5,mm4
-	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
-	psrlq	mm5,2*BYTE_BIT			; mm5=( 5 6 7 -)
-	movq	mm3,mm4
-	psrlq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( 7 - - -)
-
-	por	mm1, MMWORD [wk(1)]		; mm1=(-1 0 1 2)
-	por	mm5, MMWORD [wk(3)]		; mm5=( 5 6 7 8)
-
-	movq	MMWORD [wk(1)], mm3
-
-	pmullw	mm6,[GOTOFF(ebx,PW_THREE)]
-	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
-	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	mm0,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	mm7,[GOTOFF(ebx,PW_SEVEN)]
-	paddw	mm5,[GOTOFF(ebx,PW_SEVEN)]
-
-	paddw	mm1,mm6
-	paddw	mm0,mm4
-	psrlw	mm1,4			; mm1=Out1LE=( 0  2  4  6)
-	psrlw	mm0,4			; mm0=Out1HE=( 8 10 12 14)
-	paddw	mm7,mm6
-	paddw	mm5,mm4
-	psrlw	mm7,4			; mm7=Out1LO=( 1  3  5  7)
-	psrlw	mm5,4			; mm5=Out1HO=( 9 11 13 15)
-
-	psllw	mm7,BYTE_BIT
-	psllw	mm5,BYTE_BIT
-	por	mm1,mm7			; mm1=Out1L=( 0  1  2  3  4  5  6  7)
-	por	mm0,mm5			; mm0=Out1H=( 8  9 10 11 12 13 14 15)
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm1
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm0
-
-	poppic	ebx
-
-	sub	eax, byte SIZEOF_MMWORD
-	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
-	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
-	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
-	add	edx, byte 2*SIZEOF_MMWORD	; outptr0
-	add	edi, byte 2*SIZEOF_MMWORD	; outptr1
-	cmp	eax, byte SIZEOF_MMWORD
-	ja	near .columnloop
-	test	eax,eax
-	jnz	near .columnloop_last
-
-	pop	esi
-	pop	edi
-	pop	ecx
-	pop	eax
-
-	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	ecx, byte 2			; rowctr
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
-;                          JDIMENSION output_width,
-;                          JSAMPARRAY input_data,
-;                          JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define output_width(b)	(b)+12		; JDIMENSION output_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v1_upsample_mmx):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	edx, JDIMENSION [output_width(ebp)]
-	add	edx, byte (2*SIZEOF_MMWORD)-1
-	and	edx, byte -(2*SIZEOF_MMWORD)
-	jz	short .return
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	short .return
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]		; inptr
-	mov	edi, JSAMPROW [edi]		; outptr
-	mov	eax,edx				; colctr
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
-	movq      mm1,mm0
-	punpcklbw mm0,mm0
-	punpckhbw mm1,mm1
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
-	sub	eax, byte 2*SIZEOF_MMWORD
-	jz	short .nextrow
-
-	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
-	movq      mm3,mm2
-	punpcklbw mm2,mm2
-	punpckhbw mm3,mm3
-
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
-	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
-	sub	eax, byte 2*SIZEOF_MMWORD
-	jz	short .nextrow
-
-	add	esi, byte 2*SIZEOF_MMWORD	; inptr
-	add	edi, byte 4*SIZEOF_MMWORD	; outptr
-	jmp	short .columnloop
-	alignx	16,7
-
-.nextrow:
-	pop	esi
-	pop	edi
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	ecx				; rowctr
-	jg	short .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
-;                          JDIMENSION output_width,
-;                          JSAMPARRAY input_data,
-;                          JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define output_width(b)	(b)+12		; JDIMENSION output_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v2_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v2_upsample_mmx):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	edx, JDIMENSION [output_width(ebp)]
-	add	edx, byte (2*SIZEOF_MMWORD)-1
-	and	edx, byte -(2*SIZEOF_MMWORD)
-	jz	near .return
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	short .return
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]			; inptr
-	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
-	mov	eax,edx					; colctr
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
-	movq      mm1,mm0
-	punpcklbw mm0,mm0
-	punpckhbw mm1,mm1
-
-	movq	MMWORD [ebx+0*SIZEOF_MMWORD], mm0
-	movq	MMWORD [ebx+1*SIZEOF_MMWORD], mm1
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
-	sub	eax, byte 2*SIZEOF_MMWORD
-	jz	short .nextrow
-
-	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
-	movq      mm3,mm2
-	punpcklbw mm2,mm2
-	punpckhbw mm3,mm3
-
-	movq	MMWORD [ebx+2*SIZEOF_MMWORD], mm2
-	movq	MMWORD [ebx+3*SIZEOF_MMWORD], mm3
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
-	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
-	sub	eax, byte 2*SIZEOF_MMWORD
-	jz	short .nextrow
-
-	add	esi, byte 2*SIZEOF_MMWORD	; inptr
-	add	ebx, byte 4*SIZEOF_MMWORD	; outptr0
-	add	edi, byte 4*SIZEOF_MMWORD	; outptr1
-	jmp	short .columnloop
-	alignx	16,7
-
-.nextrow:
-	pop	esi
-	pop	edi
-
-	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	ecx, byte 2			; rowctr
-	jg	short .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdsamss2-64.asm b/simd/jdsamss2-64.asm
deleted file mode 100644
index 73577fd..0000000
--- a/simd/jdsamss2-64.asm
+++ /dev/null
@@ -1,671 +0,0 @@
-;
-; jdsamss2-64.asm - upsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fancy_upsample_sse2) PRIVATE
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE		times 8 dw  1
-PW_TWO		times 8 dw  2
-PW_THREE	times 8 dw  3
-PW_SEVEN	times 8 dw  7
-PW_EIGHT	times 8 dw  8
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-
-	mov	rax, r11  ; colctr
-	test	rax,rax
-	jz	near .return
-
-	mov	rcx, r10	; rowctr
-	test	rcx,rcx
-	jz	near .return
-
-	mov	rsi, r12	; input_data
-	mov	rdi, r13
-	mov	rdi, JSAMPARRAY [rdi]			; output_data
-.rowloop:
-	push	rax			; colctr
-	push	rdi
-	push	rsi
-
-	mov	rsi, JSAMPROW [rsi]	; inptr
-	mov	rdi, JSAMPROW [rdi]	; outptr
-
-	test	rax, SIZEOF_XMMWORD-1
-	jz	short .skip
-	mov	dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-.skip:
-	pxor	xmm0,xmm0		; xmm0=(all 0's)
-	pcmpeqb	xmm7,xmm7
-	psrldq	xmm7,(SIZEOF_XMMWORD-1)
-	pand	xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
-	add	rax, byte SIZEOF_XMMWORD-1
-	and	rax, byte -SIZEOF_XMMWORD
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	short .columnloop
-
-.columnloop_last:
-	pcmpeqb	xmm6,xmm6
-	pslldq	xmm6,(SIZEOF_XMMWORD-1)
-	pand	xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	jmp	short .upsample
-
-.columnloop:
-	movdqa	xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	pslldq	xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
-	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqa	xmm2,xmm1
-	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
-	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
-	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
-
-	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
-	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
-
-	movdqa	xmm7,xmm1
-	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
-
-	movdqa    xmm4,xmm1
-	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm2
-	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
-	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
-	movdqa    xmm6,xmm3
-	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
-	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
-
-	pmullw	xmm1,[rel PW_THREE]
-	pmullw	xmm4,[rel PW_THREE]
-	paddw	xmm2,[rel PW_ONE]
-	paddw	xmm5,[rel PW_ONE]
-	paddw	xmm3,[rel PW_TWO]
-	paddw	xmm6,[rel PW_TWO]
-
-	paddw	xmm2,xmm1
-	paddw	xmm5,xmm4
-	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
-	paddw	xmm3,xmm1
-	paddw	xmm6,xmm4
-	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm3,BYTE_BIT
-	psllw	xmm6,BYTE_BIT
-	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
-	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
-
-	sub	rax, byte SIZEOF_XMMWORD
-	add	rsi, byte 1*SIZEOF_XMMWORD	; inptr
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	near .columnloop
-	test	eax,eax
-	jnz	near .columnloop_last
-
-	pop	rsi
-	pop	rdi
-	pop	rax
-
-	add	rsi, byte SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte SIZEOF_JSAMPROW	; output_data
-	dec	rcx				; rowctr
-	jg	near .rowloop
-
-.return:
-	uncollect_args
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		4
-
-	align	16
-	global	EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-	push	rbx
-
-	mov	rax, r11  ; colctr
-	test	rax,rax
-	jz	near .return
-
-	mov	rcx, r10	; rowctr
-	test	rcx,rcx
-	jz	near .return
-
-	mov	rsi, r12	; input_data
-	mov	rdi, r13
-	mov	rdi, JSAMPARRAY [rdi]			; output_data
-.rowloop:
-	push	rax					; colctr
-	push	rcx
-	push	rdi
-	push	rsi
-
-	mov	rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]	; inptr1(above)
-	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1(below)
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]	; outptr1
-
-	test	rax, SIZEOF_XMMWORD-1
-	jz	short .skip
-	push	rdx
-	mov	dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-	pop	rdx
-.skip:
-	; -- process the first column block
-
-	movdqa	xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
-	movdqa	xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
-	movdqa	xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
-
-	pxor      xmm3,xmm3		; xmm3=(all 0's)
-	movdqa    xmm4,xmm0
-	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm1
-	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm6,xmm2
-	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-	pmullw	xmm0,[rel PW_THREE]
-	pmullw	xmm4,[rel PW_THREE]
-
-	pcmpeqb	xmm7,xmm7
-	psrldq	xmm7,(SIZEOF_XMMWORD-2)
-
-	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-	movdqa	XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
-	movdqa	XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
-
-	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
-	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
-
-	movdqa	XMMWORD [wk(0)], xmm1
-	movdqa	XMMWORD [wk(1)], xmm2
-
-	add	rax, byte SIZEOF_XMMWORD-1
-	and	rax, byte -SIZEOF_XMMWORD
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	short .columnloop
-
-.columnloop_last:
-	; -- process the last column block
-
-	pcmpeqb	xmm1,xmm1
-	pslldq	xmm1,(SIZEOF_XMMWORD-2)
-	movdqa	xmm2,xmm1
-
-	pand	xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-	pand	xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
-	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
-	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
-
-	jmp	near .upsample
-
-.columnloop:
-	; -- process the next column block
-
-	movdqa	xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
-	movdqa	xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
-	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
-
-	pxor      xmm3,xmm3		; xmm3=(all 0's)
-	movdqa    xmm4,xmm0
-	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm1
-	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm6,xmm2
-	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-	pmullw	xmm0,[rel PW_THREE]
-	pmullw	xmm4,[rel PW_THREE]
-
-	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-	movdqa	XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
-	movdqa	XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
-	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
-
-	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
-	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
-
-	movdqa	XMMWORD [wk(2)], xmm1
-	movdqa	XMMWORD [wk(3)], xmm2
-
-.upsample:
-	; -- process the upper row
-
-	movdqa	xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-	movdqa	xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-
-	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
-	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
-	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
-	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
-	movdqa	xmm5,xmm7
-	movdqa	xmm6,xmm3
-	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
-	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
-
-	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
-	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
-
-	movdqa	xmm1,xmm7
-	movdqa	xmm2,xmm3
-	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
-	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
-	movdqa	xmm4,xmm3
-	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
-
-	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
-	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
-
-	movdqa	XMMWORD [wk(0)], xmm4
-
-	pmullw	xmm7,[rel PW_THREE]
-	pmullw	xmm3,[rel PW_THREE]
-	paddw	xmm1,[rel PW_EIGHT]
-	paddw	xmm5,[rel PW_EIGHT]
-	paddw	xmm0,[rel PW_SEVEN]
-	paddw	xmm2,[rel PW_SEVEN]
-
-	paddw	xmm1,xmm7
-	paddw	xmm5,xmm3
-	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
-	paddw	xmm0,xmm7
-	paddw	xmm2,xmm3
-	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm0,BYTE_BIT
-	psllw	xmm2,BYTE_BIT
-	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
-	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
-	movdqa	XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
-
-	; -- process the lower row
-
-	movdqa	xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
-	movdqa	xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
-	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
-	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
-	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
-	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
-	movdqa	xmm0,xmm6
-	movdqa	xmm2,xmm4
-	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
-	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
-
-	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
-	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
-
-	movdqa	xmm1,xmm6
-	movdqa	xmm5,xmm4
-	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
-	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
-	movdqa	xmm3,xmm4
-	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
-
-	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
-	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
-
-	movdqa	XMMWORD [wk(1)], xmm3
-
-	pmullw	xmm6,[rel PW_THREE]
-	pmullw	xmm4,[rel PW_THREE]
-	paddw	xmm1,[rel PW_EIGHT]
-	paddw	xmm0,[rel PW_EIGHT]
-	paddw	xmm7,[rel PW_SEVEN]
-	paddw	xmm5,[rel PW_SEVEN]
-
-	paddw	xmm1,xmm6
-	paddw	xmm0,xmm4
-	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
-	paddw	xmm7,xmm6
-	paddw	xmm5,xmm4
-	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm7,BYTE_BIT
-	psllw	xmm5,BYTE_BIT
-	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
-	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
-	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
-
-	sub	rax, byte SIZEOF_XMMWORD
-	add	rcx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
-	add	rbx, byte 1*SIZEOF_XMMWORD	; inptr0
-	add	rsi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
-	add	rdx, byte 2*SIZEOF_XMMWORD	; outptr0
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr1
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	near .columnloop
-	test	rax,rax
-	jnz	near .columnloop_last
-
-	pop	rsi
-	pop	rdi
-	pop	rcx
-	pop	rax
-
-	add	rsi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	rcx, byte 2			; rowctr
-	jg	near .rowloop
-
-.return:
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_upsample_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-
-	mov	rdx, r11
-	add	rdx, byte (2*SIZEOF_XMMWORD)-1
-	and	rdx, byte -(2*SIZEOF_XMMWORD)
-	jz	near .return
-
-	mov	rcx, r10	; rowctr
-	test	rcx,rcx
-	jz	short .return
-
-	mov	rsi, r12 ; input_data
-	mov	rdi, r13
-	mov	rdi, JSAMPARRAY [rdi]			; output_data
-.rowloop:
-	push	rdi
-	push	rsi
-
-	mov	rsi, JSAMPROW [rsi]		; inptr
-	mov	rdi, JSAMPROW [rdi]		; outptr
-	mov	rax,rdx				; colctr
-.columnloop:
-
-	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
-	movdqa    xmm1,xmm0
-	punpcklbw xmm0,xmm0
-	punpckhbw xmm1,xmm1
-
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
-	sub	rax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-	movdqa    xmm3,xmm2
-	punpcklbw xmm2,xmm2
-	punpckhbw xmm3,xmm3
-
-	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
-	sub	rax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	rdi, byte 4*SIZEOF_XMMWORD	; outptr
-	jmp	short .columnloop
-
-.nextrow:
-	pop	rsi
-	pop	rdi
-
-	add	rsi, byte SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte SIZEOF_JSAMPROW	; output_data
-	dec	rcx				; rowctr
-	jg	short .rowloop
-
-.return:
-	uncollect_args
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_upsample_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-	push	rbx
-
-	mov	rdx, r11
-	add	rdx, byte (2*SIZEOF_XMMWORD)-1
-	and	rdx, byte -(2*SIZEOF_XMMWORD)
-	jz	near .return
-
-	mov	rcx, r10	; rowctr
-	test	rcx,rcx
-	jz	near .return
-
-	mov	rsi, r12	; input_data
-	mov	rdi, r13
-	mov	rdi, JSAMPARRAY [rdi]			; output_data
-.rowloop:
-	push	rdi
-	push	rsi
-
-	mov	rsi, JSAMPROW [rsi]			; inptr
-	mov	rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]	; outptr1
-	mov	rax,rdx					; colctr
-.columnloop:
-
-	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
-	movdqa    xmm1,xmm0
-	punpcklbw xmm0,xmm0
-	punpckhbw xmm1,xmm1
-
-	movdqa	XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
-	movdqa	XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
-	sub	rax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-	movdqa    xmm3,xmm2
-	punpcklbw xmm2,xmm2
-	punpckhbw xmm3,xmm3
-
-	movdqa	XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
-	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
-	sub	rax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	rbx, byte 4*SIZEOF_XMMWORD	; outptr0
-	add	rdi, byte 4*SIZEOF_XMMWORD	; outptr1
-	jmp	short .columnloop
-
-.nextrow:
-	pop	rsi
-	pop	rdi
-
-	add	rsi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	rcx, byte 2			; rowctr
-	jg	near .rowloop
-
-.return:
-	pop	rbx
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdsamss2.asm b/simd/jdsamss2.asm
deleted file mode 100644
index c91a863..0000000
--- a/simd/jdsamss2.asm
+++ /dev/null
@@ -1,729 +0,0 @@
-;
-; jdsamss2.asm - upsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fancy_upsample_sse2) PRIVATE
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE		times 8 dw  1
-PW_TWO		times 8 dw  2
-PW_THREE	times 8 dw  3
-PW_SEVEN	times 8 dw  7
-PW_EIGHT	times 8 dw  8
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
-	push	ebp
-	mov	ebp,esp
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
-	test	eax,eax
-	jz	near .return
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	near .return
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	eax			; colctr
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr
-
-	test	eax, SIZEOF_XMMWORD-1
-	jz	short .skip
-	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-.skip:
-	pxor	xmm0,xmm0		; xmm0=(all 0's)
-	pcmpeqb	xmm7,xmm7
-	psrldq	xmm7,(SIZEOF_XMMWORD-1)
-	pand	xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
-	add	eax, byte SIZEOF_XMMWORD-1
-	and	eax, byte -SIZEOF_XMMWORD
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	short .columnloop
-	alignx	16,7
-
-.columnloop_last:
-	pcmpeqb	xmm6,xmm6
-	pslldq	xmm6,(SIZEOF_XMMWORD-1)
-	pand	xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	jmp	short .upsample
-	alignx	16,7
-
-.columnloop:
-	movdqa	xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	pslldq	xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
-	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqa	xmm2,xmm1
-	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
-	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
-	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
-
-	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
-	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
-
-	movdqa	xmm7,xmm1
-	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
-
-	movdqa    xmm4,xmm1
-	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm2
-	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
-	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
-	movdqa    xmm6,xmm3
-	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
-	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
-
-	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]
-	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
-	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]
-	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]
-	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]
-	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]
-
-	paddw	xmm2,xmm1
-	paddw	xmm5,xmm4
-	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
-	paddw	xmm3,xmm1
-	paddw	xmm6,xmm4
-	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm3,BYTE_BIT
-	psllw	xmm6,BYTE_BIT
-	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
-	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
-
-	sub	eax, byte SIZEOF_XMMWORD
-	add	esi, byte 1*SIZEOF_XMMWORD	; inptr
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	near .columnloop
-	test	eax,eax
-	jnz	near .columnloop_last
-
-	pop	esi
-	pop	edi
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	ecx				; rowctr
-	jg	near .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		4
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	edx,eax				; edx = original ebp
-	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
-	test	eax,eax
-	jz	near .return
-
-	mov	ecx, INT [max_v_samp(edx)]	; rowctr
-	test	ecx,ecx
-	jz	near .return
-
-	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
-	mov	edi, POINTER [output_data_ptr(edx)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	eax					; colctr
-	push	ecx
-	push	edi
-	push	esi
-
-	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
-
-	test	eax, SIZEOF_XMMWORD-1
-	jz	short .skip
-	push	edx
-	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-	pop	edx
-.skip:
-	; -- process the first column block
-
-	movdqa	xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
-	movdqa	xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
-	movdqa	xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
-
-	pushpic	ebx
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pxor      xmm3,xmm3		; xmm3=(all 0's)
-	movdqa    xmm4,xmm0
-	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm1
-	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm6,xmm2
-	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
-	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
-
-	pcmpeqb	xmm7,xmm7
-	psrldq	xmm7,(SIZEOF_XMMWORD-2)
-
-	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
-	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
-
-	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
-	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
-
-	movdqa	XMMWORD [wk(0)], xmm1
-	movdqa	XMMWORD [wk(1)], xmm2
-
-	poppic	ebx
-
-	add	eax, byte SIZEOF_XMMWORD-1
-	and	eax, byte -SIZEOF_XMMWORD
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	short .columnloop
-	alignx	16,7
-
-.columnloop_last:
-	; -- process the last column block
-
-	pushpic	ebx
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pcmpeqb	xmm1,xmm1
-	pslldq	xmm1,(SIZEOF_XMMWORD-2)
-	movdqa	xmm2,xmm1
-
-	pand	xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
-	pand	xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
-	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
-	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
-
-	jmp	near .upsample
-	alignx	16,7
-
-.columnloop:
-	; -- process the next column block
-
-	movdqa	xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
-	movdqa	xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
-	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
-
-	pushpic	ebx
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pxor      xmm3,xmm3		; xmm3=(all 0's)
-	movdqa    xmm4,xmm0
-	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm1
-	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm6,xmm2
-	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
-	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
-
-	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-	movdqa	XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
-	movdqa	XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
-	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
-
-	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
-	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
-
-	movdqa	XMMWORD [wk(2)], xmm1
-	movdqa	XMMWORD [wk(3)], xmm2
-
-.upsample:
-	; -- process the upper row
-
-	movdqa	xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
-	movdqa	xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
-
-	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
-	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
-	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
-	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
-	movdqa	xmm5,xmm7
-	movdqa	xmm6,xmm3
-	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
-	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
-
-	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
-	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
-
-	movdqa	xmm1,xmm7
-	movdqa	xmm2,xmm3
-	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
-	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
-	movdqa	xmm4,xmm3
-	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
-
-	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
-	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
-
-	movdqa	XMMWORD [wk(0)], xmm4
-
-	pmullw	xmm7,[GOTOFF(ebx,PW_THREE)]
-	pmullw	xmm3,[GOTOFF(ebx,PW_THREE)]
-	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	xmm5,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	xmm0,[GOTOFF(ebx,PW_SEVEN)]
-	paddw	xmm2,[GOTOFF(ebx,PW_SEVEN)]
-
-	paddw	xmm1,xmm7
-	paddw	xmm5,xmm3
-	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
-	paddw	xmm0,xmm7
-	paddw	xmm2,xmm3
-	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm0,BYTE_BIT
-	psllw	xmm2,BYTE_BIT
-	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
-	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
-	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
-
-	; -- process the lower row
-
-	movdqa	xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
-	movdqa	xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
-	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
-	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
-	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
-	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
-	movdqa	xmm0,xmm6
-	movdqa	xmm2,xmm4
-	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
-	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
-
-	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
-	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
-
-	movdqa	xmm1,xmm6
-	movdqa	xmm5,xmm4
-	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
-	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
-	movdqa	xmm3,xmm4
-	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
-
-	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
-	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
-
-	movdqa	XMMWORD [wk(1)], xmm3
-
-	pmullw	xmm6,[GOTOFF(ebx,PW_THREE)]
-	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
-	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	xmm0,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	xmm7,[GOTOFF(ebx,PW_SEVEN)]
-	paddw	xmm5,[GOTOFF(ebx,PW_SEVEN)]
-
-	paddw	xmm1,xmm6
-	paddw	xmm0,xmm4
-	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
-	paddw	xmm7,xmm6
-	paddw	xmm5,xmm4
-	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm7,BYTE_BIT
-	psllw	xmm5,BYTE_BIT
-	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
-	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
-	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
-
-	poppic	ebx
-
-	sub	eax, byte SIZEOF_XMMWORD
-	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
-	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0
-	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
-	add	edx, byte 2*SIZEOF_XMMWORD	; outptr0
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr1
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	near .columnloop
-	test	eax,eax
-	jnz	near .columnloop_last
-
-	pop	esi
-	pop	edi
-	pop	ecx
-	pop	eax
-
-	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	ecx, byte 2			; rowctr
-	jg	near .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define output_width(b)	(b)+12		; JDIMENSION output_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_upsample_sse2):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	edx, JDIMENSION [output_width(ebp)]
-	add	edx, byte (2*SIZEOF_XMMWORD)-1
-	and	edx, byte -(2*SIZEOF_XMMWORD)
-	jz	short .return
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	short .return
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]		; inptr
-	mov	edi, JSAMPROW [edi]		; outptr
-	mov	eax,edx				; colctr
-	alignx	16,7
-.columnloop:
-
-	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
-	movdqa    xmm1,xmm0
-	punpcklbw xmm0,xmm0
-	punpckhbw xmm1,xmm1
-
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
-	sub	eax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-	movdqa    xmm3,xmm2
-	punpcklbw xmm2,xmm2
-	punpckhbw xmm3,xmm3
-
-	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
-	sub	eax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	edi, byte 4*SIZEOF_XMMWORD	; outptr
-	jmp	short .columnloop
-	alignx	16,7
-
-.nextrow:
-	pop	esi
-	pop	edi
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	ecx				; rowctr
-	jg	short .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define output_width(b)	(b)+12		; JDIMENSION output_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_upsample_sse2):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	edx, JDIMENSION [output_width(ebp)]
-	add	edx, byte (2*SIZEOF_XMMWORD)-1
-	and	edx, byte -(2*SIZEOF_XMMWORD)
-	jz	near .return
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	near .return
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]			; inptr
-	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
-	mov	eax,edx					; colctr
-	alignx	16,7
-.columnloop:
-
-	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
-	movdqa    xmm1,xmm0
-	punpcklbw xmm0,xmm0
-	punpckhbw xmm1,xmm1
-
-	movdqa	XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
-	movdqa	XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
-	sub	eax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-	movdqa    xmm3,xmm2
-	punpcklbw xmm2,xmm2
-	punpckhbw xmm3,xmm3
-
-	movdqa	XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
-	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
-	sub	eax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	ebx, byte 4*SIZEOF_XMMWORD	; outptr0
-	add	edi, byte 4*SIZEOF_XMMWORD	; outptr1
-	jmp	short .columnloop
-	alignx	16,7
-
-.nextrow:
-	pop	esi
-	pop	edi
-
-	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	ecx, byte 2			; rowctr
-	jg	short .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jf3dnflt.asm b/simd/jf3dnflt.asm
deleted file mode 100644
index 432e304..0000000
--- a/simd/jf3dnflt.asm
+++ /dev/null
@@ -1,320 +0,0 @@
-;
-; jf3dnflt.asm - floating-point FDCT (3DNow!)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fdct_float_3dnow) PRIVATE
-
-EXTN(jconst_fdct_float_3dnow):
-
-PD_0_382	times 2 dd  0.382683432365089771728460
-PD_0_707	times 2 dd  0.707106781186547524400844
-PD_0_541	times 2 dd  0.541196100146196984399723
-PD_1_306	times 2 dd  1.306562964876376527856643
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
-;
-
-%define data(b)		(b)+8		; FAST_FLOAT * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_float_3dnow) PRIVATE
-
-EXTN(jsimd_fdct_float_3dnow):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
-	mov	ecx, DCTSIZE/2
-	alignx	16,7
-.rowloop:
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
-	; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
-
-	movq      mm4,mm0		; transpose coefficients
-	punpckldq mm0,mm1		; mm0=(00 10)=data0
-	punpckhdq mm4,mm1		; mm4=(01 11)=data1
-	movq      mm5,mm2		; transpose coefficients
-	punpckldq mm2,mm3		; mm2=(06 16)=data6
-	punpckhdq mm5,mm3		; mm5=(07 17)=data7
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
-	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
-	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
-	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
-
-	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
-
-	; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
-
-	movq      mm4,mm1		; transpose coefficients
-	punpckldq mm1,mm3		; mm1=(02 12)=data2
-	punpckhdq mm4,mm3		; mm4=(03 13)=data3
-	movq      mm0,mm2		; transpose coefficients
-	punpckldq mm2,mm5		; mm2=(04 14)=data4
-	punpckhdq mm0,mm5		; mm0=(05 15)=data5
-
-	movq	mm3,mm4
-	movq	mm5,mm1
-	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
-	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
-	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
-	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm2,mm7
-	movq	mm0,mm6
-	pfsub	mm7,mm4			; mm7=tmp13
-	pfsub	mm6,mm1			; mm6=tmp12
-	pfadd	mm2,mm4			; mm2=tmp10
-	pfadd	mm0,mm1			; mm0=tmp11
-
-	pfadd	mm6,mm7
-	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
-	movq	mm4,mm2
-	movq	mm1,mm7
-	pfsub	mm2,mm0			; mm2=data4
-	pfsub	mm7,mm6			; mm7=data6
-	pfadd	mm4,mm0			; mm4=data0
-	pfadd	mm1,mm6			; mm1=data2
-
-	movq	MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
-	movq	MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
-
-	; -- Odd part
-
-	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
-	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
-
-	pfadd	mm3,mm5			; mm3=tmp10
-	pfadd	mm5,mm0			; mm5=tmp11
-	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
-
-	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
-	movq	mm2,mm3			; mm2=tmp10
-	pfsub	mm3,mm0
-	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
-	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
-	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
-	pfadd	mm2,mm3			; mm2=z2
-	pfadd	mm0,mm3			; mm0=z4
-
-	movq	mm7,mm6
-	pfsub	mm6,mm5			; mm6=z13
-	pfadd	mm7,mm5			; mm7=z11
-
-	movq	mm4,mm6
-	movq	mm1,mm7
-	pfsub	mm6,mm2			; mm6=data3
-	pfsub	mm7,mm0			; mm7=data7
-	pfadd	mm4,mm2			; mm4=data5
-	pfadd	mm1,mm0			; mm1=data1
-
-	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
-	movq	MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
-	movq	MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-	add	edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .rowloop
-
-	; ---- Pass 2: process columns.
-
-	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
-	mov	ecx, DCTSIZE/2
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
-	; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
-
-	movq      mm4,mm0		; transpose coefficients
-	punpckldq mm0,mm1		; mm0=(00 01)=data0
-	punpckhdq mm4,mm1		; mm4=(10 11)=data1
-	movq      mm5,mm2		; transpose coefficients
-	punpckldq mm2,mm3		; mm2=(60 61)=data6
-	punpckhdq mm5,mm3		; mm5=(70 71)=data7
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
-	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
-	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
-	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
-
-	movq	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
-	; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
-
-	movq      mm4,mm1		; transpose coefficients
-	punpckldq mm1,mm3		; mm1=(20 21)=data2
-	punpckhdq mm4,mm3		; mm4=(30 31)=data3
-	movq      mm0,mm2		; transpose coefficients
-	punpckldq mm2,mm5		; mm2=(40 41)=data4
-	punpckhdq mm0,mm5		; mm0=(50 51)=data5
-
-	movq	mm3,mm4
-	movq	mm5,mm1
-	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
-	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
-	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
-	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm2,mm7
-	movq	mm0,mm6
-	pfsub	mm7,mm4			; mm7=tmp13
-	pfsub	mm6,mm1			; mm6=tmp12
-	pfadd	mm2,mm4			; mm2=tmp10
-	pfadd	mm0,mm1			; mm0=tmp11
-
-	pfadd	mm6,mm7
-	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
-	movq	mm4,mm2
-	movq	mm1,mm7
-	pfsub	mm2,mm0			; mm2=data4
-	pfsub	mm7,mm6			; mm7=data6
-	pfadd	mm4,mm0			; mm4=data0
-	pfadd	mm1,mm6			; mm1=data2
-
-	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
-	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-	; -- Odd part
-
-	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
-	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
-
-	pfadd	mm3,mm5			; mm3=tmp10
-	pfadd	mm5,mm0			; mm5=tmp11
-	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
-
-	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
-	movq	mm2,mm3			; mm2=tmp10
-	pfsub	mm3,mm0
-	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
-	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
-	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
-	pfadd	mm2,mm3			; mm2=z2
-	pfadd	mm0,mm3			; mm0=z4
-
-	movq	mm7,mm6
-	pfsub	mm6,mm5			; mm6=z13
-	pfadd	mm7,mm5			; mm7=z11
-
-	movq	mm4,mm6
-	movq	mm1,mm7
-	pfsub	mm6,mm2			; mm6=data3
-	pfsub	mm7,mm0			; mm7=data7
-	pfadd	mm4,mm2			; mm4=data5
-	pfadd	mm1,mm0			; mm1=data1
-
-	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
-	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
-	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-	add	edx, byte 2*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .columnloop
-
-	femms		; empty MMX/3DNow! state
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfmmxfst.asm b/simd/jfmmxfst.asm
deleted file mode 100644
index 146e8c3..0000000
--- a/simd/jfmmxfst.asm
+++ /dev/null
@@ -1,397 +0,0 @@
-;
-; jfmmxfst.asm - fast integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382	equ	 98		; FIX(0.382683433)
-F_0_541	equ	139		; FIX(0.541196100)
-F_0_707	equ	181		; FIX(0.707106781)
-F_1_306	equ	334		; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
-F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_fdct_ifast_mmx) PRIVATE
-
-EXTN(jconst_fdct_ifast_mmx):
-
-PW_F0707	times 4 dw  F_0_707 << CONST_SHIFT
-PW_F0382	times 4 dw  F_0_382 << CONST_SHIFT
-PW_F0541	times 4 dw  F_0_541 << CONST_SHIFT
-PW_F1306	times 4 dw  F_1_306 << CONST_SHIFT
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_mmx (DCTELEM * data)
-;
-
-%define data(b)		(b)+8		; DCTELEM * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_ifast_mmx) PRIVATE
-
-EXTN(jsimd_fdct_ifast_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.rowloop:
-
-	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
-	; mm0=(20 21 22 23), mm2=(24 25 26 27)
-	; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
-	movq      mm4,mm0		; transpose coefficients(phase 1)
-	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
-	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
-	movq      mm5,mm2		; transpose coefficients(phase 1)
-	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
-	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
-
-	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
-	; mm6=(00 01 02 03), mm1=(04 05 06 07)
-	; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
-	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
-
-	movq      mm4,mm6		; transpose coefficients(phase 1)
-	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
-	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
-	movq      mm2,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
-	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
-
-	movq      mm7,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
-	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
-	movq      mm3,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
-	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
-
-	movq	mm0,mm7
-	movq	mm5,mm6
-	psubw	mm7,mm2			; mm7=data1-data6=tmp6
-	psubw	mm6,mm3			; mm6=data0-data7=tmp7
-	paddw	mm0,mm2			; mm0=data1+data6=tmp1
-	paddw	mm5,mm3			; mm5=data0+data7=tmp0
-
-	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
-	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
-
-	movq      mm7,mm4		; transpose coefficients(phase 2)
-	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
-	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
-	movq      mm6,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
-	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
-
-	movq	mm2,mm7
-	movq	mm3,mm4
-	paddw	mm7,mm1			; mm7=data3+data4=tmp3
-	paddw	mm4,mm6			; mm4=data2+data5=tmp2
-	psubw	mm2,mm1			; mm2=data3-data4=tmp4
-	psubw	mm3,mm6			; mm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm1,mm5
-	movq	mm6,mm0
-	psubw	mm5,mm7			; mm5=tmp13
-	psubw	mm0,mm4			; mm0=tmp12
-	paddw	mm1,mm7			; mm1=tmp10
-	paddw	mm6,mm4			; mm6=tmp11
-
-	paddw	mm0,mm5
-	psllw	mm0,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
-	movq	mm7,mm1
-	movq	mm4,mm5
-	psubw	mm1,mm6			; mm1=data4
-	psubw	mm5,mm0			; mm5=data6
-	paddw	mm7,mm6			; mm7=data0
-	paddw	mm4,mm0			; mm4=data2
-
-	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
-	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
-	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
-	; -- Odd part
-
-	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
-	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
-
-	paddw	mm2,mm3			; mm2=tmp10
-	paddw	mm3,mm6			; mm3=tmp11
-	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
-
-	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
-	movq	mm1,mm2			; mm1=tmp10
-	psubw	mm2,mm6
-	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
-	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
-	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
-	paddw	mm1,mm2			; mm1=z2
-	paddw	mm6,mm2			; mm6=z4
-
-	movq	mm5,mm0
-	psubw	mm0,mm3			; mm0=z13
-	paddw	mm5,mm3			; mm5=z11
-
-	movq	mm7,mm0
-	movq	mm4,mm5
-	psubw	mm0,mm1			; mm0=data3
-	psubw	mm5,mm6			; mm5=data7
-	paddw	mm7,mm1			; mm7=data5
-	paddw	mm4,mm6			; mm4=data1
-
-	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
-	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
-	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	near .rowloop
-
-	; ---- Pass 2: process columns.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-	; mm0=(02 12 22 32), mm2=(42 52 62 72)
-	; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
-	movq      mm4,mm0		; transpose coefficients(phase 1)
-	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
-	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
-	movq      mm5,mm2		; transpose coefficients(phase 1)
-	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
-	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
-
-	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
-	; mm6=(00 10 20 30), mm1=(40 50 60 70)
-	; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
-	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
-
-	movq      mm4,mm6		; transpose coefficients(phase 1)
-	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
-	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
-	movq      mm2,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
-	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
-
-	movq      mm7,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
-	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
-	movq      mm3,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
-	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
-
-	movq	mm0,mm7
-	movq	mm5,mm6
-	psubw	mm7,mm2			; mm7=data1-data6=tmp6
-	psubw	mm6,mm3			; mm6=data0-data7=tmp7
-	paddw	mm0,mm2			; mm0=data1+data6=tmp1
-	paddw	mm5,mm3			; mm5=data0+data7=tmp0
-
-	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
-	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
-
-	movq      mm7,mm4		; transpose coefficients(phase 2)
-	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
-	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
-	movq      mm6,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
-	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
-
-	movq	mm2,mm7
-	movq	mm3,mm4
-	paddw	mm7,mm1			; mm7=data3+data4=tmp3
-	paddw	mm4,mm6			; mm4=data2+data5=tmp2
-	psubw	mm2,mm1			; mm2=data3-data4=tmp4
-	psubw	mm3,mm6			; mm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm1,mm5
-	movq	mm6,mm0
-	psubw	mm5,mm7			; mm5=tmp13
-	psubw	mm0,mm4			; mm0=tmp12
-	paddw	mm1,mm7			; mm1=tmp10
-	paddw	mm6,mm4			; mm6=tmp11
-
-	paddw	mm0,mm5
-	psllw	mm0,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
-	movq	mm7,mm1
-	movq	mm4,mm5
-	psubw	mm1,mm6			; mm1=data4
-	psubw	mm5,mm0			; mm5=data6
-	paddw	mm7,mm6			; mm7=data0
-	paddw	mm4,mm0			; mm4=data2
-
-	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
-	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
-	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
-	; -- Odd part
-
-	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
-	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
-
-	paddw	mm2,mm3			; mm2=tmp10
-	paddw	mm3,mm6			; mm3=tmp11
-	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
-
-	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
-	movq	mm1,mm2			; mm1=tmp10
-	psubw	mm2,mm6
-	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
-	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
-	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
-	paddw	mm1,mm2			; mm1=z2
-	paddw	mm6,mm2			; mm6=z4
-
-	movq	mm5,mm0
-	psubw	mm0,mm3			; mm0=z13
-	paddw	mm5,mm3			; mm5=z11
-
-	movq	mm7,mm0
-	movq	mm4,mm5
-	psubw	mm0,mm1			; mm0=data3
-	psubw	mm5,mm6			; mm5=data7
-	paddw	mm7,mm1			; mm7=data5
-	paddw	mm4,mm6			; mm4=data1
-
-	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
-	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
-	add	edx, byte 4*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	near .columnloop
-
-	emms		; empty MMX state
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfmmxint.asm b/simd/jfmmxint.asm
deleted file mode 100644
index e5593f8..0000000
--- a/simd/jfmmxint.asm
+++ /dev/null
@@ -1,622 +0,0 @@
-;
-; jfmmxint.asm - accurate integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fdct_islow_mmx) PRIVATE
-
-EXTN(jconst_fdct_islow_mmx):
-
-PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X	times 4 dw  1 << (PASS1_BITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_mmx (DCTELEM * data)
-;
-
-%define data(b)		(b)+8		; DCTELEM * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_islow_mmx) PRIVATE
-
-EXTN(jsimd_fdct_islow_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.rowloop:
-
-	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
-	; mm0=(20 21 22 23), mm2=(24 25 26 27)
-	; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
-	movq      mm4,mm0		; transpose coefficients(phase 1)
-	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
-	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
-	movq      mm5,mm2		; transpose coefficients(phase 1)
-	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
-	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
-
-	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
-	; mm6=(00 01 02 03), mm1=(04 05 06 07)
-	; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
-	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
-
-	movq      mm4,mm6		; transpose coefficients(phase 1)
-	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
-	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
-	movq      mm2,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
-	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
-
-	movq      mm7,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
-	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
-	movq      mm3,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
-	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
-
-	movq	mm0,mm7
-	movq	mm5,mm6
-	psubw	mm7,mm2			; mm7=data1-data6=tmp6
-	psubw	mm6,mm3			; mm6=data0-data7=tmp7
-	paddw	mm0,mm2			; mm0=data1+data6=tmp1
-	paddw	mm5,mm3			; mm5=data0+data7=tmp0
-
-	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
-	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
-
-	movq      mm7,mm4		; transpose coefficients(phase 2)
-	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
-	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
-	movq      mm6,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
-	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
-
-	movq	mm2,mm7
-	movq	mm3,mm4
-	paddw	mm7,mm1			; mm7=data3+data4=tmp3
-	paddw	mm4,mm6			; mm4=data2+data5=tmp2
-	psubw	mm2,mm1			; mm2=data3-data4=tmp4
-	psubw	mm3,mm6			; mm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm1,mm5
-	movq	mm6,mm0
-	paddw	mm5,mm7			; mm5=tmp10
-	paddw	mm0,mm4			; mm0=tmp11
-	psubw	mm1,mm7			; mm1=tmp13
-	psubw	mm6,mm4			; mm6=tmp12
-
-	movq	mm7,mm5
-	paddw	mm5,mm0			; mm5=tmp10+tmp11
-	psubw	mm7,mm0			; mm7=tmp10-tmp11
-
-	psllw	mm5,PASS1_BITS		; mm5=data0
-	psllw	mm7,PASS1_BITS		; mm7=data4
-
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movq      mm4,mm1		; mm1=tmp13
-	movq      mm0,mm1
-	punpcklwd mm4,mm6		; mm6=tmp12
-	punpckhwd mm0,mm6
-	movq      mm1,mm4
-	movq      mm6,mm0
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
-
-	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm4,DESCALE_P1
-	psrad	mm0,DESCALE_P1
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm1,DESCALE_P1
-	psrad	mm6,DESCALE_P1
-
-	packssdw  mm4,mm0		; mm4=data2
-	packssdw  mm1,mm6		; mm1=data6
-
-	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
-
-	; -- Odd part
-
-	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
-	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
-
-	movq	mm0,mm2			; mm2=tmp4
-	movq	mm6,mm3			; mm3=tmp5
-	paddw	mm0,mm5			; mm0=z3
-	paddw	mm6,mm7			; mm6=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movq      mm4,mm0
-	movq      mm1,mm0
-	punpcklwd mm4,mm6
-	punpckhwd mm1,mm6
-	movq      mm0,mm4
-	movq      mm6,mm1
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
-	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movq      mm4,mm2
-	movq      mm1,mm2
-	punpcklwd mm4,mm7
-	punpckhwd mm1,mm7
-	movq      mm2,mm4
-	movq      mm7,mm1
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
-
-	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
-	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
-	paddd	mm2,mm0			; mm2=data1L
-	paddd	mm7,mm6			; mm7=data1H
-
-	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm4,DESCALE_P1
-	psrad	mm1,DESCALE_P1
-	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm2,DESCALE_P1
-	psrad	mm7,DESCALE_P1
-
-	packssdw  mm4,mm1		; mm4=data7
-	packssdw  mm2,mm7		; mm2=data1
-
-	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
-	movq      mm1,mm3
-	movq      mm7,mm3
-	punpcklwd mm1,mm5
-	punpckhwd mm7,mm5
-	movq      mm3,mm1
-	movq      mm5,mm7
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
-	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
-	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
-
-	paddd	mm1,mm0			; mm1=data5L
-	paddd	mm7,mm6			; mm7=data5H
-	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
-	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
-
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm1,DESCALE_P1
-	psrad	mm7,DESCALE_P1
-	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm3,DESCALE_P1
-	psrad	mm5,DESCALE_P1
-
-	packssdw  mm1,mm7		; mm1=data5
-	packssdw  mm3,mm5		; mm3=data3
-
-	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
-	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
-	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	near .rowloop
-
-	; ---- Pass 2: process columns.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-	; mm0=(02 12 22 32), mm2=(42 52 62 72)
-	; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
-	movq      mm4,mm0		; transpose coefficients(phase 1)
-	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
-	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
-	movq      mm5,mm2		; transpose coefficients(phase 1)
-	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
-	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
-
-	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
-	; mm6=(00 10 20 30), mm1=(40 50 60 70)
-	; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
-	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
-
-	movq      mm4,mm6		; transpose coefficients(phase 1)
-	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
-	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
-	movq      mm2,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
-	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
-
-	movq      mm7,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
-	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
-	movq      mm3,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
-	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
-
-	movq	mm0,mm7
-	movq	mm5,mm6
-	psubw	mm7,mm2			; mm7=data1-data6=tmp6
-	psubw	mm6,mm3			; mm6=data0-data7=tmp7
-	paddw	mm0,mm2			; mm0=data1+data6=tmp1
-	paddw	mm5,mm3			; mm5=data0+data7=tmp0
-
-	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
-	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
-
-	movq      mm7,mm4		; transpose coefficients(phase 2)
-	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
-	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
-	movq      mm6,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
-	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
-
-	movq	mm2,mm7
-	movq	mm3,mm4
-	paddw	mm7,mm1			; mm7=data3+data4=tmp3
-	paddw	mm4,mm6			; mm4=data2+data5=tmp2
-	psubw	mm2,mm1			; mm2=data3-data4=tmp4
-	psubw	mm3,mm6			; mm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm1,mm5
-	movq	mm6,mm0
-	paddw	mm5,mm7			; mm5=tmp10
-	paddw	mm0,mm4			; mm0=tmp11
-	psubw	mm1,mm7			; mm1=tmp13
-	psubw	mm6,mm4			; mm6=tmp12
-
-	movq	mm7,mm5
-	paddw	mm5,mm0			; mm5=tmp10+tmp11
-	psubw	mm7,mm0			; mm7=tmp10-tmp11
-
-	paddw	mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
-	paddw	mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
-	psraw	mm5,PASS1_BITS		; mm5=data0
-	psraw	mm7,PASS1_BITS		; mm7=data4
-
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movq      mm4,mm1		; mm1=tmp13
-	movq      mm0,mm1
-	punpcklwd mm4,mm6		; mm6=tmp12
-	punpckhwd mm0,mm6
-	movq      mm1,mm4
-	movq      mm6,mm0
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
-
-	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm4,DESCALE_P2
-	psrad	mm0,DESCALE_P2
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm1,DESCALE_P2
-	psrad	mm6,DESCALE_P2
-
-	packssdw  mm4,mm0		; mm4=data2
-	packssdw  mm1,mm6		; mm1=data6
-
-	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
-
-	; -- Odd part
-
-	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
-	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
-
-	movq	mm0,mm2			; mm2=tmp4
-	movq	mm6,mm3			; mm3=tmp5
-	paddw	mm0,mm5			; mm0=z3
-	paddw	mm6,mm7			; mm6=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movq      mm4,mm0
-	movq      mm1,mm0
-	punpcklwd mm4,mm6
-	punpckhwd mm1,mm6
-	movq      mm0,mm4
-	movq      mm6,mm1
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
-	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movq      mm4,mm2
-	movq      mm1,mm2
-	punpcklwd mm4,mm7
-	punpckhwd mm1,mm7
-	movq      mm2,mm4
-	movq      mm7,mm1
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
-
-	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
-	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
-	paddd	mm2,mm0			; mm2=data1L
-	paddd	mm7,mm6			; mm7=data1H
-
-	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm4,DESCALE_P2
-	psrad	mm1,DESCALE_P2
-	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm2,DESCALE_P2
-	psrad	mm7,DESCALE_P2
-
-	packssdw  mm4,mm1		; mm4=data7
-	packssdw  mm2,mm7		; mm2=data1
-
-	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
-	movq      mm1,mm3
-	movq      mm7,mm3
-	punpcklwd mm1,mm5
-	punpckhwd mm7,mm5
-	movq      mm3,mm1
-	movq      mm5,mm7
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
-	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
-	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
-
-	paddd	mm1,mm0			; mm1=data5L
-	paddd	mm7,mm6			; mm7=data5H
-	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
-	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
-
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm1,DESCALE_P2
-	psrad	mm7,DESCALE_P2
-	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm3,DESCALE_P2
-	psrad	mm5,DESCALE_P2
-
-	packssdw  mm1,mm7		; mm1=data5
-	packssdw  mm3,mm5		; mm3=data3
-
-	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
-	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
-	add	edx, byte 4*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	near .columnloop
-
-	emms		; empty MMX state
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfss2fst-64.asm b/simd/jfss2fst-64.asm
deleted file mode 100644
index 16a62f2..0000000
--- a/simd/jfss2fst-64.asm
+++ /dev/null
@@ -1,392 +0,0 @@
-;
-; jfss2fst-64.asm - fast integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382	equ	 98		; FIX(0.382683433)
-F_0_541	equ	139		; FIX(0.541196100)
-F_0_707	equ	181		; FIX(0.707106781)
-F_1_306	equ	334		; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
-F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_fdct_ifast_sse2) PRIVATE
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707	times 8 dw  F_0_707 << CONST_SHIFT
-PW_F0382	times 8 dw  F_0_382 << CONST_SHIFT
-PW_F0541	times 8 dw  F_0_541 << CONST_SHIFT
-PW_F1306	times 8 dw  F_1_306 << CONST_SHIFT
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM * data)
-;
-
-; r10 = DCTELEM * data
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_ifast_sse2) PRIVATE
-
-EXTN(jsimd_fdct_ifast_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process rows.
-
-	mov	rdx, r10	; (DCTELEM *)
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
-	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
-	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
-	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
-
-	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
-	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(42 52 62 72 43 53 63 73)
-	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=(44 54 64 74 45 55 65 75)
-
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
-	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
-	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
-	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
-	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-	movdqa	xmm6,xmm1
-	movdqa	xmm3,xmm0
-	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
-	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
-	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
-	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm2, XMMWORD [wk(0)]	; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(44 54 64 74 45 55 65 75)
-	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
-
-	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
-	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
-	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
-	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-	movdqa	xmm2,xmm1
-	movdqa	xmm5,xmm7
-	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
-	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
-	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
-	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm0,xmm6
-	psubw	xmm3,xmm1		; xmm3=tmp13
-	psubw	xmm6,xmm7		; xmm6=tmp12
-	paddw	xmm4,xmm1		; xmm4=tmp10
-	paddw	xmm0,xmm7		; xmm0=tmp11
-
-	paddw	xmm6,xmm3
-	psllw	xmm6,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm6,[rel PW_F0707] ; xmm6=z1
-
-	movdqa	xmm1,xmm4
-	movdqa	xmm7,xmm3
-	psubw	xmm4,xmm0		; xmm4=data4
-	psubw	xmm3,xmm6		; xmm3=data6
-	paddw	xmm1,xmm0		; xmm1=data0
-	paddw	xmm7,xmm6		; xmm7=data2
-
-	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=tmp6
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp7
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=data4
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=data6
-
-	; -- Odd part
-
-	paddw	xmm2,xmm5		; xmm2=tmp10
-	paddw	xmm5,xmm0		; xmm5=tmp11
-	paddw	xmm0,xmm6		; xmm0=tmp12, xmm6=tmp7
-
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[rel PW_F0707] ; xmm5=z3
-
-	movdqa	xmm4,xmm2		; xmm4=tmp10
-	psubw	xmm2,xmm0
-	pmulhw	xmm2,[rel PW_F0382] ; xmm2=z5
-	pmulhw	xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-	pmulhw	xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
-	paddw	xmm4,xmm2		; xmm4=z2
-	paddw	xmm0,xmm2		; xmm0=z4
-
-	movdqa	xmm3,xmm6
-	psubw	xmm6,xmm5		; xmm6=z13
-	paddw	xmm3,xmm5		; xmm3=z11
-
-	movdqa	xmm2,xmm6
-	movdqa	xmm5,xmm3
-	psubw	xmm6,xmm4		; xmm6=data3
-	psubw	xmm3,xmm0		; xmm3=data7
-	paddw	xmm2,xmm4		; xmm2=data5
-	paddw	xmm5,xmm0		; xmm5=data1
-
-	; ---- Pass 2: process columns.
-
-	; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
-	; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
-	movdqa    xmm4,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm5		; xmm1=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm4,xmm5		; xmm4=(40 41 50 51 60 61 70 71)
-	movdqa    xmm0,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm6		; xmm7=(02 03 12 13 22 23 32 33)
-	punpckhwd xmm0,xmm6		; xmm0=(42 43 52 53 62 63 72 73)
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=col4
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=col6
-
-	; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
-	; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(02 03 12 13 22 23 32 33)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(42 43 52 53 62 63 72 73)
-
-	movdqa    xmm7,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm2		; xmm5=(04 05 14 15 24 25 34 35)
-	punpckhwd xmm7,xmm2		; xmm7=(44 45 54 55 64 65 74 75)
-	movdqa    xmm0,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm3		; xmm6=(06 07 16 17 26 27 36 37)
-	punpckhwd xmm0,xmm3		; xmm0=(46 47 56 57 66 67 76 77)
-
-	movdqa    xmm2,xmm5		; transpose coefficients(phase 2)
-	punpckldq xmm5,xmm6		; xmm5=(04 05 06 07 14 15 16 17)
-	punpckhdq xmm2,xmm6		; xmm2=(24 25 26 27 34 35 36 37)
-	movdqa    xmm3,xmm7		; transpose coefficients(phase 2)
-	punpckldq xmm7,xmm0		; xmm7=(44 45 46 47 54 55 56 57)
-	punpckhdq xmm3,xmm0		; xmm3=(64 65 66 67 74 75 76 77)
-
-	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=(02 03 12 13 22 23 32 33)
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(42 43 52 53 62 63 72 73)
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(24 25 26 27 34 35 36 37)
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(44 45 46 47 54 55 56 57)
-
-	movdqa    xmm2,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm6		; xmm1=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm2,xmm6		; xmm2=(20 21 22 23 30 31 32 33)
-	movdqa    xmm7,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm0		; xmm4=(40 41 42 43 50 51 52 53)
-	punpckhdq xmm7,xmm0		; xmm7=(60 61 62 63 70 71 72 73)
-
-	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm5		; xmm1=(00 01 02 03 04 05 06 07)=data0
-	punpckhqdq xmm6,xmm5		; xmm6=(10 11 12 13 14 15 16 17)=data1
-	movdqa     xmm0,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm3		; xmm7=(60 61 62 63 64 65 66 67)=data6
-	punpckhqdq xmm0,xmm3		; xmm0=(70 71 72 73 74 75 76 77)=data7
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm3,xmm1
-	psubw	xmm6,xmm7		; xmm6=data1-data6=tmp6
-	psubw	xmm1,xmm0		; xmm1=data0-data7=tmp7
-	paddw	xmm5,xmm7		; xmm5=data1+data6=tmp1
-	paddw	xmm3,xmm0		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(24 25 26 27 34 35 36 37)
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(44 45 46 47 54 55 56 57)
-	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=tmp7
-
-	movdqa     xmm6,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm7		; xmm2=(20 21 22 23 24 25 26 27)=data2
-	punpckhqdq xmm6,xmm7		; xmm6=(30 31 32 33 34 35 36 37)=data3
-	movdqa     xmm1,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm0		; xmm4=(40 41 42 43 44 45 46 47)=data4
-	punpckhqdq xmm1,xmm0		; xmm1=(50 51 52 53 54 55 56 57)=data5
-
-	movdqa	xmm7,xmm6
-	movdqa	xmm0,xmm2
-	paddw	xmm6,xmm4		; xmm6=data3+data4=tmp3
-	paddw	xmm2,xmm1		; xmm2=data2+data5=tmp2
-	psubw	xmm7,xmm4		; xmm7=data3-data4=tmp4
-	psubw	xmm0,xmm1		; xmm0=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm1,xmm5
-	psubw	xmm3,xmm6		; xmm3=tmp13
-	psubw	xmm5,xmm2		; xmm5=tmp12
-	paddw	xmm4,xmm6		; xmm4=tmp10
-	paddw	xmm1,xmm2		; xmm1=tmp11
-
-	paddw	xmm5,xmm3
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[rel PW_F0707] ; xmm5=z1
-
-	movdqa	xmm6,xmm4
-	movdqa	xmm2,xmm3
-	psubw	xmm4,xmm1		; xmm4=data4
-	psubw	xmm3,xmm5		; xmm3=data6
-	paddw	xmm6,xmm1		; xmm6=data0
-	paddw	xmm2,xmm5		; xmm2=data2
-
-	movdqa	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
-	movdqa	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
-	movdqa	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
-
-	; -- Odd part
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=tmp6
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
-
-	paddw	xmm7,xmm0		; xmm7=tmp10
-	paddw	xmm0,xmm1		; xmm0=tmp11
-	paddw	xmm1,xmm5		; xmm1=tmp12, xmm5=tmp7
-
-	psllw	xmm7,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm0,[rel PW_F0707] ; xmm0=z3
-
-	movdqa	xmm4,xmm7		; xmm4=tmp10
-	psubw	xmm7,xmm1
-	pmulhw	xmm7,[rel PW_F0382] ; xmm7=z5
-	pmulhw	xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-	pmulhw	xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
-	paddw	xmm4,xmm7		; xmm4=z2
-	paddw	xmm1,xmm7		; xmm1=z4
-
-	movdqa	xmm3,xmm5
-	psubw	xmm5,xmm0		; xmm5=z13
-	paddw	xmm3,xmm0		; xmm3=z11
-
-	movdqa	xmm6,xmm5
-	movdqa	xmm2,xmm3
-	psubw	xmm5,xmm4		; xmm5=data3
-	psubw	xmm3,xmm1		; xmm3=data7
-	paddw	xmm6,xmm4		; xmm6=data5
-	paddw	xmm2,xmm1		; xmm2=data1
-
-	movdqa	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
-	movdqa	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
-	movdqa	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
-	movdqa	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfss2fst.asm b/simd/jfss2fst.asm
deleted file mode 100644
index 3232db5..0000000
--- a/simd/jfss2fst.asm
+++ /dev/null
@@ -1,404 +0,0 @@
-;
-; jfss2fst.asm - fast integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382	equ	 98		; FIX(0.382683433)
-F_0_541	equ	139		; FIX(0.541196100)
-F_0_707	equ	181		; FIX(0.707106781)
-F_1_306	equ	334		; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
-F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_fdct_ifast_sse2) PRIVATE
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707	times 8 dw  F_0_707 << CONST_SHIFT
-PW_F0382	times 8 dw  F_0_382 << CONST_SHIFT
-PW_F0541	times 8 dw  F_0_541 << CONST_SHIFT
-PW_F1306	times 8 dw  F_1_306 << CONST_SHIFT
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM * data)
-;
-
-%define data(b)		(b)+8		; DCTELEM * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_ifast_sse2) PRIVATE
-
-EXTN(jsimd_fdct_ifast_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
-	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
-	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
-
-	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
-	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(42 52 62 72 43 53 63 73)
-	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=(44 54 64 74 45 55 65 75)
-
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
-	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
-	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
-	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
-	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-	movdqa	xmm6,xmm1
-	movdqa	xmm3,xmm0
-	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
-	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
-	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
-	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm2, XMMWORD [wk(0)]	; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(44 54 64 74 45 55 65 75)
-	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
-
-	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
-	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
-	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
-	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-	movdqa	xmm2,xmm1
-	movdqa	xmm5,xmm7
-	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
-	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
-	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
-	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm0,xmm6
-	psubw	xmm3,xmm1		; xmm3=tmp13
-	psubw	xmm6,xmm7		; xmm6=tmp12
-	paddw	xmm4,xmm1		; xmm4=tmp10
-	paddw	xmm0,xmm7		; xmm0=tmp11
-
-	paddw	xmm6,xmm3
-	psllw	xmm6,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
-
-	movdqa	xmm1,xmm4
-	movdqa	xmm7,xmm3
-	psubw	xmm4,xmm0		; xmm4=data4
-	psubw	xmm3,xmm6		; xmm3=data6
-	paddw	xmm1,xmm0		; xmm1=data0
-	paddw	xmm7,xmm6		; xmm7=data2
-
-	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=tmp6
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp7
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=data4
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=data6
-
-	; -- Odd part
-
-	paddw	xmm2,xmm5		; xmm2=tmp10
-	paddw	xmm5,xmm0		; xmm5=tmp11
-	paddw	xmm0,xmm6		; xmm0=tmp12, xmm6=tmp7
-
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
-
-	movdqa	xmm4,xmm2		; xmm4=tmp10
-	psubw	xmm2,xmm0
-	pmulhw	xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
-	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-	pmulhw	xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
-	paddw	xmm4,xmm2		; xmm4=z2
-	paddw	xmm0,xmm2		; xmm0=z4
-
-	movdqa	xmm3,xmm6
-	psubw	xmm6,xmm5		; xmm6=z13
-	paddw	xmm3,xmm5		; xmm3=z11
-
-	movdqa	xmm2,xmm6
-	movdqa	xmm5,xmm3
-	psubw	xmm6,xmm4		; xmm6=data3
-	psubw	xmm3,xmm0		; xmm3=data7
-	paddw	xmm2,xmm4		; xmm2=data5
-	paddw	xmm5,xmm0		; xmm5=data1
-
-	; ---- Pass 2: process columns.
-
-;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-
-	; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
-	; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
-	movdqa    xmm4,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm5		; xmm1=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm4,xmm5		; xmm4=(40 41 50 51 60 61 70 71)
-	movdqa    xmm0,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm6		; xmm7=(02 03 12 13 22 23 32 33)
-	punpckhwd xmm0,xmm6		; xmm0=(42 43 52 53 62 63 72 73)
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=col4
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=col6
-
-	; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
-	; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(02 03 12 13 22 23 32 33)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(42 43 52 53 62 63 72 73)
-
-	movdqa    xmm7,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm2		; xmm5=(04 05 14 15 24 25 34 35)
-	punpckhwd xmm7,xmm2		; xmm7=(44 45 54 55 64 65 74 75)
-	movdqa    xmm0,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm3		; xmm6=(06 07 16 17 26 27 36 37)
-	punpckhwd xmm0,xmm3		; xmm0=(46 47 56 57 66 67 76 77)
-
-	movdqa    xmm2,xmm5		; transpose coefficients(phase 2)
-	punpckldq xmm5,xmm6		; xmm5=(04 05 06 07 14 15 16 17)
-	punpckhdq xmm2,xmm6		; xmm2=(24 25 26 27 34 35 36 37)
-	movdqa    xmm3,xmm7		; transpose coefficients(phase 2)
-	punpckldq xmm7,xmm0		; xmm7=(44 45 46 47 54 55 56 57)
-	punpckhdq xmm3,xmm0		; xmm3=(64 65 66 67 74 75 76 77)
-
-	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=(02 03 12 13 22 23 32 33)
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(42 43 52 53 62 63 72 73)
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(24 25 26 27 34 35 36 37)
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(44 45 46 47 54 55 56 57)
-
-	movdqa    xmm2,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm6		; xmm1=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm2,xmm6		; xmm2=(20 21 22 23 30 31 32 33)
-	movdqa    xmm7,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm0		; xmm4=(40 41 42 43 50 51 52 53)
-	punpckhdq xmm7,xmm0		; xmm7=(60 61 62 63 70 71 72 73)
-
-	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm5		; xmm1=(00 01 02 03 04 05 06 07)=data0
-	punpckhqdq xmm6,xmm5		; xmm6=(10 11 12 13 14 15 16 17)=data1
-	movdqa     xmm0,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm3		; xmm7=(60 61 62 63 64 65 66 67)=data6
-	punpckhqdq xmm0,xmm3		; xmm0=(70 71 72 73 74 75 76 77)=data7
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm3,xmm1
-	psubw	xmm6,xmm7		; xmm6=data1-data6=tmp6
-	psubw	xmm1,xmm0		; xmm1=data0-data7=tmp7
-	paddw	xmm5,xmm7		; xmm5=data1+data6=tmp1
-	paddw	xmm3,xmm0		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(24 25 26 27 34 35 36 37)
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(44 45 46 47 54 55 56 57)
-	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=tmp7
-
-	movdqa     xmm6,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm7		; xmm2=(20 21 22 23 24 25 26 27)=data2
-	punpckhqdq xmm6,xmm7		; xmm6=(30 31 32 33 34 35 36 37)=data3
-	movdqa     xmm1,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm0		; xmm4=(40 41 42 43 44 45 46 47)=data4
-	punpckhqdq xmm1,xmm0		; xmm1=(50 51 52 53 54 55 56 57)=data5
-
-	movdqa	xmm7,xmm6
-	movdqa	xmm0,xmm2
-	paddw	xmm6,xmm4		; xmm6=data3+data4=tmp3
-	paddw	xmm2,xmm1		; xmm2=data2+data5=tmp2
-	psubw	xmm7,xmm4		; xmm7=data3-data4=tmp4
-	psubw	xmm0,xmm1		; xmm0=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm1,xmm5
-	psubw	xmm3,xmm6		; xmm3=tmp13
-	psubw	xmm5,xmm2		; xmm5=tmp12
-	paddw	xmm4,xmm6		; xmm4=tmp10
-	paddw	xmm1,xmm2		; xmm1=tmp11
-
-	paddw	xmm5,xmm3
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
-
-	movdqa	xmm6,xmm4
-	movdqa	xmm2,xmm3
-	psubw	xmm4,xmm1		; xmm4=data4
-	psubw	xmm3,xmm5		; xmm3=data6
-	paddw	xmm6,xmm1		; xmm6=data0
-	paddw	xmm2,xmm5		; xmm2=data2
-
-	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
-	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
-	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
-
-	; -- Odd part
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=tmp6
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
-
-	paddw	xmm7,xmm0		; xmm7=tmp10
-	paddw	xmm0,xmm1		; xmm0=tmp11
-	paddw	xmm1,xmm5		; xmm1=tmp12, xmm5=tmp7
-
-	psllw	xmm7,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
-
-	movdqa	xmm4,xmm7		; xmm4=tmp10
-	psubw	xmm7,xmm1
-	pmulhw	xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
-	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-	pmulhw	xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
-	paddw	xmm4,xmm7		; xmm4=z2
-	paddw	xmm1,xmm7		; xmm1=z4
-
-	movdqa	xmm3,xmm5
-	psubw	xmm5,xmm0		; xmm5=z13
-	paddw	xmm3,xmm0		; xmm3=z11
-
-	movdqa	xmm6,xmm5
-	movdqa	xmm2,xmm3
-	psubw	xmm5,xmm4		; xmm5=data3
-	psubw	xmm3,xmm1		; xmm3=data7
-	paddw	xmm6,xmm4		; xmm6=data5
-	paddw	xmm2,xmm1		; xmm2=data1
-
-	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
-	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
-	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
-	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfss2int-64.asm b/simd/jfss2int-64.asm
deleted file mode 100644
index 0b710f2..0000000
--- a/simd/jfss2int-64.asm
+++ /dev/null
@@ -1,622 +0,0 @@
-;
-; jfss2int-64.asm - accurate integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-; Right-shift counts used when descaling results in pass 1 and pass 2.
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
-; Fixed-point multipliers: FIX(x) is x scaled by 2^CONST_BITS and rounded.
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants, so each value is given scaled by 2^30 and DESCALE() rounds it down to CONST_BITS fractional bits.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-; Constant vectors for the islow FDCT; every entry below fills one 16-byte xmmword.
-	alignz	16
-	global	EXTN(jconst_fdct_islow_sse2) PRIVATE
-
-EXTN(jconst_fdct_islow_sse2):
-; PW_* are word multiplier pairs repeated 4 times (names give approximate values: Fnnn ~ +n.nn, MFnnn ~ -n.nn); P?_DESCALE_* are rounding terms, 1 << (shift-1).
-PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X	times 8 dw  1 << (PASS1_BITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform the slow-but-accurate integer forward DCT on one block of samples.
-; The block is transformed in place: rows are read from and written back through data.
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM * data)
-; NOTE(review): every row access uses movdqa, so data presumably must be 16-byte aligned.
-
-; r10 = DCTELEM * data  (presumably loaded by collect_args, defined elsewhere -- confirm)
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		6
-; wk(0)..wk(WK_NUM-1) are xmmword scratch slots located just below the aligned rbp.
-	align	16
-	global	EXTN(jsimd_fdct_islow_sse2) PRIVATE
-
-EXTN(jsimd_fdct_islow_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = rsp after the push (address of the saved rbp)
-	sub	rsp, byte 4			; step below the saved rbp before aligning
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax			; keep the unaligned rsp for the epilogue
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]			; rsp = &wk[0]: reserve the WK_NUM scratch slots
-	collect_args
-
-	; ---- Pass 1: process rows.
-
-	mov	rdx, r10	; (DCTELEM *)
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
-	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-	; 8x8 word transpose in three phases: word, dword, then qword interleaves.
-	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
-	; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
-	; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
-	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
-
-	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
-	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
-	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=(42 52 62 72 43 53 63 73)
-	movdqa	XMMWORD [wk(3)], xmm2	; wk(3)=(44 54 64 74 45 55 65 75)
-
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
-	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
-	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
-	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
-	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
-	; Butterfly the mirrored pairs: sums give tmp0..tmp3, differences give tmp4..tmp7.
-	movdqa	xmm6,xmm1
-	movdqa	xmm3,xmm0
-	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
-	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
-	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
-	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa	xmm5, XMMWORD [wk(3)]	; xmm5=(44 54 64 74 45 55 65 75)
-	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
-
-	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
-	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
-	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
-	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-	movdqa	xmm2,xmm1
-	movdqa	xmm5,xmm7
-	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
-	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
-	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
-	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm0,xmm6
-	paddw	xmm3,xmm1		; xmm3=tmp10
-	paddw	xmm6,xmm7		; xmm6=tmp11
-	psubw	xmm4,xmm1		; xmm4=tmp13
-	psubw	xmm0,xmm7		; xmm0=tmp12
-
-	movdqa	xmm1,xmm3
-	paddw	xmm3,xmm6		; xmm3=tmp10+tmp11
-	psubw	xmm1,xmm6		; xmm1=tmp10-tmp11
-
-	psllw	xmm3,PASS1_BITS		; xmm3=data0
-	psllw	xmm1,PASS1_BITS		; xmm1=data4
-
-	movdqa	XMMWORD [wk(2)], xmm3	; wk(2)=data0
-	movdqa	XMMWORD [wk(3)], xmm1	; wk(3)=data4
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movdqa    xmm7,xmm4		; xmm4=tmp13
-	movdqa    xmm6,xmm4
-	punpcklwd xmm7,xmm0		; xmm0=tmp12
-	punpckhwd xmm6,xmm0
-	movdqa    xmm4,xmm7
-	movdqa    xmm0,xmm6
-	pmaddwd   xmm7,[rel PW_F130_F054]	; xmm7=data2L
-	pmaddwd   xmm6,[rel PW_F130_F054]	; xmm6=data2H
-	pmaddwd   xmm4,[rel PW_F054_MF130]	; xmm4=data6L
-	pmaddwd   xmm0,[rel PW_F054_MF130]	; xmm0=data6H
-	; Round to nearest: add 1 << (DESCALE_P1-1), then shift right by DESCALE_P1.
-	paddd	xmm7,[rel PD_DESCALE_P1]
-	paddd	xmm6,[rel PD_DESCALE_P1]
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-	paddd	xmm4,[rel PD_DESCALE_P1]
-	paddd	xmm0,[rel PD_DESCALE_P1]
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-	; Narrow the dword results back to words (signed saturation).
-	packssdw  xmm7,xmm6		; xmm7=data2
-	packssdw  xmm4,xmm0		; xmm4=data6
-
-	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=data2
-	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=data6
-
-	; -- Odd part
-
-	movdqa	xmm3, XMMWORD [wk(0)]	; xmm3=tmp6
-	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp7
-
-	movdqa	xmm6,xmm2		; xmm2=tmp4
-	movdqa	xmm0,xmm5		; xmm5=tmp5
-	paddw	xmm6,xmm3		; xmm6=z3
-	paddw	xmm0,xmm1		; xmm0=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm7,xmm6
-	movdqa    xmm4,xmm6
-	punpcklwd xmm7,xmm0
-	punpckhwd xmm4,xmm0
-	movdqa    xmm6,xmm7
-	movdqa    xmm0,xmm4
-	pmaddwd   xmm7,[rel PW_MF078_F117]	; xmm7=z3L
-	pmaddwd   xmm4,[rel PW_MF078_F117]	; xmm4=z3H
-	pmaddwd   xmm6,[rel PW_F117_F078]	; xmm6=z4L
-	pmaddwd   xmm0,[rel PW_F117_F078]	; xmm0=z4H
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=z3L
-	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movdqa    xmm7,xmm2
-	movdqa    xmm4,xmm2
-	punpcklwd xmm7,xmm1
-	punpckhwd xmm4,xmm1
-	movdqa    xmm2,xmm7
-	movdqa    xmm1,xmm4
-	pmaddwd   xmm7,[rel PW_MF060_MF089]	; xmm7=tmp4L
-	pmaddwd   xmm4,[rel PW_MF060_MF089]	; xmm4=tmp4H
-	pmaddwd   xmm2,[rel PW_MF089_F060]	; xmm2=tmp7L
-	pmaddwd   xmm1,[rel PW_MF089_F060]	; xmm1=tmp7H
-
-	paddd	xmm7, XMMWORD [wk(0)]	; xmm7=data7L
-	paddd	xmm4, XMMWORD [wk(1)]	; xmm4=data7H
-	paddd	xmm2,xmm6		; xmm2=data1L
-	paddd	xmm1,xmm0		; xmm1=data1H
-
-	paddd	xmm7,[rel PD_DESCALE_P1]
-	paddd	xmm4,[rel PD_DESCALE_P1]
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm4,DESCALE_P1
-	paddd	xmm2,[rel PD_DESCALE_P1]
-	paddd	xmm1,[rel PD_DESCALE_P1]
-	psrad	xmm2,DESCALE_P1
-	psrad	xmm1,DESCALE_P1
-
-	packssdw  xmm7,xmm4		; xmm7=data7
-	packssdw  xmm2,xmm1		; xmm2=data1
-
-	movdqa    xmm4,xmm5
-	movdqa    xmm1,xmm5
-	punpcklwd xmm4,xmm3
-	punpckhwd xmm1,xmm3
-	movdqa    xmm5,xmm4
-	movdqa    xmm3,xmm1
-	pmaddwd   xmm4,[rel PW_MF050_MF256]	; xmm4=tmp5L
-	pmaddwd   xmm1,[rel PW_MF050_MF256]	; xmm1=tmp5H
-	pmaddwd   xmm5,[rel PW_MF256_F050]	; xmm5=tmp6L
-	pmaddwd   xmm3,[rel PW_MF256_F050]	; xmm3=tmp6H
-
-	paddd	xmm4,xmm6		; xmm4=data5L
-	paddd	xmm1,xmm0		; xmm1=data5H
-	paddd	xmm5, XMMWORD [wk(0)]	; xmm5=data3L
-	paddd	xmm3, XMMWORD [wk(1)]	; xmm3=data3H
-
-	paddd	xmm4,[rel PD_DESCALE_P1]
-	paddd	xmm1,[rel PD_DESCALE_P1]
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm1,DESCALE_P1
-	paddd	xmm5,[rel PD_DESCALE_P1]
-	paddd	xmm3,[rel PD_DESCALE_P1]
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm3,DESCALE_P1
-
-	packssdw  xmm4,xmm1		; xmm4=data5
-	packssdw  xmm5,xmm3		; xmm5=data3
-
-	; ---- Pass 2: process columns.
-
-	movdqa	xmm6, XMMWORD [wk(2)]	; xmm6=col0
-	movdqa	xmm0, XMMWORD [wk(4)]	; xmm0=col2
-
-	; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
-	; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-	; Transpose the pass 1 results back to row order before the second 1-D pass.
-	movdqa    xmm1,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm2		; xmm6=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm1,xmm2		; xmm1=(40 41 50 51 60 61 70 71)
-	movdqa    xmm3,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm5		; xmm0=(02 03 12 13 22 23 32 33)
-	punpckhwd xmm3,xmm5		; xmm3=(42 43 52 53 62 63 72 73)
-
-	movdqa	xmm2, XMMWORD [wk(3)]	; xmm2=col4
-	movdqa	xmm5, XMMWORD [wk(5)]	; xmm5=col6
-
-	; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
-	; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=(02 03 12 13 22 23 32 33)
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(42 43 52 53 62 63 72 73)
-
-	movdqa    xmm0,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm4		; xmm2=(04 05 14 15 24 25 34 35)
-	punpckhwd xmm0,xmm4		; xmm0=(44 45 54 55 64 65 74 75)
-	movdqa    xmm3,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm7		; xmm5=(06 07 16 17 26 27 36 37)
-	punpckhwd xmm3,xmm7		; xmm3=(46 47 56 57 66 67 76 77)
-
-	movdqa    xmm4,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(04 05 06 07 14 15 16 17)
-	punpckhdq xmm4,xmm5		; xmm4=(24 25 26 27 34 35 36 37)
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm3		; xmm0=(44 45 46 47 54 55 56 57)
-	punpckhdq xmm7,xmm3		; xmm7=(64 65 66 67 74 75 76 77)
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=(02 03 12 13 22 23 32 33)
-	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53 62 63 72 73)
-	movdqa	XMMWORD [wk(2)], xmm4	; wk(2)=(24 25 26 27 34 35 36 37)
-	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(44 45 46 47 54 55 56 57)
-
-	movdqa    xmm4,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm5		; xmm6=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm4,xmm5		; xmm4=(20 21 22 23 30 31 32 33)
-	movdqa    xmm0,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm3		; xmm1=(40 41 42 43 50 51 52 53)
-	punpckhdq xmm0,xmm3		; xmm0=(60 61 62 63 70 71 72 73)
-
-	movdqa     xmm5,xmm6		; transpose coefficients(phase 3)
-	punpcklqdq xmm6,xmm2		; xmm6=(00 01 02 03 04 05 06 07)=data0
-	punpckhqdq xmm5,xmm2		; xmm5=(10 11 12 13 14 15 16 17)=data1
-	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm7		; xmm0=(60 61 62 63 64 65 66 67)=data6
-	punpckhqdq xmm3,xmm7		; xmm3=(70 71 72 73 74 75 76 77)=data7
-
-	movdqa	xmm2,xmm5
-	movdqa	xmm7,xmm6
-	psubw	xmm5,xmm0		; xmm5=data1-data6=tmp6
-	psubw	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	paddw	xmm2,xmm0		; xmm2=data1+data6=tmp1
-	paddw	xmm7,xmm3		; xmm7=data0+data7=tmp0
-
-	movdqa	xmm0, XMMWORD [wk(2)]	; xmm0=(24 25 26 27 34 35 36 37)
-	movdqa	xmm3, XMMWORD [wk(3)]	; xmm3=(44 45 46 47 54 55 56 57)
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movdqa     xmm5,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm0		; xmm4=(20 21 22 23 24 25 26 27)=data2
-	punpckhqdq xmm5,xmm0		; xmm5=(30 31 32 33 34 35 36 37)=data3
-	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm3		; xmm1=(40 41 42 43 44 45 46 47)=data4
-	punpckhqdq xmm6,xmm3		; xmm6=(50 51 52 53 54 55 56 57)=data5
-
-	movdqa	xmm0,xmm5
-	movdqa	xmm3,xmm4
-	paddw	xmm5,xmm1		; xmm5=data3+data4=tmp3
-	paddw	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	psubw	xmm0,xmm1		; xmm0=data3-data4=tmp4
-	psubw	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm1,xmm7
-	movdqa	xmm6,xmm2
-	paddw	xmm7,xmm5		; xmm7=tmp10
-	paddw	xmm2,xmm4		; xmm2=tmp11
-	psubw	xmm1,xmm5		; xmm1=tmp13
-	psubw	xmm6,xmm4		; xmm6=tmp12
-
-	movdqa	xmm5,xmm7
-	paddw	xmm7,xmm2		; xmm7=tmp10+tmp11
-	psubw	xmm5,xmm2		; xmm5=tmp10-tmp11
-	; Round to nearest: add 1 << (PASS1_BITS-1), then arithmetic-shift right by PASS1_BITS.
-	paddw	xmm7,[rel PW_DESCALE_P2X]
-	paddw	xmm5,[rel PW_DESCALE_P2X]
-	psraw	xmm7,PASS1_BITS		; xmm7=data0
-	psraw	xmm5,PASS1_BITS		; xmm5=data4
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
-	movdqa	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movdqa    xmm4,xmm1		; xmm1=tmp13
-	movdqa    xmm2,xmm1
-	punpcklwd xmm4,xmm6		; xmm6=tmp12
-	punpckhwd xmm2,xmm6
-	movdqa    xmm1,xmm4
-	movdqa    xmm6,xmm2
-	pmaddwd   xmm4,[rel PW_F130_F054]	; xmm4=data2L
-	pmaddwd   xmm2,[rel PW_F130_F054]	; xmm2=data2H
-	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=data6L
-	pmaddwd   xmm6,[rel PW_F054_MF130]	; xmm6=data6H
-	; Round to nearest: add 1 << (DESCALE_P2-1), then shift right by DESCALE_P2.
-	paddd	xmm4,[rel PD_DESCALE_P2]
-	paddd	xmm2,[rel PD_DESCALE_P2]
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm2,DESCALE_P2
-	paddd	xmm1,[rel PD_DESCALE_P2]
-	paddd	xmm6,[rel PD_DESCALE_P2]
-	psrad	xmm1,DESCALE_P2
-	psrad	xmm6,DESCALE_P2
-
-	packssdw  xmm4,xmm2		; xmm4=data2
-	packssdw  xmm1,xmm6		; xmm1=data6
-
-	movdqa	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
-
-	; -- Odd part
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp6
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
-
-	movdqa	xmm2,xmm0		; xmm0=tmp4
-	movdqa	xmm6,xmm3		; xmm3=tmp5
-	paddw	xmm2,xmm7		; xmm2=z3
-	paddw	xmm6,xmm5		; xmm6=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm4,xmm2
-	movdqa    xmm1,xmm2
-	punpcklwd xmm4,xmm6
-	punpckhwd xmm1,xmm6
-	movdqa    xmm2,xmm4
-	movdqa    xmm6,xmm1
-	pmaddwd   xmm4,[rel PW_MF078_F117]	; xmm4=z3L
-	pmaddwd   xmm1,[rel PW_MF078_F117]	; xmm1=z3H
-	pmaddwd   xmm2,[rel PW_F117_F078]	; xmm2=z4L
-	pmaddwd   xmm6,[rel PW_F117_F078]	; xmm6=z4H
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=z3L
-	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movdqa    xmm4,xmm0
-	movdqa    xmm1,xmm0
-	punpcklwd xmm4,xmm5
-	punpckhwd xmm1,xmm5
-	movdqa    xmm0,xmm4
-	movdqa    xmm5,xmm1
-	pmaddwd   xmm4,[rel PW_MF060_MF089]	; xmm4=tmp4L
-	pmaddwd   xmm1,[rel PW_MF060_MF089]	; xmm1=tmp4H
-	pmaddwd   xmm0,[rel PW_MF089_F060]	; xmm0=tmp7L
-	pmaddwd   xmm5,[rel PW_MF089_F060]	; xmm5=tmp7H
-
-	paddd	xmm4, XMMWORD [wk(0)]	; xmm4=data7L
-	paddd	xmm1, XMMWORD [wk(1)]	; xmm1=data7H
-	paddd	xmm0,xmm2		; xmm0=data1L
-	paddd	xmm5,xmm6		; xmm5=data1H
-
-	paddd	xmm4,[rel PD_DESCALE_P2]
-	paddd	xmm1,[rel PD_DESCALE_P2]
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm0,[rel PD_DESCALE_P2]
-	paddd	xmm5,[rel PD_DESCALE_P2]
-	psrad	xmm0,DESCALE_P2
-	psrad	xmm5,DESCALE_P2
-
-	packssdw  xmm4,xmm1		; xmm4=data7
-	packssdw  xmm0,xmm5		; xmm0=data1
-
-	movdqa	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
-
-	movdqa    xmm1,xmm3
-	movdqa    xmm5,xmm3
-	punpcklwd xmm1,xmm7
-	punpckhwd xmm5,xmm7
-	movdqa    xmm3,xmm1
-	movdqa    xmm7,xmm5
-	pmaddwd   xmm1,[rel PW_MF050_MF256]	; xmm1=tmp5L
-	pmaddwd   xmm5,[rel PW_MF050_MF256]	; xmm5=tmp5H
-	pmaddwd   xmm3,[rel PW_MF256_F050]	; xmm3=tmp6L
-	pmaddwd   xmm7,[rel PW_MF256_F050]	; xmm7=tmp6H
-
-	paddd	xmm1,xmm2		; xmm1=data5L
-	paddd	xmm5,xmm6		; xmm5=data5H
-	paddd	xmm3, XMMWORD [wk(0)]	; xmm3=data3L
-	paddd	xmm7, XMMWORD [wk(1)]	; xmm7=data3H
-
-	paddd	xmm1,[rel PD_DESCALE_P2]
-	paddd	xmm5,[rel PD_DESCALE_P2]
-	psrad	xmm1,DESCALE_P2
-	psrad	xmm5,DESCALE_P2
-	paddd	xmm3,[rel PD_DESCALE_P2]
-	paddd	xmm7,[rel PD_DESCALE_P2]
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm1,xmm5		; xmm1=data5
-	packssdw  xmm3,xmm7		; xmm3=data3
-
-	movdqa	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- rsp saved by the prologue (points at the pushed rbp)
-	pop	rbp		; restore the caller's rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfss2int.asm b/simd/jfss2int.asm
deleted file mode 100644
index 1f73163..0000000
--- a/simd/jfss2int.asm
+++ /dev/null
@@ -1,634 +0,0 @@
-;
-; jfss2int.asm - accurate integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-; Right-shift counts used when descaling results in pass 1 and pass 2.
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
-; Fixed-point multipliers: FIX(x) is x scaled by 2^CONST_BITS and rounded.
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants, so each value is given scaled by 2^30 and DESCALE() rounds it down to CONST_BITS fractional bits.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-; Constant vectors for the islow FDCT; every entry below fills one 16-byte xmmword.
-	alignz	16
-	global	EXTN(jconst_fdct_islow_sse2) PRIVATE
-
-EXTN(jconst_fdct_islow_sse2):
-; PW_* are word multiplier pairs repeated 4 times (names give approximate values: Fnnn ~ +n.nn, MFnnn ~ -n.nn); P?_DESCALE_* are rounding terms, 1 << (shift-1).
-PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X	times 8 dw  1 << (PASS1_BITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM * data)
-;
-
-%define data(b)		(b)+8		; DCTELEM * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		6
-
-	align	16
-	global	EXTN(jsimd_fdct_islow_sse2) PRIVATE
-
-EXTN(jsimd_fdct_islow_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
-	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
-	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
-
-	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
-	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
-	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=(42 52 62 72 43 53 63 73)
-	movdqa	XMMWORD [wk(3)], xmm2	; wk(3)=(44 54 64 74 45 55 65 75)
-
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
-	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
-	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
-	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
-	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-	movdqa	xmm6,xmm1
-	movdqa	xmm3,xmm0
-	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
-	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
-	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
-	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa	xmm5, XMMWORD [wk(3)]	; xmm5=(44 54 64 74 45 55 65 75)
-	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
-
-	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
-	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
-	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
-	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-	movdqa	xmm2,xmm1
-	movdqa	xmm5,xmm7
-	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
-	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
-	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
-	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm0,xmm6
-	paddw	xmm3,xmm1		; xmm3=tmp10
-	paddw	xmm6,xmm7		; xmm6=tmp11
-	psubw	xmm4,xmm1		; xmm4=tmp13
-	psubw	xmm0,xmm7		; xmm0=tmp12
-
-	movdqa	xmm1,xmm3
-	paddw	xmm3,xmm6		; xmm3=tmp10+tmp11
-	psubw	xmm1,xmm6		; xmm1=tmp10-tmp11
-
-	psllw	xmm3,PASS1_BITS		; xmm3=data0
-	psllw	xmm1,PASS1_BITS		; xmm1=data4
-
-	movdqa	XMMWORD [wk(2)], xmm3	; wk(2)=data0
-	movdqa	XMMWORD [wk(3)], xmm1	; wk(3)=data4
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movdqa    xmm7,xmm4		; xmm4=tmp13
-	movdqa    xmm6,xmm4
-	punpcklwd xmm7,xmm0		; xmm0=tmp12
-	punpckhwd xmm6,xmm0
-	movdqa    xmm4,xmm7
-	movdqa    xmm0,xmm6
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_F130_F054)]	; xmm7=data2L
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=data2H
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F054_MF130)]	; xmm4=data6L
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_F054_MF130)]	; xmm0=data6H
-
-	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm7,xmm6		; xmm7=data2
-	packssdw  xmm4,xmm0		; xmm4=data6
-
-	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=data2
-	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=data6
-
-	; -- Odd part
-
-	movdqa	xmm3, XMMWORD [wk(0)]	; xmm3=tmp6
-	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp7
-
-	movdqa	xmm6,xmm2		; xmm2=tmp4
-	movdqa	xmm0,xmm5		; xmm5=tmp5
-	paddw	xmm6,xmm3		; xmm6=z3
-	paddw	xmm0,xmm1		; xmm0=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm7,xmm6
-	movdqa    xmm4,xmm6
-	punpcklwd xmm7,xmm0
-	punpckhwd xmm4,xmm0
-	movdqa    xmm6,xmm7
-	movdqa    xmm0,xmm4
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3L
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3H
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4L
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_F117_F078)]	; xmm0=z4H
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=z3L
-	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movdqa    xmm7,xmm2
-	movdqa    xmm4,xmm2
-	punpcklwd xmm7,xmm1
-	punpckhwd xmm4,xmm1
-	movdqa    xmm2,xmm7
-	movdqa    xmm1,xmm4
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp4L
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4H
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF089_F060)]	; xmm2=tmp7L
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp7H
-
-	paddd	xmm7, XMMWORD [wk(0)]	; xmm7=data7L
-	paddd	xmm4, XMMWORD [wk(1)]	; xmm4=data7H
-	paddd	xmm2,xmm6		; xmm2=data1L
-	paddd	xmm1,xmm0		; xmm1=data1H
-
-	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm4,DESCALE_P1
-	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm2,DESCALE_P1
-	psrad	xmm1,DESCALE_P1
-
-	packssdw  xmm7,xmm4		; xmm7=data7
-	packssdw  xmm2,xmm1		; xmm2=data1
-
-	movdqa    xmm4,xmm5
-	movdqa    xmm1,xmm5
-	punpcklwd xmm4,xmm3
-	punpckhwd xmm1,xmm3
-	movdqa    xmm5,xmm4
-	movdqa    xmm3,xmm1
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm4=tmp5L
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5H
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF256_F050)]	; xmm5=tmp6L
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6H
-
-	paddd	xmm4,xmm6		; xmm4=data5L
-	paddd	xmm1,xmm0		; xmm1=data5H
-	paddd	xmm5, XMMWORD [wk(0)]	; xmm5=data3L
-	paddd	xmm3, XMMWORD [wk(1)]	; xmm3=data3H
-
-	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm1,DESCALE_P1
-	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm3,DESCALE_P1
-
-	packssdw  xmm4,xmm1		; xmm4=data5
-	packssdw  xmm5,xmm3		; xmm5=data3
-
-	; ---- Pass 2: process columns.
-
-;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-
-	movdqa	xmm6, XMMWORD [wk(2)]	; xmm6=col0
-	movdqa	xmm0, XMMWORD [wk(4)]	; xmm0=col2
-
-	; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
-	; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
-	movdqa    xmm1,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm2		; xmm6=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm1,xmm2		; xmm1=(40 41 50 51 60 61 70 71)
-	movdqa    xmm3,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm5		; xmm0=(02 03 12 13 22 23 32 33)
-	punpckhwd xmm3,xmm5		; xmm3=(42 43 52 53 62 63 72 73)
-
-	movdqa	xmm2, XMMWORD [wk(3)]	; xmm2=col4
-	movdqa	xmm5, XMMWORD [wk(5)]	; xmm5=col6
-
-	; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
-	; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=(02 03 12 13 22 23 32 33)
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(42 43 52 53 62 63 72 73)
-
-	movdqa    xmm0,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm4		; xmm2=(04 05 14 15 24 25 34 35)
-	punpckhwd xmm0,xmm4		; xmm0=(44 45 54 55 64 65 74 75)
-	movdqa    xmm3,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm7		; xmm5=(06 07 16 17 26 27 36 37)
-	punpckhwd xmm3,xmm7		; xmm3=(46 47 56 57 66 67 76 77)
-
-	movdqa    xmm4,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(04 05 06 07 14 15 16 17)
-	punpckhdq xmm4,xmm5		; xmm4=(24 25 26 27 34 35 36 37)
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm3		; xmm0=(44 45 46 47 54 55 56 57)
-	punpckhdq xmm7,xmm3		; xmm7=(64 65 66 67 74 75 76 77)
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=(02 03 12 13 22 23 32 33)
-	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53 62 63 72 73)
-	movdqa	XMMWORD [wk(2)], xmm4	; wk(2)=(24 25 26 27 34 35 36 37)
-	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(44 45 46 47 54 55 56 57)
-
-	movdqa    xmm4,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm5		; xmm6=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm4,xmm5		; xmm4=(20 21 22 23 30 31 32 33)
-	movdqa    xmm0,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm3		; xmm1=(40 41 42 43 50 51 52 53)
-	punpckhdq xmm0,xmm3		; xmm0=(60 61 62 63 70 71 72 73)
-
-	movdqa     xmm5,xmm6		; transpose coefficients(phase 3)
-	punpcklqdq xmm6,xmm2		; xmm6=(00 01 02 03 04 05 06 07)=data0
-	punpckhqdq xmm5,xmm2		; xmm5=(10 11 12 13 14 15 16 17)=data1
-	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm7		; xmm0=(60 61 62 63 64 65 66 67)=data6
-	punpckhqdq xmm3,xmm7		; xmm3=(70 71 72 73 74 75 76 77)=data7
-
-	movdqa	xmm2,xmm5
-	movdqa	xmm7,xmm6
-	psubw	xmm5,xmm0		; xmm5=data1-data6=tmp6
-	psubw	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	paddw	xmm2,xmm0		; xmm2=data1+data6=tmp1
-	paddw	xmm7,xmm3		; xmm7=data0+data7=tmp0
-
-	movdqa	xmm0, XMMWORD [wk(2)]	; xmm0=(24 25 26 27 34 35 36 37)
-	movdqa	xmm3, XMMWORD [wk(3)]	; xmm3=(44 45 46 47 54 55 56 57)
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movdqa     xmm5,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm0		; xmm4=(20 21 22 23 24 25 26 27)=data2
-	punpckhqdq xmm5,xmm0		; xmm5=(30 31 32 33 34 35 36 37)=data3
-	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm3		; xmm1=(40 41 42 43 44 45 46 47)=data4
-	punpckhqdq xmm6,xmm3		; xmm6=(50 51 52 53 54 55 56 57)=data5
-
-	movdqa	xmm0,xmm5
-	movdqa	xmm3,xmm4
-	paddw	xmm5,xmm1		; xmm5=data3+data4=tmp3
-	paddw	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	psubw	xmm0,xmm1		; xmm0=data3-data4=tmp4
-	psubw	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm1,xmm7
-	movdqa	xmm6,xmm2
-	paddw	xmm7,xmm5		; xmm7=tmp10
-	paddw	xmm2,xmm4		; xmm2=tmp11
-	psubw	xmm1,xmm5		; xmm1=tmp13
-	psubw	xmm6,xmm4		; xmm6=tmp12
-
-	movdqa	xmm5,xmm7
-	paddw	xmm7,xmm2		; xmm7=tmp10+tmp11
-	psubw	xmm5,xmm2		; xmm5=tmp10-tmp11
-
-	paddw	xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
-	paddw	xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
-	psraw	xmm7,PASS1_BITS		; xmm7=data0
-	psraw	xmm5,PASS1_BITS		; xmm5=data4
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
-	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movdqa    xmm4,xmm1		; xmm1=tmp13
-	movdqa    xmm2,xmm1
-	punpcklwd xmm4,xmm6		; xmm6=tmp12
-	punpckhwd xmm2,xmm6
-	movdqa    xmm1,xmm4
-	movdqa    xmm6,xmm2
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=data2L
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_F130_F054)]	; xmm2=data2H
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=data6L
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_F054_MF130)]	; xmm6=data6H
-
-	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm2,DESCALE_P2
-	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm1,DESCALE_P2
-	psrad	xmm6,DESCALE_P2
-
-	packssdw  xmm4,xmm2		; xmm4=data2
-	packssdw  xmm1,xmm6		; xmm1=data6
-
-	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
-
-	; -- Odd part
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp6
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
-
-	movdqa	xmm2,xmm0		; xmm0=tmp4
-	movdqa	xmm6,xmm3		; xmm3=tmp5
-	paddw	xmm2,xmm7		; xmm2=z3
-	paddw	xmm6,xmm5		; xmm6=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm4,xmm2
-	movdqa    xmm1,xmm2
-	punpcklwd xmm4,xmm6
-	punpckhwd xmm1,xmm6
-	movdqa    xmm2,xmm4
-	movdqa    xmm6,xmm1
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3L
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF078_F117)]	; xmm1=z3H
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_F117_F078)]	; xmm2=z4L
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4H
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=z3L
-	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movdqa    xmm4,xmm0
-	movdqa    xmm1,xmm0
-	punpcklwd xmm4,xmm5
-	punpckhwd xmm1,xmm5
-	movdqa    xmm0,xmm4
-	movdqa    xmm5,xmm1
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4L
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm1=tmp4H
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF089_F060)]	; xmm0=tmp7L
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF089_F060)]	; xmm5=tmp7H
-
-	paddd	xmm4, XMMWORD [wk(0)]	; xmm4=data7L
-	paddd	xmm1, XMMWORD [wk(1)]	; xmm1=data7H
-	paddd	xmm0,xmm2		; xmm0=data1L
-	paddd	xmm5,xmm6		; xmm5=data1H
-
-	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm0,DESCALE_P2
-	psrad	xmm5,DESCALE_P2
-
-	packssdw  xmm4,xmm1		; xmm4=data7
-	packssdw  xmm0,xmm5		; xmm0=data1
-
-	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
-
-	movdqa    xmm1,xmm3
-	movdqa    xmm5,xmm3
-	punpcklwd xmm1,xmm7
-	punpckhwd xmm5,xmm7
-	movdqa    xmm3,xmm1
-	movdqa    xmm7,xmm5
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5L
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm5=tmp5H
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6L
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF256_F050)]	; xmm7=tmp6H
-
-	paddd	xmm1,xmm2		; xmm1=data5L
-	paddd	xmm5,xmm6		; xmm5=data5H
-	paddd	xmm3, XMMWORD [wk(0)]	; xmm3=data3L
-	paddd	xmm7, XMMWORD [wk(1)]	; xmm7=data3H
-
-	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm1,DESCALE_P2
-	psrad	xmm5,DESCALE_P2
-	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm1,xmm5		; xmm1=data5
-	packssdw  xmm3,xmm7		; xmm3=data3
-
-	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfsseflt-64.asm b/simd/jfsseflt-64.asm
deleted file mode 100644
index b5de0c4..0000000
--- a/simd/jfsseflt-64.asm
+++ /dev/null
@@ -1,358 +0,0 @@
-;
-; jfsseflt-64.asm - floating-point FDCT (64-bit SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-	shufps	%1,%2,0x44
-%endmacro
-
-%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-	shufps	%1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fdct_float_sse) PRIVATE
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382	times 4 dd  0.382683432365089771728460
-PD_0_707	times 4 dd  0.707106781186547524400844
-PD_0_541	times 4 dd  0.541196100146196984399723
-PD_1_306	times 4 dd  1.306562964876376527856643
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT * data)
-;
-
-; r10 = FAST_FLOAT * data
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_float_sse) PRIVATE
-
-EXTN(jsimd_fdct_float_sse):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process rows.
-
-	mov	rdx, r10	; (FAST_FLOAT *)
-	mov	rcx, DCTSIZE/4
-.rowloop:
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
-
-	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
-	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
-	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
-	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
-	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
-	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
-	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
-	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
-
-	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
-	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
-	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
-	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
-	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
-
-	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
-	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
-	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
-	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
-	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
-	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
-
-	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
-	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
-	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
-	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
-	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
-
-	movaps	xmm0,xmm7
-	movaps	xmm5,xmm6
-	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
-	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
-	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
-
-	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
-	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
-	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
-	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
-	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
-	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
-
-	movaps	xmm2,xmm7
-	movaps	xmm3,xmm4
-	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
-	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
-	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movaps	xmm1,xmm5
-	movaps	xmm6,xmm0
-	subps	xmm5,xmm7		; xmm5=tmp13
-	subps	xmm0,xmm4		; xmm0=tmp12
-	addps	xmm1,xmm7		; xmm1=tmp10
-	addps	xmm6,xmm4		; xmm6=tmp11
-
-	addps	xmm0,xmm5
-	mulps	xmm0,[rel PD_0_707] ; xmm0=z1
-
-	movaps	xmm7,xmm1
-	movaps	xmm4,xmm5
-	subps	xmm1,xmm6		; xmm1=data4
-	subps	xmm5,xmm0		; xmm5=data6
-	addps	xmm7,xmm6		; xmm7=data0
-	addps	xmm4,xmm0		; xmm4=data2
-
-	movaps	XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-	; -- Odd part
-
-	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
-	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
-
-	addps	xmm2,xmm3		; xmm2=tmp10
-	addps	xmm3,xmm6		; xmm3=tmp11
-	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
-
-	mulps	xmm3,[rel PD_0_707] ; xmm3=z3
-
-	movaps	xmm1,xmm2		; xmm1=tmp10
-	subps	xmm2,xmm6
-	mulps	xmm2,[rel PD_0_382] ; xmm2=z5
-	mulps	xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-	mulps	xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-	addps	xmm1,xmm2		; xmm1=z2
-	addps	xmm6,xmm2		; xmm6=z4
-
-	movaps	xmm5,xmm0
-	subps	xmm0,xmm3		; xmm0=z13
-	addps	xmm5,xmm3		; xmm5=z11
-
-	movaps	xmm7,xmm0
-	movaps	xmm4,xmm5
-	subps	xmm0,xmm1		; xmm0=data3
-	subps	xmm5,xmm6		; xmm5=data7
-	addps	xmm7,xmm1		; xmm7=data5
-	addps	xmm4,xmm6		; xmm4=data1
-
-	movaps	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-	add	rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	rcx
-	jnz	near .rowloop
-
-	; ---- Pass 2: process columns.
-
-	mov	rdx, r10	; (FAST_FLOAT *)
-	mov	rcx, DCTSIZE/4
-.columnloop:
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
-
-	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
-	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
-	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
-	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
-	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
-	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
-	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
-	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
-
-	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
-
-	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
-	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
-	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
-	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
-
-	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
-	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
-	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
-	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
-	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
-	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
-
-	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
-	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
-	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
-	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
-	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
-
-	movaps	xmm0,xmm7
-	movaps	xmm5,xmm6
-	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
-	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
-	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
-
-	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
-	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
-	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
-	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
-	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
-	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
-
-	movaps	xmm2,xmm7
-	movaps	xmm3,xmm4
-	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
-	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
-	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movaps	xmm1,xmm5
-	movaps	xmm6,xmm0
-	subps	xmm5,xmm7		; xmm5=tmp13
-	subps	xmm0,xmm4		; xmm0=tmp12
-	addps	xmm1,xmm7		; xmm1=tmp10
-	addps	xmm6,xmm4		; xmm6=tmp11
-
-	addps	xmm0,xmm5
-	mulps	xmm0,[rel PD_0_707] ; xmm0=z1
-
-	movaps	xmm7,xmm1
-	movaps	xmm4,xmm5
-	subps	xmm1,xmm6		; xmm1=data4
-	subps	xmm5,xmm0		; xmm5=data6
-	addps	xmm7,xmm6		; xmm7=data0
-	addps	xmm4,xmm0		; xmm4=data2
-
-	movaps	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-	; -- Odd part
-
-	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
-	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
-
-	addps	xmm2,xmm3		; xmm2=tmp10
-	addps	xmm3,xmm6		; xmm3=tmp11
-	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
-
-	mulps	xmm3,[rel PD_0_707] ; xmm3=z3
-
-	movaps	xmm1,xmm2		; xmm1=tmp10
-	subps	xmm2,xmm6
-	mulps	xmm2,[rel PD_0_382] ; xmm2=z5
-	mulps	xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-	mulps	xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-	addps	xmm1,xmm2		; xmm1=z2
-	addps	xmm6,xmm2		; xmm6=z4
-
-	movaps	xmm5,xmm0
-	subps	xmm0,xmm3		; xmm0=z13
-	addps	xmm5,xmm3		; xmm5=z11
-
-	movaps	xmm7,xmm0
-	movaps	xmm4,xmm5
-	subps	xmm0,xmm1		; xmm0=data3
-	subps	xmm5,xmm6		; xmm5=data7
-	addps	xmm7,xmm1		; xmm7=data5
-	addps	xmm4,xmm6		; xmm4=data1
-
-	movaps	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-	add	rdx, byte 4*SIZEOF_FAST_FLOAT
-	dec	rcx
-	jnz	near .columnloop
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfsseflt.asm b/simd/jfsseflt.asm
deleted file mode 100644
index dc52c32..0000000
--- a/simd/jfsseflt.asm
+++ /dev/null
@@ -1,370 +0,0 @@
-;
-; jfsseflt.asm - floating-point FDCT (SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-	shufps	%1,%2,0x44
-%endmacro
-
-%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-	shufps	%1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fdct_float_sse) PRIVATE
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382	times 4 dd  0.382683432365089771728460
-PD_0_707	times 4 dd  0.707106781186547524400844
-PD_0_541	times 4 dd  0.541196100146196984399723
-PD_1_306	times 4 dd  1.306562964876376527856643
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT * data)
-;
-
-%define data(b)		(b)+8		; FAST_FLOAT * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_float_sse) PRIVATE
-
-EXTN(jsimd_fdct_float_sse):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.rowloop:
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
-
-	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
-	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
-	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
-	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
-	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
-	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
-	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
-	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
-
-	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
-	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
-	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
-	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
-
-	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
-	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
-	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
-	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
-	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
-	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
-
-	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
-	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
-	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
-	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
-	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
-
-	movaps	xmm0,xmm7
-	movaps	xmm5,xmm6
-	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
-	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
-	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
-
-	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
-	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
-	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
-	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
-	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
-	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
-
-	movaps	xmm2,xmm7
-	movaps	xmm3,xmm4
-	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
-	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
-	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movaps	xmm1,xmm5
-	movaps	xmm6,xmm0
-	subps	xmm5,xmm7		; xmm5=tmp13
-	subps	xmm0,xmm4		; xmm0=tmp12
-	addps	xmm1,xmm7		; xmm1=tmp10
-	addps	xmm6,xmm4		; xmm6=tmp11
-
-	addps	xmm0,xmm5
-	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
-	movaps	xmm7,xmm1
-	movaps	xmm4,xmm5
-	subps	xmm1,xmm6		; xmm1=data4
-	subps	xmm5,xmm0		; xmm5=data6
-	addps	xmm7,xmm6		; xmm7=data0
-	addps	xmm4,xmm0		; xmm4=data2
-
-	movaps	XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-	; -- Odd part
-
-	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
-	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
-
-	addps	xmm2,xmm3		; xmm2=tmp10
-	addps	xmm3,xmm6		; xmm3=tmp11
-	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
-
-	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
-	movaps	xmm1,xmm2		; xmm1=tmp10
-	subps	xmm2,xmm6
-	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
-	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-	addps	xmm1,xmm2		; xmm1=z2
-	addps	xmm6,xmm2		; xmm6=z4
-
-	movaps	xmm5,xmm0
-	subps	xmm0,xmm3		; xmm0=z13
-	addps	xmm5,xmm3		; xmm5=z11
-
-	movaps	xmm7,xmm0
-	movaps	xmm4,xmm5
-	subps	xmm0,xmm1		; xmm0=data3
-	subps	xmm5,xmm6		; xmm5=data7
-	addps	xmm7,xmm1		; xmm7=data5
-	addps	xmm4,xmm6		; xmm4=data1
-
-	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-	add	edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .rowloop
-
-	; ---- Pass 2: process columns.
-
-	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.columnloop:
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
-	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
-	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
-	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
-	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
-	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
-	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
-	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
-	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
-
-	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
-	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
-	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
-	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
-	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
-
-	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
-	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
-	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
-	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
-	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
-	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
-
-	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
-	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
-	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
-	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
-	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
-
-	movaps	xmm0,xmm7
-	movaps	xmm5,xmm6
-	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
-	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
-	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
-
-	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
-	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
-	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
-	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
-	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
-	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
-
-	movaps	xmm2,xmm7
-	movaps	xmm3,xmm4
-	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
-	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
-	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movaps	xmm1,xmm5
-	movaps	xmm6,xmm0
-	subps	xmm5,xmm7		; xmm5=tmp13
-	subps	xmm0,xmm4		; xmm0=tmp12
-	addps	xmm1,xmm7		; xmm1=tmp10
-	addps	xmm6,xmm4		; xmm6=tmp11
-
-	addps	xmm0,xmm5
-	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
-	movaps	xmm7,xmm1
-	movaps	xmm4,xmm5
-	subps	xmm1,xmm6		; xmm1=data4
-	subps	xmm5,xmm0		; xmm5=data6
-	addps	xmm7,xmm6		; xmm7=data0
-	addps	xmm4,xmm0		; xmm4=data2
-
-	movaps	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-	; -- Odd part
-
-	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
-	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
-
-	addps	xmm2,xmm3		; xmm2=tmp10
-	addps	xmm3,xmm6		; xmm3=tmp11
-	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
-
-	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
-	movaps	xmm1,xmm2		; xmm1=tmp10
-	subps	xmm2,xmm6
-	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
-	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-	addps	xmm1,xmm2		; xmm1=z2
-	addps	xmm6,xmm2		; xmm6=z4
-
-	movaps	xmm5,xmm0
-	subps	xmm0,xmm3		; xmm0=z13
-	addps	xmm5,xmm3		; xmm5=z11
-
-	movaps	xmm7,xmm0
-	movaps	xmm4,xmm5
-	subps	xmm0,xmm1		; xmm0=data3
-	subps	xmm5,xmm6		; xmm5=data7
-	addps	xmm7,xmm1		; xmm7=data5
-	addps	xmm4,xmm6		; xmm4=data1
-
-	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-	add	edx, byte 4*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .columnloop
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/ji3dnflt.asm b/simd/ji3dnflt.asm
deleted file mode 100644
index 30ff49d..0000000
--- a/simd/ji3dnflt.asm
+++ /dev/null
@@ -1,452 +0,0 @@
-;
-; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_float_3dnow) PRIVATE
-
-EXTN(jconst_idct_float_3dnow):
-
-PD_1_414	times 2 dd  1.414213562373095048801689
-PD_1_847	times 2 dd  1.847759065022573512256366
-PD_1_082	times 2 dd  1.082392200292393968799446
-PD_2_613	times 2 dd  2.613125929752753055713286
-PD_RNDINT_MAGIC	times 2 dd  100663296.0	; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_3dnow (void * dct_table, JCOEFPTR coef_block,
-;                         JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-					; FAST_FLOAT workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_float_3dnow) PRIVATE
-
-EXTN(jsimd_idct_float_3dnow):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; FAST_FLOAT * wsptr
-	mov	ecx, DCTSIZE/2				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	pushpic	ebx		; save GOT address
-	mov	ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	mov	eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	or	ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	or	ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	or	eax,ebx
-	poppic	ebx		; restore GOT address
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd mm0,mm0
-	psrad     mm0,(DWORD_BIT-WORD_BIT)
-	pi2fd     mm0,mm0
-
-	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movq      mm1,mm0
-	punpckldq mm0,mm0
-	punpckhdq mm1,mm1
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
-	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
-	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movd      mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movd      mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movd      mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd mm0,mm0
-	punpcklwd mm1,mm1
-	psrad     mm0,(DWORD_BIT-WORD_BIT)
-	psrad     mm1,(DWORD_BIT-WORD_BIT)
-	pi2fd     mm0,mm0
-	pi2fd     mm1,mm1
-
-	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	pfmul     mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	punpcklwd mm2,mm2
-	punpcklwd mm3,mm3
-	psrad     mm2,(DWORD_BIT-WORD_BIT)
-	psrad     mm3,(DWORD_BIT-WORD_BIT)
-	pi2fd     mm2,mm2
-	pi2fd     mm3,mm3
-
-	pfmul     mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	pfmul     mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movq	mm4,mm0
-	movq	mm5,mm1
-	pfsub	mm0,mm2			; mm0=tmp11
-	pfsub	mm1,mm3
-	pfadd	mm4,mm2			; mm4=tmp10
-	pfadd	mm5,mm3			; mm5=tmp13
-
-	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]
-	pfsub	mm1,mm5			; mm1=tmp12
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	pfsub	mm4,mm5			; mm4=tmp3
-	pfsub	mm0,mm1			; mm0=tmp2
-	pfadd	mm6,mm5			; mm6=tmp0
-	pfadd	mm7,mm1			; mm7=tmp1
-
-	movq	MMWORD [wk(1)], mm4	; tmp3
-	movq	MMWORD [wk(0)], mm0	; tmp2
-
-	; -- Odd part
-
-	movd      mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movd      mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movd      mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movd      mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd mm2,mm2
-	punpcklwd mm3,mm3
-	psrad     mm2,(DWORD_BIT-WORD_BIT)
-	psrad     mm3,(DWORD_BIT-WORD_BIT)
-	pi2fd     mm2,mm2
-	pi2fd     mm3,mm3
-
-	pfmul     mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	pfmul     mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	punpcklwd mm5,mm5
-	punpcklwd mm1,mm1
-	psrad     mm5,(DWORD_BIT-WORD_BIT)
-	psrad     mm1,(DWORD_BIT-WORD_BIT)
-	pi2fd     mm5,mm5
-	pi2fd     mm1,mm1
-
-	pfmul     mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	pfmul     mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movq	mm4,mm2
-	movq	mm0,mm5
-	pfadd	mm2,mm1			; mm2=z11
-	pfadd	mm5,mm3			; mm5=z13
-	pfsub	mm4,mm1			; mm4=z12
-	pfsub	mm0,mm3			; mm0=z10
-
-	movq	mm1,mm2
-	pfsub	mm2,mm5
-	pfadd	mm1,mm5			; mm1=tmp7
-
-	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
-
-	movq	mm3,mm0
-	pfadd	mm0,mm4
-	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
-	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
-	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
-	pfsubr	mm3,mm0			; mm3=tmp12
-	pfsub	mm4,mm0			; mm4=tmp10
-
-	; -- Final output stage
-
-	pfsub	mm3,mm1			; mm3=tmp6
-	movq	mm5,mm6
-	movq	mm0,mm7
-	pfadd	mm6,mm1			; mm6=data0=(00 01)
-	pfadd	mm7,mm3			; mm7=data1=(10 11)
-	pfsub	mm5,mm1			; mm5=data7=(70 71)
-	pfsub	mm0,mm3			; mm0=data6=(60 61)
-	pfsub	mm2,mm3			; mm2=tmp5
-
-	movq      mm1,mm6		; transpose coefficients
-	punpckldq mm6,mm7		; mm6=(00 10)
-	punpckhdq mm1,mm7		; mm1=(01 11)
-	movq      mm3,mm0		; transpose coefficients
-	punpckldq mm0,mm5		; mm0=(60 70)
-	punpckhdq mm3,mm5		; mm3=(61 71)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
-	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
-	movq	mm5, MMWORD [wk(1)]	; mm5=tmp3
-
-	pfadd	mm4,mm2			; mm4=tmp4
-	movq	mm6,mm7
-	movq	mm1,mm5
-	pfadd	mm7,mm2			; mm7=data2=(20 21)
-	pfadd	mm5,mm4			; mm5=data4=(40 41)
-	pfsub	mm6,mm2			; mm6=data5=(50 51)
-	pfsub	mm1,mm4			; mm1=data3=(30 31)
-
-	movq      mm0,mm7		; transpose coefficients
-	punpckldq mm7,mm1		; mm7=(20 30)
-	punpckhdq mm0,mm1		; mm0=(21 31)
-	movq      mm3,mm5		; transpose coefficients
-	punpckldq mm5,mm6		; mm5=(40 50)
-	punpckhdq mm3,mm6		; mm3=(41 51)
-
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
-	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
-
-.nextcolumn:
-	add	esi, byte 2*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 2*SIZEOF_FLOAT_MULT_TYPE	; quantptr
-	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; -- Prefetch the next coefficient block
-
-	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	lea	esi, [workspace]			; FAST_FLOAT * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-	mov	ecx, DCTSIZE/2				; ctr
-	alignx	16,7
-.rowloop:
-
-	; -- Even part
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movq	mm4,mm0
-	movq	mm5,mm1
-	pfsub	mm0,mm2			; mm0=tmp11
-	pfsub	mm1,mm3
-	pfadd	mm4,mm2			; mm4=tmp10
-	pfadd	mm5,mm3			; mm5=tmp13
-
-	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]
-	pfsub	mm1,mm5			; mm1=tmp12
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	pfsub	mm4,mm5			; mm4=tmp3
-	pfsub	mm0,mm1			; mm0=tmp2
-	pfadd	mm6,mm5			; mm6=tmp0
-	pfadd	mm7,mm1			; mm7=tmp1
-
-	movq	MMWORD [wk(1)], mm4	; tmp3
-	movq	MMWORD [wk(0)], mm0	; tmp2
-
-	; -- Odd part
-
-	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movq	mm4,mm2
-	movq	mm0,mm5
-	pfadd	mm2,mm1			; mm2=z11
-	pfadd	mm5,mm3			; mm5=z13
-	pfsub	mm4,mm1			; mm4=z12
-	pfsub	mm0,mm3			; mm0=z10
-
-	movq	mm1,mm2
-	pfsub	mm2,mm5
-	pfadd	mm1,mm5			; mm1=tmp7
-
-	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
-
-	movq	mm3,mm0
-	pfadd	mm0,mm4
-	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
-	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
-	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
-	pfsubr	mm3,mm0			; mm3=tmp12
-	pfsub	mm4,mm0			; mm4=tmp10
-
-	; -- Final output stage
-
-	pfsub	mm3,mm1			; mm3=tmp6
-	movq	mm5,mm6
-	movq	mm0,mm7
-	pfadd	mm6,mm1			; mm6=data0=(00 10)
-	pfadd	mm7,mm3			; mm7=data1=(01 11)
-	pfsub	mm5,mm1			; mm5=data7=(07 17)
-	pfsub	mm0,mm3			; mm0=data6=(06 16)
-	pfsub	mm2,mm3			; mm2=tmp5
-
-	movq	mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm1=[PD_RNDINT_MAGIC]
-	pcmpeqd	mm3,mm3
-	psrld	mm3,WORD_BIT		; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
-
-	pfadd	mm6,mm1			; mm6=roundint(data0/8)=(00 ** 10 **)
-	pfadd	mm7,mm1			; mm7=roundint(data1/8)=(01 ** 11 **)
-	pfadd	mm0,mm1			; mm0=roundint(data6/8)=(06 ** 16 **)
-	pfadd	mm5,mm1			; mm5=roundint(data7/8)=(07 ** 17 **)
-
-	pand	mm6,mm3			; mm6=(00 -- 10 --)
-	pslld	mm7,WORD_BIT		; mm7=(-- 01 -- 11)
-	pand	mm0,mm3			; mm0=(06 -- 16 --)
-	pslld	mm5,WORD_BIT		; mm5=(-- 07 -- 17)
-	por	mm6,mm7			; mm6=(00 01 10 11)
-	por	mm0,mm5			; mm0=(06 07 16 17)
-
-	movq	mm1, MMWORD [wk(0)]	; mm1=tmp2
-	movq	mm3, MMWORD [wk(1)]	; mm3=tmp3
-
-	pfadd	mm4,mm2			; mm4=tmp4
-	movq	mm7,mm1
-	movq	mm5,mm3
-	pfadd	mm1,mm2			; mm1=data2=(02 12)
-	pfadd	mm3,mm4			; mm3=data4=(04 14)
-	pfsub	mm7,mm2			; mm7=data5=(05 15)
-	pfsub	mm5,mm4			; mm5=data3=(03 13)
-
-	movq	mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm2=[PD_RNDINT_MAGIC]
-	pcmpeqd	mm4,mm4
-	psrld	mm4,WORD_BIT		; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
-
-	pfadd	mm3,mm2			; mm3=roundint(data4/8)=(04 ** 14 **)
-	pfadd	mm7,mm2			; mm7=roundint(data5/8)=(05 ** 15 **)
-	pfadd	mm1,mm2			; mm1=roundint(data2/8)=(02 ** 12 **)
-	pfadd	mm5,mm2			; mm5=roundint(data3/8)=(03 ** 13 **)
-
-	pand	mm3,mm4			; mm3=(04 -- 14 --)
-	pslld	mm7,WORD_BIT		; mm7=(-- 05 -- 15)
-	pand	mm1,mm4			; mm1=(02 -- 12 --)
-	pslld	mm5,WORD_BIT		; mm5=(-- 03 -- 13)
-	por	mm3,mm7			; mm3=(04 05 14 15)
-	por	mm1,mm5			; mm1=(02 03 12 13)
-
-	movq      mm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm2=[PB_CENTERJSAMP]
-
-	packsswb  mm6,mm3		; mm6=(00 01 10 11 04 05 14 15)
-	packsswb  mm1,mm0		; mm1=(02 03 12 13 06 07 16 17)
-	paddb     mm6,mm2
-	paddb     mm1,mm2
-
-	movq      mm4,mm6		; transpose coefficients(phase 2)
-	punpcklwd mm6,mm1		; mm6=(00 01 02 03 10 11 12 13)
-	punpckhwd mm4,mm1		; mm4=(04 05 06 07 14 15 16 17)
-
-	movq      mm7,mm6		; transpose coefficients(phase 3)
-	punpckldq mm6,mm4		; mm6=(00 01 02 03 04 05 06 07)
-	punpckhdq mm7,mm4		; mm7=(10 11 12 13 14 15 16 17)
-
-	pushpic	ebx			; save GOT address
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
-
-	poppic	ebx			; restore GOT address
-
-	add	esi, byte 2*SIZEOF_FAST_FLOAT	; wsptr
-	add	edi, byte 2*SIZEOF_JSAMPROW
-	dec	ecx				; ctr
-	jnz	near .rowloop
-
-	femms		; empty MMX/3DNow! state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jimmxfst.asm b/simd/jimmxfst.asm
deleted file mode 100644
index 1b535e1..0000000
--- a/simd/jimmxfst.asm
+++ /dev/null
@@ -1,500 +0,0 @@
-;
-; jimmxfst.asm - fast integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-%define PASS1_BITS	2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082	equ	277		; FIX(1.082392200)
-F_1_414	equ	362		; FIX(1.414213562)
-F_1_847	equ	473		; FIX(1.847759065)
-F_2_613	equ	669		; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
-F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_idct_ifast_mmx) PRIVATE
-
-EXTN(jconst_idct_ifast_mmx):
-
-PW_F1414	times 4 dw  F_1_414 << CONST_SHIFT
-PW_F1847	times 4 dw  F_1_847 << CONST_SHIFT
-PW_MF1613	times 4 dw -F_1_613 << CONST_SHIFT
-PW_F1082	times 4 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
-					; JCOEF workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_ifast_mmx) PRIVATE
-
-EXTN(jsimd_idct_ifast_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; JCOEF * wsptr
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	mm1,mm0
-	packsswb mm1,mm1
-	movd	eax,mm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
-	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
-	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
-
-	movq      mm1,mm0
-	punpckldq mm0,mm0		; mm0=(00 00 00 00)
-	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
-	movq      mm3,mm2
-	punpckldq mm2,mm2		; mm2=(02 02 02 02)
-	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movq	mm4,mm0
-	movq	mm5,mm1
-	psubw	mm0,mm2			; mm0=tmp11
-	psubw	mm1,mm3
-	paddw	mm4,mm2			; mm4=tmp10
-	paddw	mm5,mm3			; mm5=tmp13
-
-	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
-	psubw	mm1,mm5			; mm1=tmp12
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	psubw	mm4,mm5			; mm4=tmp3
-	psubw	mm0,mm1			; mm0=tmp2
-	paddw	mm6,mm5			; mm6=tmp0
-	paddw	mm7,mm1			; mm7=tmp1
-
-	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
-	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
-
-	; -- Odd part
-
-	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movq	mm4,mm2
-	movq	mm0,mm5
-	psubw	mm2,mm1			; mm2=z12
-	psubw	mm5,mm3			; mm5=z10
-	paddw	mm4,mm1			; mm4=z11
-	paddw	mm0,mm3			; mm0=z13
-
-	movq	mm1,mm5			; mm1=z10(unscaled)
-	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
-
-	movq	mm3,mm4
-	psubw	mm4,mm0
-	paddw	mm3,mm0			; mm3=tmp7
-
-	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movq	mm0,mm5
-	paddw	mm5,mm2
-	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
-	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
-	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
-	psubw	mm0,mm1
-	psubw	mm2,mm5			; mm2=tmp10
-	paddw	mm0,mm5			; mm0=tmp12
-
-	; -- Final output stage
-
-	psubw	mm0,mm3			; mm0=tmp6
-	movq	mm1,mm6
-	movq	mm5,mm7
-	paddw	mm6,mm3			; mm6=data0=(00 01 02 03)
-	paddw	mm7,mm0			; mm7=data1=(10 11 12 13)
-	psubw	mm1,mm3			; mm1=data7=(70 71 72 73)
-	psubw	mm5,mm0			; mm5=data6=(60 61 62 63)
-	psubw	mm4,mm0			; mm4=tmp5
-
-	movq      mm3,mm6		; transpose coefficients(phase 1)
-	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
-	punpckhwd mm3,mm7		; mm3=(02 12 03 13)
-	movq      mm0,mm5		; transpose coefficients(phase 1)
-	punpcklwd mm5,mm1		; mm5=(60 70 61 71)
-	punpckhwd mm0,mm1		; mm0=(62 72 63 73)
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
-	movq	mm1, MMWORD [wk(1)]	; mm1=tmp3
-
-	movq	MMWORD [wk(0)], mm5	; wk(0)=(60 70 61 71)
-	movq	MMWORD [wk(1)], mm0	; wk(1)=(62 72 63 73)
-
-	paddw	mm2,mm4			; mm2=tmp4
-	movq	mm5,mm7
-	movq	mm0,mm1
-	paddw	mm7,mm4			; mm7=data2=(20 21 22 23)
-	paddw	mm1,mm2			; mm1=data4=(40 41 42 43)
-	psubw	mm5,mm4			; mm5=data5=(50 51 52 53)
-	psubw	mm0,mm2			; mm0=data3=(30 31 32 33)
-
-	movq      mm4,mm7		; transpose coefficients(phase 1)
-	punpcklwd mm7,mm0		; mm7=(20 30 21 31)
-	punpckhwd mm4,mm0		; mm4=(22 32 23 33)
-	movq      mm2,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm5		; mm1=(40 50 41 51)
-	punpckhwd mm2,mm5		; mm2=(42 52 43 53)
-
-	movq      mm0,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm7		; mm6=(00 10 20 30)
-	punpckhdq mm0,mm7		; mm0=(01 11 21 31)
-	movq      mm5,mm3		; transpose coefficients(phase 2)
-	punpckldq mm3,mm4		; mm3=(02 12 22 32)
-	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=(60 70 61 71)
-	movq	mm4, MMWORD [wk(1)]	; mm4=(62 72 63 73)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
-	movq      mm6,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm7		; mm1=(40 50 60 70)
-	punpckhdq mm6,mm7		; mm6=(41 51 61 71)
-	movq      mm0,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm4		; mm2=(42 52 62 72)
-	punpckhdq mm0,mm4		; mm0=(43 53 63 73)
-
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
-	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
-
-.nextcolumn:
-	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 4*SIZEOF_IFAST_MULT_TYPE	; quantptr
-	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	lea	esi, [workspace]			; JCOEF * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.rowloop:
-
-	; -- Even part
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	movq	mm4,mm0
-	movq	mm5,mm1
-	psubw	mm0,mm2			; mm0=tmp11
-	psubw	mm1,mm3
-	paddw	mm4,mm2			; mm4=tmp10
-	paddw	mm5,mm3			; mm5=tmp13
-
-	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
-	psubw	mm1,mm5			; mm1=tmp12
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	psubw	mm4,mm5			; mm4=tmp3
-	psubw	mm0,mm1			; mm0=tmp2
-	paddw	mm6,mm5			; mm6=tmp0
-	paddw	mm7,mm1			; mm7=tmp1
-
-	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
-	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
-
-	; -- Odd part
-
-	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	movq	mm4,mm2
-	movq	mm0,mm5
-	psubw	mm2,mm1			; mm2=z12
-	psubw	mm5,mm3			; mm5=z10
-	paddw	mm4,mm1			; mm4=z11
-	paddw	mm0,mm3			; mm0=z13
-
-	movq	mm1,mm5			; mm1=z10(unscaled)
-	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
-
-	movq	mm3,mm4
-	psubw	mm4,mm0
-	paddw	mm3,mm0			; mm3=tmp7
-
-	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movq	mm0,mm5
-	paddw	mm5,mm2
-	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
-	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
-	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
-	psubw	mm0,mm1
-	psubw	mm2,mm5			; mm2=tmp10
-	paddw	mm0,mm5			; mm0=tmp12
-
-	; -- Final output stage
-
-	psubw	mm0,mm3			; mm0=tmp6
-	movq	mm1,mm6
-	movq	mm5,mm7
-	paddw	mm6,mm3			; mm6=data0=(00 10 20 30)
-	paddw	mm7,mm0			; mm7=data1=(01 11 21 31)
-	psraw	mm6,(PASS1_BITS+3)	; descale
-	psraw	mm7,(PASS1_BITS+3)	; descale
-	psubw	mm1,mm3			; mm1=data7=(07 17 27 37)
-	psubw	mm5,mm0			; mm5=data6=(06 16 26 36)
-	psraw	mm1,(PASS1_BITS+3)	; descale
-	psraw	mm5,(PASS1_BITS+3)	; descale
-	psubw	mm4,mm0			; mm4=tmp5
-
-	packsswb  mm6,mm5		; mm6=(00 10 20 30 06 16 26 36)
-	packsswb  mm7,mm1		; mm7=(01 11 21 31 07 17 27 37)
-
-	movq	mm3, MMWORD [wk(0)]	; mm3=tmp2
-	movq	mm0, MMWORD [wk(1)]	; mm0=tmp3
-
-	paddw	mm2,mm4			; mm2=tmp4
-	movq	mm5,mm3
-	movq	mm1,mm0
-	paddw	mm3,mm4			; mm3=data2=(02 12 22 32)
-	paddw	mm0,mm2			; mm0=data4=(04 14 24 34)
-	psraw	mm3,(PASS1_BITS+3)	; descale
-	psraw	mm0,(PASS1_BITS+3)	; descale
-	psubw	mm5,mm4			; mm5=data5=(05 15 25 35)
-	psubw	mm1,mm2			; mm1=data3=(03 13 23 33)
-	psraw	mm5,(PASS1_BITS+3)	; descale
-	psraw	mm1,(PASS1_BITS+3)	; descale
-
-	movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm4=[PB_CENTERJSAMP]
-
-	packsswb  mm3,mm0		; mm3=(02 12 22 32 04 14 24 34)
-	packsswb  mm1,mm5		; mm1=(03 13 23 33 05 15 25 35)
-
-	paddb     mm6,mm4
-	paddb     mm7,mm4
-	paddb     mm3,mm4
-	paddb     mm1,mm4
-
-	movq      mm2,mm6		; transpose coefficients(phase 1)
-	punpcklbw mm6,mm7		; mm6=(00 01 10 11 20 21 30 31)
-	punpckhbw mm2,mm7		; mm2=(06 07 16 17 26 27 36 37)
-	movq      mm0,mm3		; transpose coefficients(phase 1)
-	punpcklbw mm3,mm1		; mm3=(02 03 12 13 22 23 32 33)
-	punpckhbw mm0,mm1		; mm0=(04 05 14 15 24 25 34 35)
-
-	movq      mm5,mm6		; transpose coefficients(phase 2)
-	punpcklwd mm6,mm3		; mm6=(00 01 02 03 10 11 12 13)
-	punpckhwd mm5,mm3		; mm5=(20 21 22 23 30 31 32 33)
-	movq      mm4,mm0		; transpose coefficients(phase 2)
-	punpcklwd mm0,mm2		; mm0=(04 05 06 07 14 15 16 17)
-	punpckhwd mm4,mm2		; mm4=(24 25 26 27 34 35 36 37)
-
-	movq      mm7,mm6		; transpose coefficients(phase 3)
-	punpckldq mm6,mm0		; mm6=(00 01 02 03 04 05 06 07)
-	punpckhdq mm7,mm0		; mm7=(10 11 12 13 14 15 16 17)
-	movq      mm1,mm5		; transpose coefficients(phase 3)
-	punpckldq mm5,mm4		; mm5=(20 21 22 23 24 25 26 27)
-	punpckhdq mm1,mm4		; mm1=(30 31 32 33 34 35 36 37)
-
-	pushpic	ebx			; save GOT address
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
-	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
-
-	poppic	ebx			; restore GOT address
-
-	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
-	add	edi, byte 4*SIZEOF_JSAMPROW
-	dec	ecx				; ctr
-	jnz	near .rowloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jimmxint.asm b/simd/jimmxint.asm
deleted file mode 100644
index 2b84f62..0000000
--- a/simd/jimmxint.asm
+++ /dev/null
@@ -1,852 +0,0 @@
-;
-; jimmxint.asm - accurate integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_islow_mmx) PRIVATE
-
-EXTN(jconst_idct_islow_mmx):
-
-PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_mmx (void * dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		12
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
-					; JCOEF workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_islow_mmx) PRIVATE
-
-EXTN(jsimd_idct_islow_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; JCOEF * wsptr
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	mm1,mm0
-	packsswb mm1,mm1
-	movd	eax,mm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	mm0,PASS1_BITS
-
-	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
-	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
-	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
-
-	movq      mm1,mm0
-	punpckldq mm0,mm0		; mm0=(00 00 00 00)
-	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
-	movq      mm3,mm2
-	punpckldq mm2,mm2		; mm2=(02 02 02 02)
-	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movq      mm4,mm1		; mm1=in2=z2
-	movq      mm5,mm1
-	punpcklwd mm4,mm3		; mm3=in6=z3
-	punpckhwd mm5,mm3
-	movq      mm1,mm4
-	movq      mm3,mm5
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
-
-	movq      mm6,mm0
-	paddw     mm0,mm2		; mm0=in0+in4
-	psubw     mm6,mm2		; mm6=in0-in4
-
-	pxor      mm7,mm7
-	pxor      mm2,mm2
-	punpcklwd mm7,mm0		; mm7=tmp0L
-	punpckhwd mm2,mm0		; mm2=tmp0H
-	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
-	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
-
-	movq	mm0,mm7
-	paddd	mm7,mm4			; mm7=tmp10L
-	psubd	mm0,mm4			; mm0=tmp13L
-	movq	mm4,mm2
-	paddd	mm2,mm5			; mm2=tmp10H
-	psubd	mm4,mm5			; mm4=tmp13H
-
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
-	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
-	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
-	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
-
-	pxor      mm5,mm5
-	pxor      mm7,mm7
-	punpcklwd mm5,mm6		; mm5=tmp1L
-	punpckhwd mm7,mm6		; mm7=tmp1H
-	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
-	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
-
-	movq	mm2,mm5
-	paddd	mm5,mm1			; mm5=tmp11L
-	psubd	mm2,mm1			; mm2=tmp12L
-	movq	mm0,mm7
-	paddd	mm7,mm3			; mm7=tmp11H
-	psubd	mm0,mm3			; mm0=tmp12H
-
-	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
-	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
-	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
-	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movq	mm5,mm6
-	movq	mm7,mm4
-	paddw	mm5,mm3			; mm5=z3
-	paddw	mm7,mm1			; mm7=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movq      mm2,mm5
-	movq      mm0,mm5
-	punpcklwd mm2,mm7
-	punpckhwd mm0,mm7
-	movq      mm5,mm2
-	movq      mm7,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
-
-	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
-	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movq      mm2,mm3
-	movq      mm0,mm3
-	punpcklwd mm2,mm4
-	punpckhwd mm0,mm4
-	movq      mm3,mm2
-	movq      mm4,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
-	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
-
-	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
-	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
-	paddd	mm3,mm5			; mm3=tmp3L
-	paddd	mm4,mm7			; mm4=tmp3H
-
-	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
-	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
-
-	movq      mm2,mm1
-	movq      mm0,mm1
-	punpcklwd mm2,mm6
-	punpckhwd mm0,mm6
-	movq      mm1,mm2
-	movq      mm6,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
-
-	paddd	mm2,mm5			; mm2=tmp1L
-	paddd	mm0,mm7			; mm0=tmp1H
-	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
-	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
-
-	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
-	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
-	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
-
-	movq	mm2,mm5
-	movq	mm0,mm7
-	paddd	mm5,mm3			; mm5=data0L
-	paddd	mm7,mm4			; mm7=data0H
-	psubd	mm2,mm3			; mm2=data7L
-	psubd	mm0,mm4			; mm0=data7H
-
-	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm3=[PD_DESCALE_P1]
-
-	paddd	mm5,mm3
-	paddd	mm7,mm3
-	psrad	mm5,DESCALE_P1
-	psrad	mm7,DESCALE_P1
-	paddd	mm2,mm3
-	paddd	mm0,mm3
-	psrad	mm2,DESCALE_P1
-	psrad	mm0,DESCALE_P1
-
-	packssdw  mm5,mm7		; mm5=data0=(00 01 02 03)
-	packssdw  mm2,mm0		; mm2=data7=(70 71 72 73)
-
-	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
-	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
-
-	movq	mm7,mm4
-	movq	mm0,mm3
-	paddd	mm4,mm1			; mm4=data1L
-	paddd	mm3,mm6			; mm3=data1H
-	psubd	mm7,mm1			; mm7=data6L
-	psubd	mm0,mm6			; mm0=data6H
-
-	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm1=[PD_DESCALE_P1]
-
-	paddd	mm4,mm1
-	paddd	mm3,mm1
-	psrad	mm4,DESCALE_P1
-	psrad	mm3,DESCALE_P1
-	paddd	mm7,mm1
-	paddd	mm0,mm1
-	psrad	mm7,DESCALE_P1
-	psrad	mm0,DESCALE_P1
-
-	packssdw  mm4,mm3		; mm4=data1=(10 11 12 13)
-	packssdw  mm7,mm0		; mm7=data6=(60 61 62 63)
-
-	movq      mm6,mm5		; transpose coefficients(phase 1)
-	punpcklwd mm5,mm4		; mm5=(00 10 01 11)
-	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
-	movq      mm1,mm7		; transpose coefficients(phase 1)
-	punpcklwd mm7,mm2		; mm7=(60 70 61 71)
-	punpckhwd mm1,mm2		; mm1=(62 72 63 73)
-
-	movq	mm3, MMWORD [wk(6)]	; mm3=tmp12L
-	movq	mm0, MMWORD [wk(7)]	; mm0=tmp12H
-	movq	mm4, MMWORD [wk(10)]	; mm4=tmp1L
-	movq	mm2, MMWORD [wk(11)]	; mm2=tmp1H
-
-	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 01 11)
-	movq	MMWORD [wk(1)], mm6	; wk(1)=(02 12 03 13)
-	movq	MMWORD [wk(4)], mm7	; wk(4)=(60 70 61 71)
-	movq	MMWORD [wk(5)], mm1	; wk(5)=(62 72 63 73)
-
-	movq	mm5,mm3
-	movq	mm6,mm0
-	paddd	mm3,mm4			; mm3=data2L
-	paddd	mm0,mm2			; mm0=data2H
-	psubd	mm5,mm4			; mm5=data5L
-	psubd	mm6,mm2			; mm6=data5H
-
-	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm7=[PD_DESCALE_P1]
-
-	paddd	mm3,mm7
-	paddd	mm0,mm7
-	psrad	mm3,DESCALE_P1
-	psrad	mm0,DESCALE_P1
-	paddd	mm5,mm7
-	paddd	mm6,mm7
-	psrad	mm5,DESCALE_P1
-	psrad	mm6,DESCALE_P1
-
-	packssdw  mm3,mm0		; mm3=data2=(20 21 22 23)
-	packssdw  mm5,mm6		; mm5=data5=(50 51 52 53)
-
-	movq	mm1, MMWORD [wk(2)]	; mm1=tmp13L
-	movq	mm4, MMWORD [wk(3)]	; mm4=tmp13H
-	movq	mm2, MMWORD [wk(8)]	; mm2=tmp0L
-	movq	mm7, MMWORD [wk(9)]	; mm7=tmp0H
-
-	movq	mm0,mm1
-	movq	mm6,mm4
-	paddd	mm1,mm2			; mm1=data3L
-	paddd	mm4,mm7			; mm4=data3H
-	psubd	mm0,mm2			; mm0=data4L
-	psubd	mm6,mm7			; mm6=data4H
-
-	movq	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm2=[PD_DESCALE_P1]
-
-	paddd	mm1,mm2
-	paddd	mm4,mm2
-	psrad	mm1,DESCALE_P1
-	psrad	mm4,DESCALE_P1
-	paddd	mm0,mm2
-	paddd	mm6,mm2
-	psrad	mm0,DESCALE_P1
-	psrad	mm6,DESCALE_P1
-
-	packssdw  mm1,mm4		; mm1=data3=(30 31 32 33)
-	packssdw  mm0,mm6		; mm0=data4=(40 41 42 43)
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=(00 10 01 11)
-	movq	mm2, MMWORD [wk(1)]	; mm2=(02 12 03 13)
-
-	movq      mm4,mm3		; transpose coefficients(phase 1)
-	punpcklwd mm3,mm1		; mm3=(20 30 21 31)
-	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
-	movq      mm6,mm0		; transpose coefficients(phase 1)
-	punpcklwd mm0,mm5		; mm0=(40 50 41 51)
-	punpckhwd mm6,mm5		; mm6=(42 52 43 53)
-
-	movq      mm1,mm7		; transpose coefficients(phase 2)
-	punpckldq mm7,mm3		; mm7=(00 10 20 30)
-	punpckhdq mm1,mm3		; mm1=(01 11 21 31)
-	movq      mm5,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm4		; mm2=(02 12 22 32)
-	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
-
-	movq	mm3, MMWORD [wk(4)]	; mm3=(60 70 61 71)
-	movq	mm4, MMWORD [wk(5)]	; mm4=(62 72 63 73)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
-	movq      mm7,mm0		; transpose coefficients(phase 2)
-	punpckldq mm0,mm3		; mm0=(40 50 60 70)
-	punpckhdq mm7,mm3		; mm7=(41 51 61 71)
-	movq      mm1,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm4		; mm6=(42 52 62 72)
-	punpckhdq mm1,mm4		; mm1=(43 53 63 73)
-
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
-	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
-	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
-
-.nextcolumn:
-	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
-	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	lea	esi, [workspace]			; JCOEF * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.rowloop:
-
-	; -- Even part
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movq      mm4,mm1		; mm1=in2=z2
-	movq      mm5,mm1
-	punpcklwd mm4,mm3		; mm3=in6=z3
-	punpckhwd mm5,mm3
-	movq      mm1,mm4
-	movq      mm3,mm5
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
-
-	movq      mm6,mm0
-	paddw     mm0,mm2		; mm0=in0+in4
-	psubw     mm6,mm2		; mm6=in0-in4
-
-	pxor      mm7,mm7
-	pxor      mm2,mm2
-	punpcklwd mm7,mm0		; mm7=tmp0L
-	punpckhwd mm2,mm0		; mm2=tmp0H
-	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
-	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
-
-	movq	mm0,mm7
-	paddd	mm7,mm4			; mm7=tmp10L
-	psubd	mm0,mm4			; mm0=tmp13L
-	movq	mm4,mm2
-	paddd	mm2,mm5			; mm2=tmp10H
-	psubd	mm4,mm5			; mm4=tmp13H
-
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
-	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
-	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
-	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
-
-	pxor      mm5,mm5
-	pxor      mm7,mm7
-	punpcklwd mm5,mm6		; mm5=tmp1L
-	punpckhwd mm7,mm6		; mm7=tmp1H
-	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
-	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
-
-	movq	mm2,mm5
-	paddd	mm5,mm1			; mm5=tmp11L
-	psubd	mm2,mm1			; mm2=tmp12L
-	movq	mm0,mm7
-	paddd	mm7,mm3			; mm7=tmp11H
-	psubd	mm0,mm3			; mm0=tmp12H
-
-	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
-	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
-	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
-	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	movq	mm5,mm6
-	movq	mm7,mm4
-	paddw	mm5,mm3			; mm5=z3
-	paddw	mm7,mm1			; mm7=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movq      mm2,mm5
-	movq      mm0,mm5
-	punpcklwd mm2,mm7
-	punpckhwd mm0,mm7
-	movq      mm5,mm2
-	movq      mm7,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
-
-	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
-	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movq      mm2,mm3
-	movq      mm0,mm3
-	punpcklwd mm2,mm4
-	punpckhwd mm0,mm4
-	movq      mm3,mm2
-	movq      mm4,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
-	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
-
-	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
-	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
-	paddd	mm3,mm5			; mm3=tmp3L
-	paddd	mm4,mm7			; mm4=tmp3H
-
-	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
-	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
-
-	movq      mm2,mm1
-	movq      mm0,mm1
-	punpcklwd mm2,mm6
-	punpckhwd mm0,mm6
-	movq      mm1,mm2
-	movq      mm6,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
-
-	paddd	mm2,mm5			; mm2=tmp1L
-	paddd	mm0,mm7			; mm0=tmp1H
-	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
-	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
-
-	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
-	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
-	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
-
-	movq	mm2,mm5
-	movq	mm0,mm7
-	paddd	mm5,mm3			; mm5=data0L
-	paddd	mm7,mm4			; mm7=data0H
-	psubd	mm2,mm3			; mm2=data7L
-	psubd	mm0,mm4			; mm0=data7H
-
-	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm3=[PD_DESCALE_P2]
-
-	paddd	mm5,mm3
-	paddd	mm7,mm3
-	psrad	mm5,DESCALE_P2
-	psrad	mm7,DESCALE_P2
-	paddd	mm2,mm3
-	paddd	mm0,mm3
-	psrad	mm2,DESCALE_P2
-	psrad	mm0,DESCALE_P2
-
-	packssdw  mm5,mm7		; mm5=data0=(00 10 20 30)
-	packssdw  mm2,mm0		; mm2=data7=(07 17 27 37)
-
-	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
-	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
-
-	movq	mm7,mm4
-	movq	mm0,mm3
-	paddd	mm4,mm1			; mm4=data1L
-	paddd	mm3,mm6			; mm3=data1H
-	psubd	mm7,mm1			; mm7=data6L
-	psubd	mm0,mm6			; mm0=data6H
-
-	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm1=[PD_DESCALE_P2]
-
-	paddd	mm4,mm1
-	paddd	mm3,mm1
-	psrad	mm4,DESCALE_P2
-	psrad	mm3,DESCALE_P2
-	paddd	mm7,mm1
-	paddd	mm0,mm1
-	psrad	mm7,DESCALE_P2
-	psrad	mm0,DESCALE_P2
-
-	packssdw  mm4,mm3		; mm4=data1=(01 11 21 31)
-	packssdw  mm7,mm0		; mm7=data6=(06 16 26 36)
-
-	packsswb  mm5,mm7		; mm5=(00 10 20 30 06 16 26 36)
-	packsswb  mm4,mm2		; mm4=(01 11 21 31 07 17 27 37)
-
-	movq	mm6, MMWORD [wk(6)]	; mm6=tmp12L
-	movq	mm1, MMWORD [wk(7)]	; mm1=tmp12H
-	movq	mm3, MMWORD [wk(10)]	; mm3=tmp1L
-	movq	mm0, MMWORD [wk(11)]	; mm0=tmp1H
-
-	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 20 30 06 16 26 36)
-	movq	MMWORD [wk(1)], mm4	; wk(1)=(01 11 21 31 07 17 27 37)
-
-	movq	mm7,mm6
-	movq	mm2,mm1
-	paddd	mm6,mm3			; mm6=data2L
-	paddd	mm1,mm0			; mm1=data2H
-	psubd	mm7,mm3			; mm7=data5L
-	psubd	mm2,mm0			; mm2=data5H
-
-	movq	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm5=[PD_DESCALE_P2]
-
-	paddd	mm6,mm5
-	paddd	mm1,mm5
-	psrad	mm6,DESCALE_P2
-	psrad	mm1,DESCALE_P2
-	paddd	mm7,mm5
-	paddd	mm2,mm5
-	psrad	mm7,DESCALE_P2
-	psrad	mm2,DESCALE_P2
-
-	packssdw  mm6,mm1		; mm6=data2=(02 12 22 32)
-	packssdw  mm7,mm2		; mm7=data5=(05 15 25 35)
-
-	movq	mm4, MMWORD [wk(2)]	; mm4=tmp13L
-	movq	mm3, MMWORD [wk(3)]	; mm3=tmp13H
-	movq	mm0, MMWORD [wk(8)]	; mm0=tmp0L
-	movq	mm5, MMWORD [wk(9)]	; mm5=tmp0H
-
-	movq	mm1,mm4
-	movq	mm2,mm3
-	paddd	mm4,mm0			; mm4=data3L
-	paddd	mm3,mm5			; mm3=data3H
-	psubd	mm1,mm0			; mm1=data4L
-	psubd	mm2,mm5			; mm2=data4H
-
-	movq	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm0=[PD_DESCALE_P2]
-
-	paddd	mm4,mm0
-	paddd	mm3,mm0
-	psrad	mm4,DESCALE_P2
-	psrad	mm3,DESCALE_P2
-	paddd	mm1,mm0
-	paddd	mm2,mm0
-	psrad	mm1,DESCALE_P2
-	psrad	mm2,DESCALE_P2
-
-	movq      mm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm5=[PB_CENTERJSAMP]
-
-	packssdw  mm4,mm3		; mm4=data3=(03 13 23 33)
-	packssdw  mm1,mm2		; mm1=data4=(04 14 24 34)
-
-	movq      mm0, MMWORD [wk(0)]	; mm0=(00 10 20 30 06 16 26 36)
-	movq      mm3, MMWORD [wk(1)]	; mm3=(01 11 21 31 07 17 27 37)
-
-	packsswb  mm6,mm1		; mm6=(02 12 22 32 04 14 24 34)
-	packsswb  mm4,mm7		; mm4=(03 13 23 33 05 15 25 35)
-
-	paddb     mm0,mm5
-	paddb     mm3,mm5
-	paddb     mm6,mm5
-	paddb     mm4,mm5
-
-	movq      mm2,mm0		; transpose coefficients(phase 1)
-	punpcklbw mm0,mm3		; mm0=(00 01 10 11 20 21 30 31)
-	punpckhbw mm2,mm3		; mm2=(06 07 16 17 26 27 36 37)
-	movq      mm1,mm6		; transpose coefficients(phase 1)
-	punpcklbw mm6,mm4		; mm6=(02 03 12 13 22 23 32 33)
-	punpckhbw mm1,mm4		; mm1=(04 05 14 15 24 25 34 35)
-
-	movq      mm7,mm0		; transpose coefficients(phase 2)
-	punpcklwd mm0,mm6		; mm0=(00 01 02 03 10 11 12 13)
-	punpckhwd mm7,mm6		; mm7=(20 21 22 23 30 31 32 33)
-	movq      mm5,mm1		; transpose coefficients(phase 2)
-	punpcklwd mm1,mm2		; mm1=(04 05 06 07 14 15 16 17)
-	punpckhwd mm5,mm2		; mm5=(24 25 26 27 34 35 36 37)
-
-	movq      mm3,mm0		; transpose coefficients(phase 3)
-	punpckldq mm0,mm1		; mm0=(00 01 02 03 04 05 06 07)
-	punpckhdq mm3,mm1		; mm3=(10 11 12 13 14 15 16 17)
-	movq      mm4,mm7		; transpose coefficients(phase 3)
-	punpckldq mm7,mm5		; mm7=(20 21 22 23 24 25 26 27)
-	punpckhdq mm4,mm5		; mm4=(30 31 32 33 34 35 36 37)
-
-	pushpic	ebx			; save GOT address
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
-	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
-	poppic	ebx			; restore GOT address
-
-	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
-	add	edi, byte 4*SIZEOF_JSAMPROW
-	dec	ecx				; ctr
-	jnz	near .rowloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jimmxred.asm b/simd/jimmxred.asm
deleted file mode 100644
index f8e61d1..0000000
--- a/simd/jimmxred.asm
+++ /dev/null
@@ -1,706 +0,0 @@
-;
-; jimmxred.asm - reduced-size IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13		; fraction bits of the F_x_xxx constants
-%define PASS1_BITS	2		; extra fraction bits kept in pass-1 output
-
-%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)	; 4x4 IDCT, pass-1 shift
-%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)	; 4x4 IDCT, pass-2 shift
-%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)	; 2x2 IDCT, pass-1 shift
-%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)	; 2x2 IDCT, pass-2 shift
-
-%if CONST_BITS == 13
-F_0_211	equ	 1730		; FIX(0.211164243)
-F_0_509	equ	 4176		; FIX(0.509795579)
-F_0_601	equ	 4926		; FIX(0.601344887)
-F_0_720	equ	 5906		; FIX(0.720959822)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_850	equ	 6967		; FIX(0.850430095)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_061	equ	 8697		; FIX(1.061594337)
-F_1_272	equ	10426		; FIX(1.272758580)
-F_1_451	equ	11893		; FIX(1.451774981)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_2_172	equ	17799		; FIX(2.172734803)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_624	equ	29692		; FIX(3.624509785)
-%else
-; NASM cannot do compile-time floating-point arithmetic, so each FIX(x) is given as x*2^30 and reduced to CONST_BITS.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; right shift by n with rounding
-F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
-F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
-F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
-F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
-F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
-F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_red_mmx) PRIVATE
-
-EXTN(jconst_idct_red_mmx):	; constants shared by the 4x4 and 2x2 MMX IDCTs
-
-PW_F184_MF076	times 2 dw  F_1_847,-F_0_765	; 4x4 even part (pmaddwd pairs)
-PW_F256_F089	times 2 dw  F_2_562, F_0_899	; 4x4 odd part, rows 1/3 -> tmp2
-PW_F106_MF217	times 2 dw  F_1_061,-F_2_172	; 4x4 odd part, rows 1/3 -> tmp0
-PW_MF060_MF050	times 2 dw -F_0_601,-F_0_509	; 4x4 odd part, rows 5/7 -> tmp2
-PW_F145_MF021	times 2 dw  F_1_451,-F_0_211	; 4x4 odd part, rows 5/7 -> tmp0
-PW_F362_MF127	times 2 dw  F_3_624,-F_1_272	; 2x2 odd part, inputs 1/3
-PW_F085_MF072	times 2 dw  F_0_850,-F_0_720	; 2x2 odd part, inputs 5/7
-PD_DESCALE_P1_4	times 2 dd  1 << (DESCALE_P1_4-1)	; rounding term added before psrad
-PD_DESCALE_P2_4	times 2 dd  1 << (DESCALE_P2_4-1)	; rounding term added before psrad
-PD_DESCALE_P1_2	times 2 dd  1 << (DESCALE_P1_2-1)	; rounding term added before psrad
-PD_DESCALE_P2_2	times 2 dd  1 << (DESCALE_P2_2-1)	; rounding term added before psrad
-PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE	; added (paddb) to every output byte
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
-;                     JSAMPARRAY output_buf, JDIMENSION output_col)
-; Clobbers eax, ecx, edx and mm0-mm7; executes emms before returning.
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0		; [ebp+0] holds the original ebp
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
-					; JCOEF workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_4x4_mmx) PRIVATE
-
-EXTN(jsimd_idct_4x4_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4			; make room for the original ebp
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax			; [original_ebp] = eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]		; reserve wk[] and workspace[]
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]	; (skipped: eax is still set from the prologue)
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; JCOEF * wsptr
-	mov	ecx, DCTSIZE/4				; ctr (4 columns per iteration)
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT		; quick reject: nonzero AC term
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	mm0,mm1			; mm0 = OR of rows 1,2,3,5,6,7
-	packsswb mm0,mm0		; 4 words -> 4 bytes (nonzero stays nonzero)
-	movd	eax,mm0
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero (row 4 is never read by this routine)
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	mm0,PASS1_BITS
-
-	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
-	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
-	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
-
-	movq      mm1,mm0
-	punpckldq mm0,mm0		; mm0=(00 00 00 00)
-	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
-	movq      mm3,mm2
-	punpckldq mm2,mm2		; mm2=(02 02 02 02)
-	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Odd part
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movq      mm4,mm0
-	movq      mm5,mm0
-	punpcklwd mm4,mm1
-	punpckhwd mm5,mm1
-	movq      mm0,mm4
-	movq      mm1,mm5
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
-
-	movq      mm6,mm2
-	movq      mm7,mm2
-	punpcklwd mm6,mm3
-	punpckhwd mm7,mm3
-	movq      mm2,mm6
-	movq      mm3,mm7
-	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
-	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
-
-	paddd	mm6,mm4			; mm6=tmp2L
-	paddd	mm7,mm5			; mm7=tmp2H
-	paddd	mm2,mm0			; mm2=tmp0L
-	paddd	mm3,mm1			; mm3=tmp0H
-
-	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
-	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
-
-	; -- Even part
-
-	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	pxor      mm1,mm1
-	pxor      mm2,mm2
-	punpcklwd mm1,mm4		; mm1=tmp0L
-	punpckhwd mm2,mm4		; mm2=tmp0H
-	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
-	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
-	movq      mm3,mm5		; mm5=in2=z2
-	punpcklwd mm5,mm0		; mm0=in6=z3
-	punpckhwd mm3,mm0
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
-
-	movq	mm4,mm1
-	movq	mm0,mm2
-	paddd	mm1,mm5			; mm1=tmp10L
-	paddd	mm2,mm3			; mm2=tmp10H
-	psubd	mm4,mm5			; mm4=tmp12L
-	psubd	mm0,mm3			; mm0=tmp12H
-
-	; -- Final output stage
-
-	movq	mm5,mm1
-	movq	mm3,mm2
-	paddd	mm1,mm6			; mm1=data0L
-	paddd	mm2,mm7			; mm2=data0H
-	psubd	mm5,mm6			; mm5=data3L
-	psubd	mm3,mm7			; mm3=data3H
-
-	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm6=[PD_DESCALE_P1_4]
-
-	paddd	mm1,mm6
-	paddd	mm2,mm6
-	psrad	mm1,DESCALE_P1_4
-	psrad	mm2,DESCALE_P1_4
-	paddd	mm5,mm6
-	paddd	mm3,mm6
-	psrad	mm5,DESCALE_P1_4
-	psrad	mm3,DESCALE_P1_4
-
-	packssdw  mm1,mm2		; mm1=data0=(00 01 02 03)
-	packssdw  mm5,mm3		; mm5=data3=(30 31 32 33)
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
-	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
-
-	movq	mm2,mm4
-	movq	mm3,mm0
-	paddd	mm4,mm7			; mm4=data1L
-	paddd	mm0,mm6			; mm0=data1H
-	psubd	mm2,mm7			; mm2=data2L
-	psubd	mm3,mm6			; mm3=data2H
-
-	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm7=[PD_DESCALE_P1_4]
-
-	paddd	mm4,mm7
-	paddd	mm0,mm7
-	psrad	mm4,DESCALE_P1_4
-	psrad	mm0,DESCALE_P1_4
-	paddd	mm2,mm7
-	paddd	mm3,mm7
-	psrad	mm2,DESCALE_P1_4
-	psrad	mm3,DESCALE_P1_4
-
-	packssdw  mm4,mm0		; mm4=data1=(10 11 12 13)
-	packssdw  mm2,mm3		; mm2=data2=(20 21 22 23)
-
-	movq      mm6,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm4		; mm1=(00 10 01 11)
-	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
-	movq      mm7,mm2		; transpose coefficients(phase 1)
-	punpcklwd mm2,mm5		; mm2=(20 30 21 31)
-	punpckhwd mm7,mm5		; mm7=(22 32 23 33)
-
-	movq      mm0,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm2		; mm1=(00 10 20 30)
-	punpckhdq mm0,mm2		; mm0=(01 11 21 31)
-	movq      mm3,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm7		; mm6=(02 12 22 32)
-	punpckhdq mm3,mm7		; mm3=(03 13 23 33)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-
-.nextcolumn:
-	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
-	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]			; eax = original ebp
-	lea	esi, [workspace]			; JCOEF * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col
-
-	; -- Odd part
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	movq      mm4,mm0
-	movq      mm5,mm0
-	punpcklwd mm4,mm1
-	punpckhwd mm5,mm1
-	movq      mm0,mm4
-	movq      mm1,mm5
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
-
-	movq      mm6,mm2
-	movq      mm7,mm2
-	punpcklwd mm6,mm3
-	punpckhwd mm7,mm3
-	movq      mm2,mm6
-	movq      mm3,mm7
-	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
-	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
-
-	paddd	mm6,mm4			; mm6=tmp2L
-	paddd	mm7,mm5			; mm7=tmp2H
-	paddd	mm2,mm0			; mm2=tmp0L
-	paddd	mm3,mm1			; mm3=tmp0H
-
-	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
-	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
-
-	; -- Even part
-
-	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	pxor      mm1,mm1
-	pxor      mm2,mm2
-	punpcklwd mm1,mm4		; mm1=tmp0L
-	punpckhwd mm2,mm4		; mm2=tmp0H
-	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
-	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
-	movq      mm3,mm5		; mm5=in2=z2
-	punpcklwd mm5,mm0		; mm0=in6=z3
-	punpckhwd mm3,mm0
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
-
-	movq	mm4,mm1
-	movq	mm0,mm2
-	paddd	mm1,mm5			; mm1=tmp10L
-	paddd	mm2,mm3			; mm2=tmp10H
-	psubd	mm4,mm5			; mm4=tmp12L
-	psubd	mm0,mm3			; mm0=tmp12H
-
-	; -- Final output stage
-
-	movq	mm5,mm1
-	movq	mm3,mm2
-	paddd	mm1,mm6			; mm1=data0L
-	paddd	mm2,mm7			; mm2=data0H
-	psubd	mm5,mm6			; mm5=data3L
-	psubd	mm3,mm7			; mm3=data3H
-
-	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm6=[PD_DESCALE_P2_4]
-
-	paddd	mm1,mm6
-	paddd	mm2,mm6
-	psrad	mm1,DESCALE_P2_4
-	psrad	mm2,DESCALE_P2_4
-	paddd	mm5,mm6
-	paddd	mm3,mm6
-	psrad	mm5,DESCALE_P2_4
-	psrad	mm3,DESCALE_P2_4
-
-	packssdw  mm1,mm2		; mm1=data0=(00 10 20 30)
-	packssdw  mm5,mm3		; mm5=data3=(03 13 23 33)
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
-	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
-
-	movq	mm2,mm4
-	movq	mm3,mm0
-	paddd	mm4,mm7			; mm4=data1L
-	paddd	mm0,mm6			; mm0=data1H
-	psubd	mm2,mm7			; mm2=data2L
-	psubd	mm3,mm6			; mm3=data2H
-
-	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm7=[PD_DESCALE_P2_4]
-
-	paddd	mm4,mm7
-	paddd	mm0,mm7
-	psrad	mm4,DESCALE_P2_4
-	psrad	mm0,DESCALE_P2_4
-	paddd	mm2,mm7
-	paddd	mm3,mm7
-	psrad	mm2,DESCALE_P2_4
-	psrad	mm3,DESCALE_P2_4
-
-	packssdw  mm4,mm0		; mm4=data1=(01 11 21 31)
-	packssdw  mm2,mm3		; mm2=data2=(02 12 22 32)
-
-	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
-
-	packsswb  mm1,mm2		; mm1=(00 10 20 30 02 12 22 32)
-	packsswb  mm4,mm5		; mm4=(01 11 21 31 03 13 23 33)
-	paddb     mm1,mm6
-	paddb     mm4,mm6
-
-	movq      mm7,mm1		; transpose coefficients(phase 1)
-	punpcklbw mm1,mm4		; mm1=(00 01 10 11 20 21 30 31)
-	punpckhbw mm7,mm4		; mm7=(02 03 12 13 22 23 32 33)
-
-	movq      mm0,mm1		; transpose coefficients(phase 2)
-	punpcklwd mm1,mm7		; mm1=(00 01 02 03 10 11 12 13)
-	punpckhwd mm0,mm7		; mm0=(20 21 22 23 30 31 32 33)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1	; row 0 = (00 01 02 03)
-	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0	; row 2 = (20 21 22 23)
-
-	psrlq	mm1,4*BYTE_BIT		; mm1=(10 11 12 13 -- -- -- --)
-	psrlq	mm0,4*BYTE_BIT		; mm0=(30 31 32 33 -- -- -- --)
-
-	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1	; row 1 = (10 11 12 13)
-	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0	; row 3 = (30 31 32 33)
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
-;                     JSAMPARRAY output_buf, JDIMENSION output_col)
-; Clobbers eax, ecx, edx and mm0-mm7; executes emms before returning.
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-	align	16
-	global	EXTN(jsimd_idct_2x2_mmx) PRIVATE
-
-EXTN(jsimd_idct_2x2_mmx):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input.
-
-	mov	edx, POINTER [dct_table(ebp)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
-
-	; | input:                  | result:        |
-	; | 00 01 ** 03 ** 05 ** 07 |                |
-	; | 10 11 ** 13 ** 15 ** 17 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-	; | 50 51 ** 53 ** 55 ** 57 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 70 71 ** 73 ** 75 ** 77 |                |
-
-	; -- Odd part
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
-	; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
-
-	pcmpeqd   mm7,mm7		; mm7=all ones
-	pslld     mm7,WORD_BIT		; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
-
-	movq      mm4,mm0		; mm4=(10 11 ** 13)
-	movq      mm5,mm2		; mm5=(50 51 ** 53)
-	punpcklwd mm4,mm1		; mm4=(10 30 11 31)
-	punpcklwd mm5,mm3		; mm5=(50 70 51 71)
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-	psrld	mm0,WORD_BIT		; mm0=(11 -- 13 --)
-	pand	mm1,mm7			; mm1=(-- 31 -- 33)
-	psrld	mm2,WORD_BIT		; mm2=(51 -- 53 --)
-	pand	mm3,mm7			; mm3=(-- 71 -- 73)
-	por	mm0,mm1			; mm0=(11 31 13 33)
-	por	mm2,mm3			; mm2=(51 71 53 73)
-	pmaddwd	mm0,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd	mm2,[GOTOFF(ebx,PW_F085_MF072)]
-
-	paddd	mm4,mm5			; mm4=tmp0[col0 col1]
-
-	movq	mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
-	pmullw	mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movq	mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
-	movq	mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
-	pmullw	mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
-	; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
-
-	psrld	mm6,WORD_BIT		; mm6=(15 -- 17 --)
-	pand	mm1,mm7			; mm1=(-- 35 -- 37)
-	psrld	mm3,WORD_BIT		; mm3=(55 -- 57 --)
-	pand	mm5,mm7			; mm5=(-- 75 -- 77)
-	por	mm6,mm1			; mm6=(15 35 17 37)
-	por	mm3,mm5			; mm3=(55 75 57 77)
-	pmaddwd	mm6,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd	mm3,[GOTOFF(ebx,PW_F085_MF072)]
-
-	paddd	mm0,mm2			; mm0=tmp0[col1 col3]
-	paddd	mm6,mm3			; mm6=tmp0[col5 col7]
-
-	; -- Even part
-
-	movq	mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
-	pmullw	mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
-
-	movq	mm2,mm1				; mm2=(00 01 ** 03)
-	pslld	mm1,WORD_BIT			; mm1=(-- 00 -- **)
-	psrad	mm1,(WORD_BIT-CONST_BITS-2)	; mm1=tmp10[col0 ****]
-
-	pand	mm2,mm7				; mm2=(-- 01 -- 03)
-	pand	mm5,mm7				; mm5=(-- 05 -- 07)
-	psrad	mm2,(WORD_BIT-CONST_BITS-2)	; mm2=tmp10[col1 col3]
-	psrad	mm5,(WORD_BIT-CONST_BITS-2)	; mm5=tmp10[col5 col7]
-
-	; -- Final output stage
-
-	movq      mm3,mm1
-	paddd     mm1,mm4		; mm1=data0[col0 ****]=(A0 **)
-	psubd     mm3,mm4		; mm3=data1[col0 ****]=(B0 **)
-	punpckldq mm1,mm3		; mm1=(A0 B0)
-
-	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; mm7=[PD_DESCALE_P1_2]
-
-	movq	mm4,mm2
-	movq	mm3,mm5
-	paddd	mm2,mm0			; mm2=data0[col1 col3]=(A1 A3)
-	paddd	mm5,mm6			; mm5=data0[col5 col7]=(A5 A7)
-	psubd	mm4,mm0			; mm4=data1[col1 col3]=(B1 B3)
-	psubd	mm3,mm6			; mm3=data1[col5 col7]=(B5 B7)
-
-	paddd	mm1,mm7
-	psrad	mm1,DESCALE_P1_2
-
-	paddd	mm2,mm7
-	paddd	mm5,mm7
-	psrad	mm2,DESCALE_P1_2
-	psrad	mm5,DESCALE_P1_2
-	paddd	mm4,mm7
-	paddd	mm3,mm7
-	psrad	mm4,DESCALE_P1_2
-	psrad	mm3,DESCALE_P1_2
-
-	; ---- Pass 2: process rows, store into output array.
-
-	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(ebp)]
-
-	; | input:| result:|
-	; | A0 B0 |        |
-	; | A1 B1 | C0 C1  |
-	; | A3 B3 | D0 D1  |
-	; | A5 B5 |        |
-	; | A7 B7 |        |
-
-	; -- Odd part
-
-	packssdw  mm2,mm4		; mm2=(A1 A3 B1 B3)
-	packssdw  mm5,mm3		; mm5=(A5 A7 B5 B7)
-	pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-	paddd     mm2,mm5		; mm2=tmp0[row0 row1]
-
-	; -- Even part
-
-	pslld     mm1,(CONST_BITS+2)	; mm1=tmp10[row0 row1]
-
-	; -- Final output stage
-
-	movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]	; mm0=[PD_DESCALE_P2_2]
-
-	movq      mm6,mm1
-	paddd     mm1,mm2		; mm1=data0[row0 row1]=(C0 C1)
-	psubd     mm6,mm2		; mm6=data1[row0 row1]=(D0 D1)
-
-	paddd     mm1,mm0
-	paddd     mm6,mm0
-	psrad     mm1,DESCALE_P2_2
-	psrad     mm6,DESCALE_P2_2
-
-	movq      mm7,mm1		; transpose coefficients
-	punpckldq mm1,mm6		; mm1=(C0 D0)
-	punpckhdq mm7,mm6		; mm7=(C1 D1)
-
-	packssdw  mm1,mm7		; mm1=(C0 D0 C1 D1)
-	packsswb  mm1,mm1		; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
-	paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-	movd	ecx,mm1			; ecx=(C0 D0 C1 D1)
-	movd	ebx,mm1			; ebx=(C0 D0 C1 D1); last GOTOFF use is above
-	shr	ecx,2*BYTE_BIT		; ecx=(C1 D1 -- --)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx	; output_buf[0] <- (C0 D0)
-	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx	; output_buf[1] <- (C1 D1)
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2flt-64.asm b/simd/jiss2flt-64.asm
deleted file mode 100644
index f092599..0000000
--- a/simd/jiss2flt-64.asm
+++ /dev/null
@@ -1,483 +0,0 @@
-;
-; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-	shufps	%1,%2,0x44	; imm 01_00_01_00b: %1[0],%1[1],%2[0],%2[1]
-%endmacro
-
-%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-	shufps	%1,%2,0xEE	; imm 11_10_11_10b: %1[2],%1[3],%2[2],%2[3]
-%endmacro
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_float_sse2) PRIVATE
-
-EXTN(jconst_idct_float_sse2):	; constants for the SSE/SSE2 floating-point IDCT
-
-PD_1_414	times 4 dd  1.414213562373095048801689	; 4 floats; mulps operand
-PD_1_847	times 4 dd  1.847759065022573512256366	; 4 floats; scales z5
-PD_1_082	times 4 dd  1.082392200292393968799446	; 4 floats; scales z12
-PD_M2_613	times 4 dd -2.613125929752753055713286	; 4 floats; scales z10
-PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; 16 bytes, one per packed sample
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void * dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp	rbp+0
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-					; FAST_FLOAT workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_float_sse2) PRIVATE
-
-EXTN(jsimd_idct_float_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [workspace]
-	collect_args
-	push	rbx
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-	mov	rdx, r10	; quantptr
-	mov	rsi, r11		; inptr
-	lea	rdi, [workspace]			; FAST_FLOAT * wsptr
-	mov	rcx, DCTSIZE/4				; ctr
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1,xmm2
-	por	xmm3,xmm4
-	por	xmm5,xmm6
-	por	xmm1,xmm3
-	por	xmm5,xmm7
-	por	xmm1,xmm5
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	rax,rax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
-	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
-
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm1,xmm0
-	movaps	xmm2,xmm0
-	movaps	xmm3,xmm0
-
-	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
-	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
-	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
-	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
-	jmp	near .nextcolumn
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
-	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
-	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
-	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
-	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
-
-	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
-	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
-	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
-	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
-	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
-	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
-
-	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[rel PD_1_414]
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
-	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
-	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
-	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
-	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
-	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
-
-	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
-	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
-	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
-	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
-	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
-	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
-
-	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4
-	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
-	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
-	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
-	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
-	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
-	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
-	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
-	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
-	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
-	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
-	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
-
-	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
-	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm0,xmm7
-	movaps	xmm3,xmm5
-	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
-	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
-	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
-	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
-
-	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
-	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
-	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
-	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
-	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
-	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
-
-	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
-	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
-	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
-	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
-	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-
-	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
-	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
-	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
-	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
-	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
-	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-	add	rsi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
-	add	rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
-	dec	rcx					; ctr
-	jnz	near .columnloop
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	rax, [original_rbp]
-	lea	rsi, [workspace]			; FAST_FLOAT * wsptr
-	mov	rdi, r12	; (JSAMPROW *)
-	mov	rax, r13
-	mov	rcx, DCTSIZE/4				; ctr
-.rowloop:
-
-	; -- Even part
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[rel PD_1_414]
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4
-	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
-	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
-	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
-	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
-	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps	xmm1,[rel PD_RNDINT_MAGIC]	; xmm1=[rel PD_RNDINT_MAGIC]
-	pcmpeqd	xmm3,xmm3
-	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
-	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
-	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
-	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
-	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
-	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
-	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
-	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
-	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
-	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
-
-	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm7,xmm1
-	movaps	xmm5,xmm3
-	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
-	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
-	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
-	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
-
-	movaps	xmm2,[rel PD_RNDINT_MAGIC]	; xmm2=[rel PD_RNDINT_MAGIC]
-	pcmpeqd	xmm4,xmm4
-	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
-	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
-	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
-	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
-	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
-	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
-	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
-	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
-	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
-	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
-
-	movdqa    xmm2,[rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]
-
-	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
-	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
-	paddb     xmm6,xmm2
-	paddb     xmm1,xmm2
-
-	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
-	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
-	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
-	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
-	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-	mov	rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
-	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-	mov	rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
-	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
-
-	add	rsi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
-	add	rdi, byte 4*SIZEOF_JSAMPROW
-	dec	rcx				; ctr
-	jnz	near .rowloop
-
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2flt.asm b/simd/jiss2flt.asm
deleted file mode 100644
index 6eebe88..0000000
--- a/simd/jiss2flt.asm
+++ /dev/null
@@ -1,498 +0,0 @@
-;
-; jiss2flt.asm - floating-point IDCT (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-	shufps	%1,%2,0x44	; selector 0x44: dwords 0,1 of %1, then dwords 0,1 of %2
-%endmacro
-
-%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-	shufps	%1,%2,0xEE	; selector 0xEE: dwords 2,3 of %1, then dwords 2,3 of %2
-%endmacro
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_float_sse2) PRIVATE
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414	times 4 dd  1.414213562373095048801689	; sqrt(2), 4 packed floats
-PD_1_847	times 4 dd  1.847759065022573512256366	; 2*cos(pi/8)
-PD_1_082	times 4 dd  1.082392200292393968799446	; 2*(cos(pi/8)-cos(3*pi/8))
-PD_M2_613	times 4 dd -2.613125929752753055713286	; -2*(cos(pi/8)+cos(3*pi/8))
-PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3); added in pass 2 to round to integer
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; CENTERJSAMPLE in each of 16 byte lanes
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-; Pass 1 transforms columns into a stack workspace; pass 2 transforms rows.
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-; Arguments are read from the stack through the (b)+N offsets defined below.
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-					; FAST_FLOAT workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_float_sse2) PRIVATE
-
-EXTN(jsimd_idct_float_sse2):
-	push	ebp				; save caller's ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4			; room for the saved frame pointer
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax			; [original_ebp] = unaligned frame pointer
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]		; esp = &workspace[0]; wk[] and workspace[] sit above it
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]	; (not needed: the loads below use eax as set in the prologue)
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; FAST_FLOAT * wsptr
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]	; cheap pre-test (presumably first dword of row 1)
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]	; ... OR'ed with the same dword of row 2
-	jnz	near .columnDCT		; something nonzero -> full IDCT
-
-	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]	; load rows 1..7, 64 bits each
-	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	xmm1,xmm2		; OR the seven rows together
-	por	xmm3,xmm4
-	por	xmm5,xmm6
-	por	xmm1,xmm3
-	por	xmm5,xmm7
-	por	xmm1,xmm5		; xmm1=OR of rows 1..7
-	packsswb xmm1,xmm1		; words -> bytes; nonzero words stay nonzero
-	movd	eax,xmm1		; eax = the 4 low packed bytes
-	test	eax,eax
-	jnz	short .columnDCT	; some AC term is nonzero
-
-	; -- AC terms all zero
-
-	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
-	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
-
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize in0
-
-	movaps	xmm1,xmm0
-	movaps	xmm2,xmm0
-	movaps	xmm3,xmm0
-
-	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
-	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
-	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
-	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0	; broadcast DC values go to the workspace
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
-	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
-	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
-	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
-	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
-
-	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
-	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
-	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
-	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
-	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
-	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
-
-	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize in0
-	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize in2
-	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize in4
-	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize in6
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3		; xmm1=in2-in6
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]	; xmm1=(in2-in6)*1.414213562
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
-	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
-	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
-	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
-	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
-	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
-
-	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
-	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
-	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
-	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
-	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
-	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
-
-	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize in1
-	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize in3
-	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize in5
-	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize in7
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5		; xmm2=z11-z13
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4		; xmm0=z10+z12
-	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
-	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
-	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
-	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
-	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
-	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
-	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
-	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
-	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
-	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
-	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
-
-	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
-	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm0,xmm7
-	movaps	xmm3,xmm5
-	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
-	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
-	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
-	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
-
-	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
-	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
-	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
-	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
-	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
-	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
-
-	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
-	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
-	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
-	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
-	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
-	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
-	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
-	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
-	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
-	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
-	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
-	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop			; next group of 4 columns
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]			; eax = unaligned frame pointer saved in the prologue
-	lea	esi, [workspace]			; FAST_FLOAT * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.rowloop:
-
-	; -- Even part
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3		; xmm1=(input 2)-(input 6)
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]	; xmm1=((input 2)-(input 6))*1.414213562
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5		; xmm2=z11-z13
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4		; xmm0=z10+z12
-	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
-	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
-	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
-	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
-	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps	xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm1=[PD_RNDINT_MAGIC]
-	pcmpeqd	xmm3,xmm3		; xmm3=all ones
-	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
-	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
-	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
-	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
-	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
-	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
-	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
-	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
-	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
-	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
-
-	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm7,xmm1
-	movaps	xmm5,xmm3
-	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
-	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
-	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
-	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
-
-	movaps	xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm2=[PD_RNDINT_MAGIC]
-	pcmpeqd	xmm4,xmm4		; xmm4=all ones
-	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
-	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
-	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
-	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
-	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
-	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
-	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
-	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
-	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
-	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
-
-	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
-
-	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
-	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
-	paddb     xmm6,xmm2	; add CENTERJSAMPLE to each byte
-	paddb     xmm1,xmm2	; add CENTERJSAMPLE to each byte
-
-	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
-	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
-	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
-	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
-	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
-	pushpic	ebx			; save GOT address
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; edx = row pointer 0 of this group
-	mov	ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]	; ebx = row pointer 2 (ebx is reused as scratch here)
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6	; row 0 <- low 8 bytes (00..07), at output_col
-	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7	; row 2 <- low 8 bytes (20..27)
-	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; edx = row pointer 1
-	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]	; ebx = row pointer 3
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5	; row 1 <- low 8 bytes (10..17)
-	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3	; row 3 <- low 8 bytes (30..37)
-
-	poppic	ebx			; restore GOT address
-
-	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
-	add	edi, byte 4*SIZEOF_JSAMPROW
-	dec	ecx				; ctr
-	jnz	near .rowloop			; next group of 4 rows
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2fst-64.asm b/simd/jiss2fst-64.asm
deleted file mode 100644
index 2b4e4b5..0000000
--- a/simd/jiss2fst-64.asm
+++ /dev/null
@@ -1,492 +0,0 @@
-;
-; jiss2fst-64.asm - fast integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-%define PASS1_BITS	2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082	equ	277		; FIX(1.082392200)
-F_1_414	equ	362		; FIX(1.414213562)
-F_1_847	equ	473		; FIX(1.847759065)
-F_2_613	equ	669		; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time float arithmetic, so each constant is given pre-scaled by 2^30 and rounded down to CONST_BITS.
-%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
-F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_idct_ifast_sse2) PRIVATE
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414	times 8 dw  F_1_414 << CONST_SHIFT	; FIX(1.414213562), 8 packed words
-PW_F1847	times 8 dw  F_1_847 << CONST_SHIFT	; FIX(1.847759065)
-PW_MF1613	times 8 dw -F_1_613 << CONST_SHIFT	; -(FIX(2.613125930) - FIX(1))
-PW_F1082	times 8 dw  F_1_082 << CONST_SHIFT	; FIX(1.082392200)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; CENTERJSAMPLE in each of 16 byte lanes
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void * dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp	rbp+0
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_idct_ifast_sse2) PRIVATE
-
-EXTN(jsimd_idct_ifast_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process columns from input.
-
-	mov	rdx, r10	; quantptr
-	mov	rsi, r11		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1,xmm0
-	packsswb xmm1,xmm1
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	rax,rax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa    xmm7,xmm0		; xmm0=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm7,xmm7		; xmm7=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm6,xmm0,0x00		; xmm6=col0=(00 00 00 00 00 00 00 00)
-	pshufd	xmm2,xmm0,0x55		; xmm2=col1=(01 01 01 01 01 01 01 01)
-	pshufd	xmm5,xmm0,0xAA		; xmm5=col2=(02 02 02 02 02 02 02 02)
-	pshufd	xmm0,xmm0,0xFF		; xmm0=col3=(03 03 03 03 03 03 03 03)
-	pshufd	xmm1,xmm7,0x00		; xmm1=col4=(04 04 04 04 04 04 04 04)
-	pshufd	xmm4,xmm7,0x55		; xmm4=col5=(05 05 05 05 05 05 05 05)
-	pshufd	xmm3,xmm7,0xAA		; xmm3=col6=(06 06 06 06 06 06 06 06)
-	pshufd	xmm7,xmm7,0xFF		; xmm7=col7=(07 07 07 07 07 07 07 07)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=col1
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=col3
-	jmp	near .column_end
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movdqa	xmm4,xmm0
-	movdqa	xmm5,xmm1
-	psubw	xmm0,xmm2		; xmm0=tmp11
-	psubw	xmm1,xmm3
-	paddw	xmm4,xmm2		; xmm4=tmp10
-	paddw	xmm5,xmm3		; xmm5=tmp13
-
-	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm1,[rel PW_F1414]
-	psubw	xmm1,xmm5		; xmm1=tmp12
-
-	movdqa	xmm6,xmm4
-	movdqa	xmm7,xmm0
-	psubw	xmm4,xmm5		; xmm4=tmp3
-	psubw	xmm0,xmm1		; xmm0=tmp2
-	paddw	xmm6,xmm5		; xmm6=tmp0
-	paddw	xmm7,xmm1		; xmm7=tmp1
-
-	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=tmp3
-	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=tmp2
-
-	; -- Odd part
-
-	movdqa	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movdqa	xmm4,xmm2
-	movdqa	xmm0,xmm5
-	psubw	xmm2,xmm1		; xmm2=z12
-	psubw	xmm5,xmm3		; xmm5=z10
-	paddw	xmm4,xmm1		; xmm4=z11
-	paddw	xmm0,xmm3		; xmm0=z13
-
-	movdqa	xmm1,xmm5		; xmm1=z10(unscaled)
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-
-	movdqa	xmm3,xmm4
-	psubw	xmm4,xmm0
-	paddw	xmm3,xmm0		; xmm3=tmp7
-
-	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm4,[rel PW_F1414]	; xmm4=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movdqa	xmm0,xmm5
-	paddw	xmm5,xmm2
-	pmulhw	xmm5,[rel PW_F1847]	; xmm5=z5
-	pmulhw	xmm0,[rel PW_MF1613]
-	pmulhw	xmm2,[rel PW_F1082]
-	psubw	xmm0,xmm1
-	psubw	xmm2,xmm5		; xmm2=tmp10
-	paddw	xmm0,xmm5		; xmm0=tmp12
-
-	; -- Final output stage
-
-	psubw	xmm0,xmm3		; xmm0=tmp6
-	movdqa	xmm1,xmm6
-	movdqa	xmm5,xmm7
-	paddw	xmm6,xmm3		; xmm6=data0=(00 01 02 03 04 05 06 07)
-	paddw	xmm7,xmm0		; xmm7=data1=(10 11 12 13 14 15 16 17)
-	psubw	xmm1,xmm3		; xmm1=data7=(70 71 72 73 74 75 76 77)
-	psubw	xmm5,xmm0		; xmm5=data6=(60 61 62 63 64 65 66 67)
-	psubw	xmm4,xmm0		; xmm4=tmp5
-
-	movdqa    xmm3,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm3,xmm7		; xmm3=(04 14 05 15 06 16 07 17)
-	movdqa    xmm0,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm1		; xmm5=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm0,xmm1		; xmm0=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
-	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(60 70 61 71 62 72 63 73)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(64 74 65 75 66 76 67 77)
-
-	paddw	xmm2,xmm4		; xmm2=tmp4
-	movdqa	xmm5,xmm7
-	movdqa	xmm0,xmm1
-	paddw	xmm7,xmm4		; xmm7=data2=(20 21 22 23 24 25 26 27)
-	paddw	xmm1,xmm2		; xmm1=data4=(40 41 42 43 44 45 46 47)
-	psubw	xmm5,xmm4		; xmm5=data5=(50 51 52 53 54 55 56 57)
-	psubw	xmm0,xmm2		; xmm0=data3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm0		; xmm7=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm4,xmm0		; xmm4=(24 34 25 35 26 36 27 37)
-	movdqa    xmm2,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm5		; xmm1=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm5		; xmm2=(44 54 45 55 46 56 47 57)
-
-	movdqa    xmm0,xmm3		; transpose coefficients(phase 2)
-	punpckldq xmm3,xmm4		; xmm3=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm0,xmm4		; xmm0=(06 16 26 36 07 17 27 37)
-	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm7		; xmm6=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm5,xmm7		; xmm5=(02 12 22 32 03 13 23 33)
-
-	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(60 70 61 71 62 72 63 73)
-	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=(64 74 65 75 66 76 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm3	; wk(0)=(04 14 24 34 05 15 25 35)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(06 16 26 36 07 17 27 37)
-
-	movdqa    xmm3,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm4		; xmm1=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm3,xmm4		; xmm3=(42 52 62 72 43 53 63 73)
-	movdqa    xmm0,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm7		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm0,xmm7		; xmm0=(46 56 66 76 47 57 67 77)
-
-	movdqa     xmm4,xmm6		; transpose coefficients(phase 3)
-	punpcklqdq xmm6,xmm1		; xmm6=col0=(00 10 20 30 40 50 60 70)
-	punpckhqdq xmm4,xmm1		; xmm4=col1=(01 11 21 31 41 51 61 71)
-	movdqa     xmm7,xmm5		; transpose coefficients(phase 3)
-	punpcklqdq xmm5,xmm3		; xmm5=col2=(02 12 22 32 42 52 62 72)
-	punpckhqdq xmm7,xmm3		; xmm7=col3=(03 13 23 33 43 53 63 73)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(04 14 24 34 05 15 25 35)
-	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(06 16 26 36 07 17 27 37)
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=col1
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=col3
-
-	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm2		; xmm1=col4=(04 14 24 34 44 54 64 74)
-	punpckhqdq xmm4,xmm2		; xmm4=col5=(05 15 25 35 45 55 65 75)
-	movdqa     xmm7,xmm3		; transpose coefficients(phase 3)
-	punpcklqdq xmm3,xmm0		; xmm3=col6=(06 16 26 36 46 56 66 76)
-	punpckhqdq xmm7,xmm0		; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	rax, [original_rbp]
-	mov	rdi, r12	; (JSAMPROW *)
-	mov	rax, r13
-
-	; -- Even part
-
-	; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
-	movdqa	xmm2,xmm6
-	movdqa	xmm0,xmm5
-	psubw	xmm6,xmm1		; xmm6=tmp11
-	psubw	xmm5,xmm3
-	paddw	xmm2,xmm1		; xmm2=tmp10
-	paddw	xmm0,xmm3		; xmm0=tmp13
-
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[rel PW_F1414]
-	psubw	xmm5,xmm0		; xmm5=tmp12
-
-	movdqa	xmm1,xmm2
-	movdqa	xmm3,xmm6
-	psubw	xmm2,xmm0		; xmm2=tmp3
-	psubw	xmm6,xmm5		; xmm6=tmp2
-	paddw	xmm1,xmm0		; xmm1=tmp0
-	paddw	xmm3,xmm5		; xmm3=tmp1
-
-	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=col1
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=col3
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp3
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp2
-
-	; -- Odd part
-
-	; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
-	movdqa	xmm2,xmm0
-	movdqa	xmm6,xmm4
-	psubw	xmm0,xmm7		; xmm0=z12
-	psubw	xmm4,xmm5		; xmm4=z10
-	paddw	xmm2,xmm7		; xmm2=z11
-	paddw	xmm6,xmm5		; xmm6=z13
-
-	movdqa	xmm7,xmm4		; xmm7=z10(unscaled)
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
-
-	movdqa	xmm5,xmm2
-	psubw	xmm2,xmm6
-	paddw	xmm5,xmm6		; xmm5=tmp7
-
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm2,[rel PW_F1414]	; xmm2=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movdqa	xmm6,xmm4
-	paddw	xmm4,xmm0
-	pmulhw	xmm4,[rel PW_F1847]	; xmm4=z5
-	pmulhw	xmm6,[rel PW_MF1613]
-	pmulhw	xmm0,[rel PW_F1082]
-	psubw	xmm6,xmm7
-	psubw	xmm0,xmm4		; xmm0=tmp10
-	paddw	xmm6,xmm4		; xmm6=tmp12
-
-	; -- Final output stage
-
-	psubw	xmm6,xmm5		; xmm6=tmp6
-	movdqa	xmm7,xmm1
-	movdqa	xmm4,xmm3
-	paddw	xmm1,xmm5		; xmm1=data0=(00 10 20 30 40 50 60 70)
-	paddw	xmm3,xmm6		; xmm3=data1=(01 11 21 31 41 51 61 71)
-	psraw	xmm1,(PASS1_BITS+3)	; descale
-	psraw	xmm3,(PASS1_BITS+3)	; descale
-	psubw	xmm7,xmm5		; xmm7=data7=(07 17 27 37 47 57 67 77)
-	psubw	xmm4,xmm6		; xmm4=data6=(06 16 26 36 46 56 66 76)
-	psraw	xmm7,(PASS1_BITS+3)	; descale
-	psraw	xmm4,(PASS1_BITS+3)	; descale
-	psubw	xmm2,xmm6		; xmm2=tmp5
-
-	packsswb  xmm1,xmm4	; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	packsswb  xmm3,xmm7	; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp2
-	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=tmp3
-
-	paddw	xmm0,xmm2		; xmm0=tmp4
-	movdqa	xmm4,xmm5
-	movdqa	xmm7,xmm6
-	paddw	xmm5,xmm2		; xmm5=data2=(02 12 22 32 42 52 62 72)
-	paddw	xmm6,xmm0		; xmm6=data4=(04 14 24 34 44 54 64 74)
-	psraw	xmm5,(PASS1_BITS+3)	; descale
-	psraw	xmm6,(PASS1_BITS+3)	; descale
-	psubw	xmm4,xmm2		; xmm4=data5=(05 15 25 35 45 55 65 75)
-	psubw	xmm7,xmm0		; xmm7=data3=(03 13 23 33 43 53 63 73)
-	psraw	xmm4,(PASS1_BITS+3)	; descale
-	psraw	xmm7,(PASS1_BITS+3)	; descale
-
-	movdqa    xmm2,[rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]
-
-	packsswb  xmm5,xmm6	; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-	packsswb  xmm7,xmm4	; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-	paddb     xmm1,xmm2
-	paddb     xmm3,xmm2
-	paddb     xmm5,xmm2
-	paddb     xmm7,xmm2
-
-	movdqa    xmm0,xmm1	; transpose coefficients(phase 1)
-	punpcklbw xmm1,xmm3	; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-	punpckhbw xmm0,xmm3	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-	movdqa    xmm6,xmm5	; transpose coefficients(phase 1)
-	punpcklbw xmm5,xmm7	; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-	punpckhbw xmm6,xmm7	; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-	movdqa    xmm4,xmm1	; transpose coefficients(phase 2)
-	punpcklwd xmm1,xmm5	; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm5	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-	movdqa    xmm2,xmm6	; transpose coefficients(phase 2)
-	punpcklwd xmm6,xmm0	; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-	punpckhwd xmm2,xmm0	; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-	movdqa    xmm3,xmm1	; transpose coefficients(phase 3)
-	punpckldq xmm1,xmm6	; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm3,xmm6	; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-	movdqa    xmm7,xmm4	; transpose coefficients(phase 3)
-	punpckldq xmm4,xmm2	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-	punpckhdq xmm7,xmm2	; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-	pshufd	xmm5,xmm1,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm0,xmm3,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-	pshufd	xmm6,xmm4,0x4E	; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-	pshufd	xmm2,xmm7,0x4E	; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-	mov	rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
-
-	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-	mov	rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2fst.asm b/simd/jiss2fst.asm
deleted file mode 100644
index 84b54b9..0000000
--- a/simd/jiss2fst.asm
+++ /dev/null
@@ -1,502 +0,0 @@
-;
-; jiss2fst.asm - fast integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-%define PASS1_BITS	2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082	equ	277		; FIX(1.082392200)
-F_1_414	equ	362		; FIX(1.414213562)
-F_1_847	equ	473		; FIX(1.847759065)
-F_2_613	equ	669		; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
-F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_idct_ifast_sse2) PRIVATE
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414	times 8 dw  F_1_414 << CONST_SHIFT
-PW_F1847	times 8 dw  F_1_847 << CONST_SHIFT
-PW_MF1613	times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082	times 8 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_idct_ifast_sse2) PRIVATE
-
-EXTN(jsimd_idct_ifast_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	xmm1,xmm0
-	packsswb xmm1,xmm1
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa    xmm7,xmm0		; xmm0=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm7,xmm7		; xmm7=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm6,xmm0,0x00		; xmm6=col0=(00 00 00 00 00 00 00 00)
-	pshufd	xmm2,xmm0,0x55		; xmm2=col1=(01 01 01 01 01 01 01 01)
-	pshufd	xmm5,xmm0,0xAA		; xmm5=col2=(02 02 02 02 02 02 02 02)
-	pshufd	xmm0,xmm0,0xFF		; xmm0=col3=(03 03 03 03 03 03 03 03)
-	pshufd	xmm1,xmm7,0x00		; xmm1=col4=(04 04 04 04 04 04 04 04)
-	pshufd	xmm4,xmm7,0x55		; xmm4=col5=(05 05 05 05 05 05 05 05)
-	pshufd	xmm3,xmm7,0xAA		; xmm3=col6=(06 06 06 06 06 06 06 06)
-	pshufd	xmm7,xmm7,0xFF		; xmm7=col7=(07 07 07 07 07 07 07 07)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=col1
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=col3
-	jmp	near .column_end
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movdqa	xmm4,xmm0
-	movdqa	xmm5,xmm1
-	psubw	xmm0,xmm2		; xmm0=tmp11
-	psubw	xmm1,xmm3
-	paddw	xmm4,xmm2		; xmm4=tmp10
-	paddw	xmm5,xmm3		; xmm5=tmp13
-
-	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm1,[GOTOFF(ebx,PW_F1414)]
-	psubw	xmm1,xmm5		; xmm1=tmp12
-
-	movdqa	xmm6,xmm4
-	movdqa	xmm7,xmm0
-	psubw	xmm4,xmm5		; xmm4=tmp3
-	psubw	xmm0,xmm1		; xmm0=tmp2
-	paddw	xmm6,xmm5		; xmm6=tmp0
-	paddw	xmm7,xmm1		; xmm7=tmp1
-
-	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=tmp3
-	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=tmp2
-
-	; -- Odd part
-
-	movdqa	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movdqa	xmm4,xmm2
-	movdqa	xmm0,xmm5
-	psubw	xmm2,xmm1		; xmm2=z12
-	psubw	xmm5,xmm3		; xmm5=z10
-	paddw	xmm4,xmm1		; xmm4=z11
-	paddw	xmm0,xmm3		; xmm0=z13
-
-	movdqa	xmm1,xmm5		; xmm1=z10(unscaled)
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-
-	movdqa	xmm3,xmm4
-	psubw	xmm4,xmm0
-	paddw	xmm3,xmm0		; xmm3=tmp7
-
-	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm4,[GOTOFF(ebx,PW_F1414)]	; xmm4=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movdqa	xmm0,xmm5
-	paddw	xmm5,xmm2
-	pmulhw	xmm5,[GOTOFF(ebx,PW_F1847)]	; xmm5=z5
-	pmulhw	xmm0,[GOTOFF(ebx,PW_MF1613)]
-	pmulhw	xmm2,[GOTOFF(ebx,PW_F1082)]
-	psubw	xmm0,xmm1
-	psubw	xmm2,xmm5		; xmm2=tmp10
-	paddw	xmm0,xmm5		; xmm0=tmp12
-
-	; -- Final output stage
-
-	psubw	xmm0,xmm3		; xmm0=tmp6
-	movdqa	xmm1,xmm6
-	movdqa	xmm5,xmm7
-	paddw	xmm6,xmm3		; xmm6=data0=(00 01 02 03 04 05 06 07)
-	paddw	xmm7,xmm0		; xmm7=data1=(10 11 12 13 14 15 16 17)
-	psubw	xmm1,xmm3		; xmm1=data7=(70 71 72 73 74 75 76 77)
-	psubw	xmm5,xmm0		; xmm5=data6=(60 61 62 63 64 65 66 67)
-	psubw	xmm4,xmm0		; xmm4=tmp5
-
-	movdqa    xmm3,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm3,xmm7		; xmm3=(04 14 05 15 06 16 07 17)
-	movdqa    xmm0,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm1		; xmm5=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm0,xmm1		; xmm0=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
-	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(60 70 61 71 62 72 63 73)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(64 74 65 75 66 76 67 77)
-
-	paddw	xmm2,xmm4		; xmm2=tmp4
-	movdqa	xmm5,xmm7
-	movdqa	xmm0,xmm1
-	paddw	xmm7,xmm4		; xmm7=data2=(20 21 22 23 24 25 26 27)
-	paddw	xmm1,xmm2		; xmm1=data4=(40 41 42 43 44 45 46 47)
-	psubw	xmm5,xmm4		; xmm5=data5=(50 51 52 53 54 55 56 57)
-	psubw	xmm0,xmm2		; xmm0=data3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm0		; xmm7=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm4,xmm0		; xmm4=(24 34 25 35 26 36 27 37)
-	movdqa    xmm2,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm5		; xmm1=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm5		; xmm2=(44 54 45 55 46 56 47 57)
-
-	movdqa    xmm0,xmm3		; transpose coefficients(phase 2)
-	punpckldq xmm3,xmm4		; xmm3=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm0,xmm4		; xmm0=(06 16 26 36 07 17 27 37)
-	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm7		; xmm6=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm5,xmm7		; xmm5=(02 12 22 32 03 13 23 33)
-
-	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(60 70 61 71 62 72 63 73)
-	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=(64 74 65 75 66 76 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm3	; wk(0)=(04 14 24 34 05 15 25 35)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(06 16 26 36 07 17 27 37)
-
-	movdqa    xmm3,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm4		; xmm1=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm3,xmm4		; xmm3=(42 52 62 72 43 53 63 73)
-	movdqa    xmm0,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm7		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm0,xmm7		; xmm0=(46 56 66 76 47 57 67 77)
-
-	movdqa     xmm4,xmm6		; transpose coefficients(phase 3)
-	punpcklqdq xmm6,xmm1		; xmm6=col0=(00 10 20 30 40 50 60 70)
-	punpckhqdq xmm4,xmm1		; xmm4=col1=(01 11 21 31 41 51 61 71)
-	movdqa     xmm7,xmm5		; transpose coefficients(phase 3)
-	punpcklqdq xmm5,xmm3		; xmm5=col2=(02 12 22 32 42 52 62 72)
-	punpckhqdq xmm7,xmm3		; xmm7=col3=(03 13 23 33 43 53 63 73)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(04 14 24 34 05 15 25 35)
-	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(06 16 26 36 07 17 27 37)
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=col1
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=col3
-
-	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm2		; xmm1=col4=(04 14 24 34 44 54 64 74)
-	punpckhqdq xmm4,xmm2		; xmm4=col5=(05 15 25 35 45 55 65 75)
-	movdqa     xmm7,xmm3		; transpose coefficients(phase 3)
-	punpcklqdq xmm3,xmm0		; xmm3=col6=(06 16 26 36 46 56 66 76)
-	punpckhqdq xmm7,xmm0		; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-
-	; -- Even part
-
-	; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
-	movdqa	xmm2,xmm6
-	movdqa	xmm0,xmm5
-	psubw	xmm6,xmm1		; xmm6=tmp11
-	psubw	xmm5,xmm3
-	paddw	xmm2,xmm1		; xmm2=tmp10
-	paddw	xmm0,xmm3		; xmm0=tmp13
-
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[GOTOFF(ebx,PW_F1414)]
-	psubw	xmm5,xmm0		; xmm5=tmp12
-
-	movdqa	xmm1,xmm2
-	movdqa	xmm3,xmm6
-	psubw	xmm2,xmm0		; xmm2=tmp3
-	psubw	xmm6,xmm5		; xmm6=tmp2
-	paddw	xmm1,xmm0		; xmm1=tmp0
-	paddw	xmm3,xmm5		; xmm3=tmp1
-
-	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=col1
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=col3
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp3
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp2
-
-	; -- Odd part
-
-	; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
-	movdqa	xmm2,xmm0
-	movdqa	xmm6,xmm4
-	psubw	xmm0,xmm7		; xmm0=z12
-	psubw	xmm4,xmm5		; xmm4=z10
-	paddw	xmm2,xmm7		; xmm2=z11
-	paddw	xmm6,xmm5		; xmm6=z13
-
-	movdqa	xmm7,xmm4		; xmm7=z10(unscaled)
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
-
-	movdqa	xmm5,xmm2
-	psubw	xmm2,xmm6
-	paddw	xmm5,xmm6		; xmm5=tmp7
-
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm2,[GOTOFF(ebx,PW_F1414)]	; xmm2=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movdqa	xmm6,xmm4
-	paddw	xmm4,xmm0
-	pmulhw	xmm4,[GOTOFF(ebx,PW_F1847)]	; xmm4=z5
-	pmulhw	xmm6,[GOTOFF(ebx,PW_MF1613)]
-	pmulhw	xmm0,[GOTOFF(ebx,PW_F1082)]
-	psubw	xmm6,xmm7
-	psubw	xmm0,xmm4		; xmm0=tmp10
-	paddw	xmm6,xmm4		; xmm6=tmp12
-
-	; -- Final output stage
-
-	psubw	xmm6,xmm5		; xmm6=tmp6
-	movdqa	xmm7,xmm1
-	movdqa	xmm4,xmm3
-	paddw	xmm1,xmm5		; xmm1=data0=(00 10 20 30 40 50 60 70)
-	paddw	xmm3,xmm6		; xmm3=data1=(01 11 21 31 41 51 61 71)
-	psraw	xmm1,(PASS1_BITS+3)	; descale
-	psraw	xmm3,(PASS1_BITS+3)	; descale
-	psubw	xmm7,xmm5		; xmm7=data7=(07 17 27 37 47 57 67 77)
-	psubw	xmm4,xmm6		; xmm4=data6=(06 16 26 36 46 56 66 76)
-	psraw	xmm7,(PASS1_BITS+3)	; descale
-	psraw	xmm4,(PASS1_BITS+3)	; descale
-	psubw	xmm2,xmm6		; xmm2=tmp5
-
-	packsswb  xmm1,xmm4	; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	packsswb  xmm3,xmm7	; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp2
-	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=tmp3
-
-	paddw	xmm0,xmm2		; xmm0=tmp4
-	movdqa	xmm4,xmm5
-	movdqa	xmm7,xmm6
-	paddw	xmm5,xmm2		; xmm5=data2=(02 12 22 32 42 52 62 72)
-	paddw	xmm6,xmm0		; xmm6=data4=(04 14 24 34 44 54 64 74)
-	psraw	xmm5,(PASS1_BITS+3)	; descale
-	psraw	xmm6,(PASS1_BITS+3)	; descale
-	psubw	xmm4,xmm2		; xmm4=data5=(05 15 25 35 45 55 65 75)
-	psubw	xmm7,xmm0		; xmm7=data3=(03 13 23 33 43 53 63 73)
-	psraw	xmm4,(PASS1_BITS+3)	; descale
-	psraw	xmm7,(PASS1_BITS+3)	; descale
-
-	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
-
-	packsswb  xmm5,xmm6	; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-	packsswb  xmm7,xmm4	; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-	paddb     xmm1,xmm2
-	paddb     xmm3,xmm2
-	paddb     xmm5,xmm2
-	paddb     xmm7,xmm2
-
-	movdqa    xmm0,xmm1	; transpose coefficients(phase 1)
-	punpcklbw xmm1,xmm3	; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-	punpckhbw xmm0,xmm3	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-	movdqa    xmm6,xmm5	; transpose coefficients(phase 1)
-	punpcklbw xmm5,xmm7	; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-	punpckhbw xmm6,xmm7	; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-	movdqa    xmm4,xmm1	; transpose coefficients(phase 2)
-	punpcklwd xmm1,xmm5	; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm5	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-	movdqa    xmm2,xmm6	; transpose coefficients(phase 2)
-	punpcklwd xmm6,xmm0	; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-	punpckhwd xmm2,xmm0	; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-	movdqa    xmm3,xmm1	; transpose coefficients(phase 3)
-	punpckldq xmm1,xmm6	; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm3,xmm6	; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-	movdqa    xmm7,xmm4	; transpose coefficients(phase 3)
-	punpckldq xmm4,xmm2	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-	punpckhdq xmm7,xmm2	; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-	pshufd	xmm5,xmm1,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm0,xmm3,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-	pshufd	xmm6,xmm4,0x4E	; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-	pshufd	xmm2,xmm7,0x4E	; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
-
-	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
-	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2int-64.asm b/simd/jiss2int-64.asm
deleted file mode 100644
index 5ebfae8..0000000
--- a/simd/jiss2int-64.asm
+++ /dev/null
@@ -1,848 +0,0 @@
-;
-; jiss2int-64.asm - accurate integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_islow_sse2) PRIVATE
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = jpeg_component_info * compptr
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp	rbp+0
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		12
-
-	align	16
-	global	EXTN(jsimd_idct_islow_sse2) PRIVATE
-
-EXTN(jsimd_idct_islow_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process columns from input.
-
-	mov	rdx, r10	; quantptr
-	mov	rsi, r11		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1,xmm0
-	packsswb xmm1,xmm1
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	rax,rax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	xmm5,PASS1_BITS
-
-	movdqa    xmm4,xmm5		; xmm5=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm5,xmm5		; xmm5=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm4,xmm4		; xmm4=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm7,xmm5,0x00		; xmm7=col0=(00 00 00 00 00 00 00 00)
-	pshufd	xmm6,xmm5,0x55		; xmm6=col1=(01 01 01 01 01 01 01 01)
-	pshufd	xmm1,xmm5,0xAA		; xmm1=col2=(02 02 02 02 02 02 02 02)
-	pshufd	xmm5,xmm5,0xFF		; xmm5=col3=(03 03 03 03 03 03 03 03)
-	pshufd	xmm0,xmm4,0x00		; xmm0=col4=(04 04 04 04 04 04 04 04)
-	pshufd	xmm3,xmm4,0x55		; xmm3=col5=(05 05 05 05 05 05 05 05)
-	pshufd	xmm2,xmm4,0xAA		; xmm2=col6=(06 06 06 06 06 06 06 06)
-	pshufd	xmm4,xmm4,0xFF		; xmm4=col7=(07 07 07 07 07 07 07 07)
-
-	movdqa	XMMWORD [wk(8)], xmm6	; wk(8)=col1
-	movdqa	XMMWORD [wk(9)], xmm5	; wk(9)=col3
-	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
-	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
-	jmp	near .column_end
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movdqa    xmm4,xmm1		; xmm1=in2=z2
-	movdqa    xmm5,xmm1
-	punpcklwd xmm4,xmm3		; xmm3=in6=z3
-	punpckhwd xmm5,xmm3
-	movdqa    xmm1,xmm4
-	movdqa    xmm3,xmm5
-	pmaddwd   xmm4,[rel PW_F130_F054]	; xmm4=tmp3L
-	pmaddwd   xmm5,[rel PW_F130_F054]	; xmm5=tmp3H
-	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=tmp2L
-	pmaddwd   xmm3,[rel PW_F054_MF130]	; xmm3=tmp2H
-
-	movdqa    xmm6,xmm0
-	paddw     xmm0,xmm2		; xmm0=in0+in4
-	psubw     xmm6,xmm2		; xmm6=in0-in4
-
-	pxor      xmm7,xmm7
-	pxor      xmm2,xmm2
-	punpcklwd xmm7,xmm0		; xmm7=tmp0L
-	punpckhwd xmm2,xmm0		; xmm2=tmp0H
-	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
-	psrad     xmm2,(16-CONST_BITS)	; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
-	movdqa	xmm0,xmm7
-	paddd	xmm7,xmm4		; xmm7=tmp10L
-	psubd	xmm0,xmm4		; xmm0=tmp13L
-	movdqa	xmm4,xmm2
-	paddd	xmm2,xmm5		; xmm2=tmp10H
-	psubd	xmm4,xmm5		; xmm4=tmp13H
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=tmp10L
-	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=tmp10H
-	movdqa	XMMWORD [wk(2)], xmm0	; wk(2)=tmp13L
-	movdqa	XMMWORD [wk(3)], xmm4	; wk(3)=tmp13H
-
-	pxor      xmm5,xmm5
-	pxor      xmm7,xmm7
-	punpcklwd xmm5,xmm6		; xmm5=tmp1L
-	punpckhwd xmm7,xmm6		; xmm7=tmp1H
-	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
-	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
-	movdqa	xmm2,xmm5
-	paddd	xmm5,xmm1		; xmm5=tmp11L
-	psubd	xmm2,xmm1		; xmm2=tmp12L
-	movdqa	xmm0,xmm7
-	paddd	xmm7,xmm3		; xmm7=tmp11H
-	psubd	xmm0,xmm3		; xmm0=tmp12H
-
-	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
-	movdqa	XMMWORD [wk(5)], xmm7	; wk(5)=tmp11H
-	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=tmp12L
-	movdqa	XMMWORD [wk(7)], xmm0	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movdqa	xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm7,xmm4
-	paddw	xmm5,xmm3		; xmm5=z3
-	paddw	xmm7,xmm1		; xmm7=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm2,xmm5
-	movdqa    xmm0,xmm5
-	punpcklwd xmm2,xmm7
-	punpckhwd xmm0,xmm7
-	movdqa    xmm5,xmm2
-	movdqa    xmm7,xmm0
-	pmaddwd   xmm2,[rel PW_MF078_F117]	; xmm2=z3L
-	pmaddwd   xmm0,[rel PW_MF078_F117]	; xmm0=z3H
-	pmaddwd   xmm5,[rel PW_F117_F078]	; xmm5=z4L
-	pmaddwd   xmm7,[rel PW_F117_F078]	; xmm7=z4H
-
-	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=z3L
-	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movdqa    xmm2,xmm3
-	movdqa    xmm0,xmm3
-	punpcklwd xmm2,xmm4
-	punpckhwd xmm0,xmm4
-	movdqa    xmm3,xmm2
-	movdqa    xmm4,xmm0
-	pmaddwd   xmm2,[rel PW_MF060_MF089]	; xmm2=tmp0L
-	pmaddwd   xmm0,[rel PW_MF060_MF089]	; xmm0=tmp0H
-	pmaddwd   xmm3,[rel PW_MF089_F060]	; xmm3=tmp3L
-	pmaddwd   xmm4,[rel PW_MF089_F060]	; xmm4=tmp3H
-
-	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp0L
-	paddd	xmm0, XMMWORD [wk(11)]	; xmm0=tmp0H
-	paddd	xmm3,xmm5		; xmm3=tmp3L
-	paddd	xmm4,xmm7		; xmm4=tmp3H
-
-	movdqa	XMMWORD [wk(8)], xmm2	; wk(8)=tmp0L
-	movdqa	XMMWORD [wk(9)], xmm0	; wk(9)=tmp0H
-
-	movdqa    xmm2,xmm1
-	movdqa    xmm0,xmm1
-	punpcklwd xmm2,xmm6
-	punpckhwd xmm0,xmm6
-	movdqa    xmm1,xmm2
-	movdqa    xmm6,xmm0
-	pmaddwd   xmm2,[rel PW_MF050_MF256]	; xmm2=tmp1L
-	pmaddwd   xmm0,[rel PW_MF050_MF256]	; xmm0=tmp1H
-	pmaddwd   xmm1,[rel PW_MF256_F050]	; xmm1=tmp2L
-	pmaddwd   xmm6,[rel PW_MF256_F050]	; xmm6=tmp2H
-
-	paddd	xmm2,xmm5		; xmm2=tmp1L
-	paddd	xmm0,xmm7		; xmm0=tmp1H
-	paddd	xmm1, XMMWORD [wk(10)]	; xmm1=tmp2L
-	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
-
-	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=tmp1L
-	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
-	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=tmp10H
-
-	movdqa	xmm2,xmm5
-	movdqa	xmm0,xmm7
-	paddd	xmm5,xmm3		; xmm5=data0L
-	paddd	xmm7,xmm4		; xmm7=data0H
-	psubd	xmm2,xmm3		; xmm2=data7L
-	psubd	xmm0,xmm4		; xmm0=data7H
-
-	movdqa	xmm3,[rel PD_DESCALE_P1]	; xmm3=[rel PD_DESCALE_P1]
-
-	paddd	xmm5,xmm3
-	paddd	xmm7,xmm3
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm7,DESCALE_P1
-	paddd	xmm2,xmm3
-	paddd	xmm0,xmm3
-	psrad	xmm2,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm5,xmm7		; xmm5=data0=(00 01 02 03 04 05 06 07)
-	packssdw  xmm2,xmm0		; xmm2=data7=(70 71 72 73 74 75 76 77)
-
-	movdqa	xmm4, XMMWORD [wk(4)]	; xmm4=tmp11L
-	movdqa	xmm3, XMMWORD [wk(5)]	; xmm3=tmp11H
-
-	movdqa	xmm7,xmm4
-	movdqa	xmm0,xmm3
-	paddd	xmm4,xmm1		; xmm4=data1L
-	paddd	xmm3,xmm6		; xmm3=data1H
-	psubd	xmm7,xmm1		; xmm7=data6L
-	psubd	xmm0,xmm6		; xmm0=data6H
-
-	movdqa	xmm1,[rel PD_DESCALE_P1]	; xmm1=[rel PD_DESCALE_P1]
-
-	paddd	xmm4,xmm1
-	paddd	xmm3,xmm1
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm3,DESCALE_P1
-	paddd	xmm7,xmm1
-	paddd	xmm0,xmm1
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm4,xmm3		; xmm4=data1=(10 11 12 13 14 15 16 17)
-	packssdw  xmm7,xmm0		; xmm7=data6=(60 61 62 63 64 65 66 67)
-
-	movdqa    xmm6,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm4		; xmm5=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm6,xmm4		; xmm6=(04 14 05 15 06 16 07 17)
-	movdqa    xmm1,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm2		; xmm7=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm1,xmm2		; xmm1=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm3, XMMWORD [wk(6)]	; xmm3=tmp12L
-	movdqa	xmm0, XMMWORD [wk(7)]	; xmm0=tmp12H
-	movdqa	xmm4, XMMWORD [wk(10)]	; xmm4=tmp1L
-	movdqa	xmm2, XMMWORD [wk(11)]	; xmm2=tmp1H
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 01 11 02 12 03 13)
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=(04 14 05 15 06 16 07 17)
-	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=(60 70 61 71 62 72 63 73)
-	movdqa	XMMWORD [wk(5)], xmm1	; wk(5)=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm5,xmm3
-	movdqa	xmm6,xmm0
-	paddd	xmm3,xmm4		; xmm3=data2L
-	paddd	xmm0,xmm2		; xmm0=data2H
-	psubd	xmm5,xmm4		; xmm5=data5L
-	psubd	xmm6,xmm2		; xmm6=data5H
-
-	movdqa	xmm7,[rel PD_DESCALE_P1]	; xmm7=[rel PD_DESCALE_P1]
-
-	paddd	xmm3,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm3,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-	paddd	xmm5,xmm7
-	paddd	xmm6,xmm7
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-
-	packssdw  xmm3,xmm0		; xmm3=data2=(20 21 22 23 24 25 26 27)
-	packssdw  xmm5,xmm6		; xmm5=data5=(50 51 52 53 54 55 56 57)
-
-	movdqa	xmm1, XMMWORD [wk(2)]	; xmm1=tmp13L
-	movdqa	xmm4, XMMWORD [wk(3)]	; xmm4=tmp13H
-	movdqa	xmm2, XMMWORD [wk(8)]	; xmm2=tmp0L
-	movdqa	xmm7, XMMWORD [wk(9)]	; xmm7=tmp0H
-
-	movdqa	xmm0,xmm1
-	movdqa	xmm6,xmm4
-	paddd	xmm1,xmm2		; xmm1=data3L
-	paddd	xmm4,xmm7		; xmm4=data3H
-	psubd	xmm0,xmm2		; xmm0=data4L
-	psubd	xmm6,xmm7		; xmm6=data4H
-
-	movdqa	xmm2,[rel PD_DESCALE_P1]	; xmm2=[rel PD_DESCALE_P1]
-
-	paddd	xmm1,xmm2
-	paddd	xmm4,xmm2
-	psrad	xmm1,DESCALE_P1
-	psrad	xmm4,DESCALE_P1
-	paddd	xmm0,xmm2
-	paddd	xmm6,xmm2
-	psrad	xmm0,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-
-	packssdw  xmm1,xmm4		; xmm1=data3=(30 31 32 33 34 35 36 37)
-	packssdw  xmm0,xmm6		; xmm0=data4=(40 41 42 43 44 45 46 47)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 01 11 02 12 03 13)
-	movdqa	xmm2, XMMWORD [wk(1)]	; xmm2=(04 14 05 15 06 16 07 17)
-
-	movdqa    xmm4,xmm3		; transpose coefficients(phase 1)
-	punpcklwd xmm3,xmm1		; xmm3=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm4,xmm1		; xmm4=(24 34 25 35 26 36 27 37)
-	movdqa    xmm6,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm5		; xmm0=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm6,xmm5		; xmm6=(44 54 45 55 46 56 47 57)
-
-	movdqa    xmm1,xmm7		; transpose coefficients(phase 2)
-	punpckldq xmm7,xmm3		; xmm7=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm1,xmm3		; xmm1=(02 12 22 32 03 13 23 33)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm4		; xmm2=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm5,xmm4		; xmm5=(06 16 26 36 07 17 27 37)
-
-	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=(60 70 61 71 62 72 63 73)
-	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(64 74 65 75 66 76 67 77)
-
-	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=(04 14 24 34 05 15 25 35)
-	movdqa	XMMWORD [wk(7)], xmm5	; wk(7)=(06 16 26 36 07 17 27 37)
-
-	movdqa    xmm2,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm3		; xmm0=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm2,xmm3		; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm4		; xmm6=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm5,xmm4		; xmm5=(46 56 66 76 47 57 67 77)
-
-	movdqa     xmm3,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm0		; xmm7=col0=(00 10 20 30 40 50 60 70)
-	punpckhqdq xmm3,xmm0		; xmm3=col1=(01 11 21 31 41 51 61 71)
-	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm2		; xmm1=col2=(02 12 22 32 42 52 62 72)
-	punpckhqdq xmm4,xmm2		; xmm4=col3=(03 13 23 33 43 53 63 73)
-
-	movdqa	xmm0, XMMWORD [wk(6)]	; xmm0=(04 14 24 34 05 15 25 35)
-	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa	XMMWORD [wk(8)], xmm3	; wk(8)=col1
-	movdqa	XMMWORD [wk(9)], xmm4	; wk(9)=col3
-
-	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=col4=(04 14 24 34 44 54 64 74)
-	punpckhqdq xmm3,xmm6		; xmm3=col5=(05 15 25 35 45 55 65 75)
-	movdqa     xmm4,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm5		; xmm2=col6=(06 16 26 36 46 56 66 76)
-	punpckhqdq xmm4,xmm5		; xmm4=col7=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
-	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	rax, [original_rbp]
-	mov	rdi, r12	; (JSAMPROW *)
-	mov	rax, r13
-
-	; -- Even part
-
-	; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movdqa    xmm6,xmm1		; xmm1=in2=z2
-	movdqa    xmm5,xmm1
-	punpcklwd xmm6,xmm2		; xmm2=in6=z3
-	punpckhwd xmm5,xmm2
-	movdqa    xmm1,xmm6
-	movdqa    xmm2,xmm5
-	pmaddwd   xmm6,[rel PW_F130_F054]	; xmm6=tmp3L
-	pmaddwd   xmm5,[rel PW_F130_F054]	; xmm5=tmp3H
-	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=tmp2L
-	pmaddwd   xmm2,[rel PW_F054_MF130]	; xmm2=tmp2H
-
-	movdqa    xmm3,xmm7
-	paddw     xmm7,xmm0		; xmm7=in0+in4
-	psubw     xmm3,xmm0		; xmm3=in0-in4
-
-	pxor      xmm4,xmm4
-	pxor      xmm0,xmm0
-	punpcklwd xmm4,xmm7		; xmm4=tmp0L
-	punpckhwd xmm0,xmm7		; xmm0=tmp0H
-	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
-	psrad     xmm0,(16-CONST_BITS)	; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
-	movdqa	xmm7,xmm4
-	paddd	xmm4,xmm6		; xmm4=tmp10L
-	psubd	xmm7,xmm6		; xmm7=tmp13L
-	movdqa	xmm6,xmm0
-	paddd	xmm0,xmm5		; xmm0=tmp10H
-	psubd	xmm6,xmm5		; xmm6=tmp13H
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=tmp10L
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp10H
-	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=tmp13L
-	movdqa	XMMWORD [wk(3)], xmm6	; wk(3)=tmp13H
-
-	pxor      xmm5,xmm5
-	pxor      xmm4,xmm4
-	punpcklwd xmm5,xmm3		; xmm5=tmp1L
-	punpckhwd xmm4,xmm3		; xmm4=tmp1H
-	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
-	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
-	movdqa	xmm0,xmm5
-	paddd	xmm5,xmm1		; xmm5=tmp11L
-	psubd	xmm0,xmm1		; xmm0=tmp12L
-	movdqa	xmm7,xmm4
-	paddd	xmm4,xmm2		; xmm4=tmp11H
-	psubd	xmm7,xmm2		; xmm7=tmp12H
-
-	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
-	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=tmp11H
-	movdqa	XMMWORD [wk(6)], xmm0	; wk(6)=tmp12L
-	movdqa	XMMWORD [wk(7)], xmm7	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movdqa	xmm6, XMMWORD [wk(9)]	; xmm6=col3
-	movdqa	xmm3, XMMWORD [wk(8)]	; xmm3=col1
-	movdqa	xmm1, XMMWORD [wk(11)]	; xmm1=col7
-	movdqa	xmm2, XMMWORD [wk(10)]	; xmm2=col5
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm4,xmm3
-	paddw	xmm5,xmm1		; xmm5=z3
-	paddw	xmm4,xmm2		; xmm4=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm0,xmm5
-	movdqa    xmm7,xmm5
-	punpcklwd xmm0,xmm4
-	punpckhwd xmm7,xmm4
-	movdqa    xmm5,xmm0
-	movdqa    xmm4,xmm7
-	pmaddwd   xmm0,[rel PW_MF078_F117]	; xmm0=z3L
-	pmaddwd   xmm7,[rel PW_MF078_F117]	; xmm7=z3H
-	pmaddwd   xmm5,[rel PW_F117_F078]	; xmm5=z4L
-	pmaddwd   xmm4,[rel PW_F117_F078]	; xmm4=z4H
-
-	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=z3L
-	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movdqa    xmm0,xmm1
-	movdqa    xmm7,xmm1
-	punpcklwd xmm0,xmm3
-	punpckhwd xmm7,xmm3
-	movdqa    xmm1,xmm0
-	movdqa    xmm3,xmm7
-	pmaddwd   xmm0,[rel PW_MF060_MF089]	; xmm0=tmp0L
-	pmaddwd   xmm7,[rel PW_MF060_MF089]	; xmm7=tmp0H
-	pmaddwd   xmm1,[rel PW_MF089_F060]	; xmm1=tmp3L
-	pmaddwd   xmm3,[rel PW_MF089_F060]	; xmm3=tmp3H
-
-	paddd	xmm0, XMMWORD [wk(10)]	; xmm0=tmp0L
-	paddd	xmm7, XMMWORD [wk(11)]	; xmm7=tmp0H
-	paddd	xmm1,xmm5		; xmm1=tmp3L
-	paddd	xmm3,xmm4		; xmm3=tmp3H
-
-	movdqa	XMMWORD [wk(8)], xmm0	; wk(8)=tmp0L
-	movdqa	XMMWORD [wk(9)], xmm7	; wk(9)=tmp0H
-
-	movdqa    xmm0,xmm2
-	movdqa    xmm7,xmm2
-	punpcklwd xmm0,xmm6
-	punpckhwd xmm7,xmm6
-	movdqa    xmm2,xmm0
-	movdqa    xmm6,xmm7
-	pmaddwd   xmm0,[rel PW_MF050_MF256]	; xmm0=tmp1L
-	pmaddwd   xmm7,[rel PW_MF050_MF256]	; xmm7=tmp1H
-	pmaddwd   xmm2,[rel PW_MF256_F050]	; xmm2=tmp2L
-	pmaddwd   xmm6,[rel PW_MF256_F050]	; xmm6=tmp2H
-
-	paddd	xmm0,xmm5		; xmm0=tmp1L
-	paddd	xmm7,xmm4		; xmm7=tmp1H
-	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp2L
-	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
-
-	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=tmp1L
-	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
-	movdqa	xmm4, XMMWORD [wk(1)]	; xmm4=tmp10H
-
-	movdqa	xmm0,xmm5
-	movdqa	xmm7,xmm4
-	paddd	xmm5,xmm1		; xmm5=data0L
-	paddd	xmm4,xmm3		; xmm4=data0H
-	psubd	xmm0,xmm1		; xmm0=data7L
-	psubd	xmm7,xmm3		; xmm7=data7H
-
-	movdqa	xmm1,[rel PD_DESCALE_P2]	; xmm1=[rel PD_DESCALE_P2]
-
-	paddd	xmm5,xmm1
-	paddd	xmm4,xmm1
-	psrad	xmm5,DESCALE_P2
-	psrad	xmm4,DESCALE_P2
-	paddd	xmm0,xmm1
-	paddd	xmm7,xmm1
-	psrad	xmm0,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm5,xmm4		; xmm5=data0=(00 10 20 30 40 50 60 70)
-	packssdw  xmm0,xmm7		; xmm0=data7=(07 17 27 37 47 57 67 77)
-
-	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=tmp11L
-	movdqa	xmm1, XMMWORD [wk(5)]	; xmm1=tmp11H
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm7,xmm1
-	paddd	xmm3,xmm2		; xmm3=data1L
-	paddd	xmm1,xmm6		; xmm1=data1H
-	psubd	xmm4,xmm2		; xmm4=data6L
-	psubd	xmm7,xmm6		; xmm7=data6H
-
-	movdqa	xmm2,[rel PD_DESCALE_P2]	; xmm2=[rel PD_DESCALE_P2]
-
-	paddd	xmm3,xmm2
-	paddd	xmm1,xmm2
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm4,xmm2
-	paddd	xmm7,xmm2
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm3,xmm1		; xmm3=data1=(01 11 21 31 41 51 61 71)
-	packssdw  xmm4,xmm7		; xmm4=data6=(06 16 26 36 46 56 66 76)
-
-	packsswb  xmm5,xmm4		; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	packsswb  xmm3,xmm0		; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm6, XMMWORD [wk(6)]	; xmm6=tmp12L
-	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=tmp12H
-	movdqa	xmm1, XMMWORD [wk(10)]	; xmm1=tmp1L
-	movdqa	xmm7, XMMWORD [wk(11)]	; xmm7=tmp1H
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm4,xmm6
-	movdqa	xmm0,xmm2
-	paddd	xmm6,xmm1		; xmm6=data2L
-	paddd	xmm2,xmm7		; xmm2=data2H
-	psubd	xmm4,xmm1		; xmm4=data5L
-	psubd	xmm0,xmm7		; xmm0=data5H
-
-	movdqa	xmm5,[rel PD_DESCALE_P2]	; xmm5=[rel PD_DESCALE_P2]
-
-	paddd	xmm6,xmm5
-	paddd	xmm2,xmm5
-	psrad	xmm6,DESCALE_P2
-	psrad	xmm2,DESCALE_P2
-	paddd	xmm4,xmm5
-	paddd	xmm0,xmm5
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm0,DESCALE_P2
-
-	packssdw  xmm6,xmm2		; xmm6=data2=(02 12 22 32 42 52 62 72)
-	packssdw  xmm4,xmm0		; xmm4=data5=(05 15 25 35 45 55 65 75)
-
-	movdqa	xmm3, XMMWORD [wk(2)]	; xmm3=tmp13L
-	movdqa	xmm1, XMMWORD [wk(3)]	; xmm1=tmp13H
-	movdqa	xmm7, XMMWORD [wk(8)]	; xmm7=tmp0L
-	movdqa	xmm5, XMMWORD [wk(9)]	; xmm5=tmp0H
-
-	movdqa	xmm2,xmm3
-	movdqa	xmm0,xmm1
-	paddd	xmm3,xmm7		; xmm3=data3L
-	paddd	xmm1,xmm5		; xmm1=data3H
-	psubd	xmm2,xmm7		; xmm2=data4L
-	psubd	xmm0,xmm5		; xmm0=data4H
-
-	movdqa	xmm7,[rel PD_DESCALE_P2]	; xmm7=[rel PD_DESCALE_P2]
-
-	paddd	xmm3,xmm7
-	paddd	xmm1,xmm7
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm2,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm2,DESCALE_P2
-	psrad	xmm0,DESCALE_P2
-
-	movdqa    xmm5,[rel PB_CENTERJSAMP]	; xmm5=[rel PB_CENTERJSAMP]
-
-	packssdw  xmm3,xmm1		; xmm3=data3=(03 13 23 33 43 53 63 73)
-	packssdw  xmm2,xmm0		; xmm2=data4=(04 14 24 34 44 54 64 74)
-
-	movdqa    xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	packsswb  xmm6,xmm2		; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-	packsswb  xmm3,xmm4		; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-	paddb     xmm7,xmm5
-	paddb     xmm1,xmm5
-	paddb     xmm6,xmm5
-	paddb     xmm3,xmm5
-
-	movdqa    xmm0,xmm7	; transpose coefficients(phase 1)
-	punpcklbw xmm7,xmm1	; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-	punpckhbw xmm0,xmm1	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-	movdqa    xmm2,xmm6	; transpose coefficients(phase 1)
-	punpcklbw xmm6,xmm3	; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-	punpckhbw xmm2,xmm3	; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-	movdqa    xmm4,xmm7	; transpose coefficients(phase 2)
-	punpcklwd xmm7,xmm6	; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm6	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-	movdqa    xmm5,xmm2	; transpose coefficients(phase 2)
-	punpcklwd xmm2,xmm0	; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-	punpckhwd xmm5,xmm0	; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-	movdqa    xmm1,xmm7	; transpose coefficients(phase 3)
-	punpckldq xmm7,xmm2	; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm1,xmm2	; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-	movdqa    xmm3,xmm4	; transpose coefficients(phase 3)
-	punpckldq xmm4,xmm5	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-	punpckhdq xmm3,xmm5	; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-	pshufd	xmm6,xmm7,0x4E	; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm0,xmm1,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-	pshufd	xmm2,xmm4,0x4E	; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-	pshufd	xmm5,xmm3,0x4E	; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
-	mov	rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
-	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-	mov	rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2int.asm b/simd/jiss2int.asm
deleted file mode 100644
index 17a23f3..0000000
--- a/simd/jiss2int.asm
+++ /dev/null
@@ -1,859 +0,0 @@
-;
-; jiss2int.asm - accurate integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_islow_sse2) PRIVATE
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		12
-
-	align	16
-	global	EXTN(jsimd_idct_islow_sse2) PRIVATE
-
-EXTN(jsimd_idct_islow_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	xmm1,xmm0
-	packsswb xmm1,xmm1
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	xmm5,PASS1_BITS
-
-	movdqa    xmm4,xmm5		; xmm5=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm5,xmm5		; xmm5=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm4,xmm4		; xmm4=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm7,xmm5,0x00		; xmm7=col0=(00 00 00 00 00 00 00 00)
-	pshufd	xmm6,xmm5,0x55		; xmm6=col1=(01 01 01 01 01 01 01 01)
-	pshufd	xmm1,xmm5,0xAA		; xmm1=col2=(02 02 02 02 02 02 02 02)
-	pshufd	xmm5,xmm5,0xFF		; xmm5=col3=(03 03 03 03 03 03 03 03)
-	pshufd	xmm0,xmm4,0x00		; xmm0=col4=(04 04 04 04 04 04 04 04)
-	pshufd	xmm3,xmm4,0x55		; xmm3=col5=(05 05 05 05 05 05 05 05)
-	pshufd	xmm2,xmm4,0xAA		; xmm2=col6=(06 06 06 06 06 06 06 06)
-	pshufd	xmm4,xmm4,0xFF		; xmm4=col7=(07 07 07 07 07 07 07 07)
-
-	movdqa	XMMWORD [wk(8)], xmm6	; wk(8)=col1
-	movdqa	XMMWORD [wk(9)], xmm5	; wk(9)=col3
-	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
-	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
-	jmp	near .column_end
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movdqa    xmm4,xmm1		; xmm1=in2=z2
-	movdqa    xmm5,xmm1
-	punpcklwd xmm4,xmm3		; xmm3=in6=z3
-	punpckhwd xmm5,xmm3
-	movdqa    xmm1,xmm4
-	movdqa    xmm3,xmm5
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=tmp3L
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_F054_MF130)]	; xmm3=tmp2H
-
-	movdqa    xmm6,xmm0
-	paddw     xmm0,xmm2		; xmm0=in0+in4
-	psubw     xmm6,xmm2		; xmm6=in0-in4
-
-	pxor      xmm7,xmm7
-	pxor      xmm2,xmm2
-	punpcklwd xmm7,xmm0		; xmm7=tmp0L
-	punpckhwd xmm2,xmm0		; xmm2=tmp0H
-	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
-	psrad     xmm2,(16-CONST_BITS)	; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
-	movdqa	xmm0,xmm7
-	paddd	xmm7,xmm4		; xmm7=tmp10L
-	psubd	xmm0,xmm4		; xmm0=tmp13L
-	movdqa	xmm4,xmm2
-	paddd	xmm2,xmm5		; xmm2=tmp10H
-	psubd	xmm4,xmm5		; xmm4=tmp13H
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=tmp10L
-	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=tmp10H
-	movdqa	XMMWORD [wk(2)], xmm0	; wk(2)=tmp13L
-	movdqa	XMMWORD [wk(3)], xmm4	; wk(3)=tmp13H
-
-	pxor      xmm5,xmm5
-	pxor      xmm7,xmm7
-	punpcklwd xmm5,xmm6		; xmm5=tmp1L
-	punpckhwd xmm7,xmm6		; xmm7=tmp1H
-	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
-	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
-	movdqa	xmm2,xmm5
-	paddd	xmm5,xmm1		; xmm5=tmp11L
-	psubd	xmm2,xmm1		; xmm2=tmp12L
-	movdqa	xmm0,xmm7
-	paddd	xmm7,xmm3		; xmm7=tmp11H
-	psubd	xmm0,xmm3		; xmm0=tmp12H
-
-	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
-	movdqa	XMMWORD [wk(5)], xmm7	; wk(5)=tmp11H
-	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=tmp12L
-	movdqa	XMMWORD [wk(7)], xmm0	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movdqa	xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm7,xmm4
-	paddw	xmm5,xmm3		; xmm5=z3
-	paddw	xmm7,xmm1		; xmm7=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm2,xmm5
-	movdqa    xmm0,xmm5
-	punpcklwd xmm2,xmm7
-	punpckhwd xmm0,xmm7
-	movdqa    xmm5,xmm2
-	movdqa    xmm7,xmm0
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF078_F117)]	; xmm2=z3L
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3H
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_F117_F078)]	; xmm7=z4H
-
-	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=z3L
-	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movdqa    xmm2,xmm3
-	movdqa    xmm0,xmm3
-	punpcklwd xmm2,xmm4
-	punpckhwd xmm0,xmm4
-	movdqa    xmm3,xmm2
-	movdqa    xmm4,xmm0
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm2=tmp0L
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0H
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3L
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF089_F060)]	; xmm4=tmp3H
-
-	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp0L
-	paddd	xmm0, XMMWORD [wk(11)]	; xmm0=tmp0H
-	paddd	xmm3,xmm5		; xmm3=tmp3L
-	paddd	xmm4,xmm7		; xmm4=tmp3H
-
-	movdqa	XMMWORD [wk(8)], xmm2	; wk(8)=tmp0L
-	movdqa	XMMWORD [wk(9)], xmm0	; wk(9)=tmp0H
-
-	movdqa    xmm2,xmm1
-	movdqa    xmm0,xmm1
-	punpcklwd xmm2,xmm6
-	punpckhwd xmm0,xmm6
-	movdqa    xmm1,xmm2
-	movdqa    xmm6,xmm0
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm2=tmp1L
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1H
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF256_F050)]	; xmm1=tmp2L
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
-
-	paddd	xmm2,xmm5		; xmm2=tmp1L
-	paddd	xmm0,xmm7		; xmm0=tmp1H
-	paddd	xmm1, XMMWORD [wk(10)]	; xmm1=tmp2L
-	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
-
-	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=tmp1L
-	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
-	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=tmp10H
-
-	movdqa	xmm2,xmm5
-	movdqa	xmm0,xmm7
-	paddd	xmm5,xmm3		; xmm5=data0L
-	paddd	xmm7,xmm4		; xmm7=data0H
-	psubd	xmm2,xmm3		; xmm2=data7L
-	psubd	xmm0,xmm4		; xmm0=data7H
-
-	movdqa	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm3=[PD_DESCALE_P1]
-
-	paddd	xmm5,xmm3
-	paddd	xmm7,xmm3
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm7,DESCALE_P1
-	paddd	xmm2,xmm3
-	paddd	xmm0,xmm3
-	psrad	xmm2,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm5,xmm7		; xmm5=data0=(00 01 02 03 04 05 06 07)
-	packssdw  xmm2,xmm0		; xmm2=data7=(70 71 72 73 74 75 76 77)
-
-	movdqa	xmm4, XMMWORD [wk(4)]	; xmm4=tmp11L
-	movdqa	xmm3, XMMWORD [wk(5)]	; xmm3=tmp11H
-
-	movdqa	xmm7,xmm4
-	movdqa	xmm0,xmm3
-	paddd	xmm4,xmm1		; xmm4=data1L
-	paddd	xmm3,xmm6		; xmm3=data1H
-	psubd	xmm7,xmm1		; xmm7=data6L
-	psubd	xmm0,xmm6		; xmm0=data6H
-
-	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm1=[PD_DESCALE_P1]
-
-	paddd	xmm4,xmm1
-	paddd	xmm3,xmm1
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm3,DESCALE_P1
-	paddd	xmm7,xmm1
-	paddd	xmm0,xmm1
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm4,xmm3		; xmm4=data1=(10 11 12 13 14 15 16 17)
-	packssdw  xmm7,xmm0		; xmm7=data6=(60 61 62 63 64 65 66 67)
-
-	movdqa    xmm6,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm4		; xmm5=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm6,xmm4		; xmm6=(04 14 05 15 06 16 07 17)
-	movdqa    xmm1,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm2		; xmm7=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm1,xmm2		; xmm1=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm3, XMMWORD [wk(6)]	; xmm3=tmp12L
-	movdqa	xmm0, XMMWORD [wk(7)]	; xmm0=tmp12H
-	movdqa	xmm4, XMMWORD [wk(10)]	; xmm4=tmp1L
-	movdqa	xmm2, XMMWORD [wk(11)]	; xmm2=tmp1H
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 01 11 02 12 03 13)
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=(04 14 05 15 06 16 07 17)
-	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=(60 70 61 71 62 72 63 73)
-	movdqa	XMMWORD [wk(5)], xmm1	; wk(5)=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm5,xmm3
-	movdqa	xmm6,xmm0
-	paddd	xmm3,xmm4		; xmm3=data2L
-	paddd	xmm0,xmm2		; xmm0=data2H
-	psubd	xmm5,xmm4		; xmm5=data5L
-	psubd	xmm6,xmm2		; xmm6=data5H
-
-	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm7=[PD_DESCALE_P1]
-
-	paddd	xmm3,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm3,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-	paddd	xmm5,xmm7
-	paddd	xmm6,xmm7
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-
-	packssdw  xmm3,xmm0		; xmm3=data2=(20 21 22 23 24 25 26 27)
-	packssdw  xmm5,xmm6		; xmm5=data5=(50 51 52 53 54 55 56 57)
-
-	movdqa	xmm1, XMMWORD [wk(2)]	; xmm1=tmp13L
-	movdqa	xmm4, XMMWORD [wk(3)]	; xmm4=tmp13H
-	movdqa	xmm2, XMMWORD [wk(8)]	; xmm2=tmp0L
-	movdqa	xmm7, XMMWORD [wk(9)]	; xmm7=tmp0H
-
-	movdqa	xmm0,xmm1
-	movdqa	xmm6,xmm4
-	paddd	xmm1,xmm2		; xmm1=data3L
-	paddd	xmm4,xmm7		; xmm4=data3H
-	psubd	xmm0,xmm2		; xmm0=data4L
-	psubd	xmm6,xmm7		; xmm6=data4H
-
-	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm2=[PD_DESCALE_P1]
-
-	paddd	xmm1,xmm2
-	paddd	xmm4,xmm2
-	psrad	xmm1,DESCALE_P1
-	psrad	xmm4,DESCALE_P1
-	paddd	xmm0,xmm2
-	paddd	xmm6,xmm2
-	psrad	xmm0,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-
-	packssdw  xmm1,xmm4		; xmm1=data3=(30 31 32 33 34 35 36 37)
-	packssdw  xmm0,xmm6		; xmm0=data4=(40 41 42 43 44 45 46 47)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 01 11 02 12 03 13)
-	movdqa	xmm2, XMMWORD [wk(1)]	; xmm2=(04 14 05 15 06 16 07 17)
-
-	movdqa    xmm4,xmm3		; transpose coefficients(phase 1)
-	punpcklwd xmm3,xmm1		; xmm3=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm4,xmm1		; xmm4=(24 34 25 35 26 36 27 37)
-	movdqa    xmm6,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm5		; xmm0=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm6,xmm5		; xmm6=(44 54 45 55 46 56 47 57)
-
-	movdqa    xmm1,xmm7		; transpose coefficients(phase 2)
-	punpckldq xmm7,xmm3		; xmm7=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm1,xmm3		; xmm1=(02 12 22 32 03 13 23 33)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm4		; xmm2=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm5,xmm4		; xmm5=(06 16 26 36 07 17 27 37)
-
-	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=(60 70 61 71 62 72 63 73)
-	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(64 74 65 75 66 76 67 77)
-
-	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=(04 14 24 34 05 15 25 35)
-	movdqa	XMMWORD [wk(7)], xmm5	; wk(7)=(06 16 26 36 07 17 27 37)
-
-	movdqa    xmm2,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm3		; xmm0=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm2,xmm3		; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm4		; xmm6=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm5,xmm4		; xmm5=(46 56 66 76 47 57 67 77)
-
-	movdqa     xmm3,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm0		; xmm7=col0=(00 10 20 30 40 50 60 70)
-	punpckhqdq xmm3,xmm0		; xmm3=col1=(01 11 21 31 41 51 61 71)
-	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm2		; xmm1=col2=(02 12 22 32 42 52 62 72)
-	punpckhqdq xmm4,xmm2		; xmm4=col3=(03 13 23 33 43 53 63 73)
-
-	movdqa	xmm0, XMMWORD [wk(6)]	; xmm0=(04 14 24 34 05 15 25 35)
-	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa	XMMWORD [wk(8)], xmm3	; wk(8)=col1
-	movdqa	XMMWORD [wk(9)], xmm4	; wk(9)=col3
-
-	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=col4=(04 14 24 34 44 54 64 74)
-	punpckhqdq xmm3,xmm6		; xmm3=col5=(05 15 25 35 45 55 65 75)
-	movdqa     xmm4,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm5		; xmm2=col6=(06 16 26 36 46 56 66 76)
-	punpckhqdq xmm4,xmm5		; xmm4=col7=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
-	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-
-	; -- Even part
-
-	; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movdqa    xmm6,xmm1		; xmm1=in2=z2
-	movdqa    xmm5,xmm1
-	punpcklwd xmm6,xmm2		; xmm2=in6=z3
-	punpckhwd xmm5,xmm2
-	movdqa    xmm1,xmm6
-	movdqa    xmm2,xmm5
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=tmp3L
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_F054_MF130)]	; xmm2=tmp2H
-
-	movdqa    xmm3,xmm7
-	paddw     xmm7,xmm0		; xmm7=in0+in4
-	psubw     xmm3,xmm0		; xmm3=in0-in4
-
-	pxor      xmm4,xmm4
-	pxor      xmm0,xmm0
-	punpcklwd xmm4,xmm7		; xmm4=tmp0L
-	punpckhwd xmm0,xmm7		; xmm0=tmp0H
-	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
-	psrad     xmm0,(16-CONST_BITS)	; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
-	movdqa	xmm7,xmm4
-	paddd	xmm4,xmm6		; xmm4=tmp10L
-	psubd	xmm7,xmm6		; xmm7=tmp13L
-	movdqa	xmm6,xmm0
-	paddd	xmm0,xmm5		; xmm0=tmp10H
-	psubd	xmm6,xmm5		; xmm6=tmp13H
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=tmp10L
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp10H
-	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=tmp13L
-	movdqa	XMMWORD [wk(3)], xmm6	; wk(3)=tmp13H
-
-	pxor      xmm5,xmm5
-	pxor      xmm4,xmm4
-	punpcklwd xmm5,xmm3		; xmm5=tmp1L
-	punpckhwd xmm4,xmm3		; xmm4=tmp1H
-	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
-	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
-	movdqa	xmm0,xmm5
-	paddd	xmm5,xmm1		; xmm5=tmp11L
-	psubd	xmm0,xmm1		; xmm0=tmp12L
-	movdqa	xmm7,xmm4
-	paddd	xmm4,xmm2		; xmm4=tmp11H
-	psubd	xmm7,xmm2		; xmm7=tmp12H
-
-	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
-	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=tmp11H
-	movdqa	XMMWORD [wk(6)], xmm0	; wk(6)=tmp12L
-	movdqa	XMMWORD [wk(7)], xmm7	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movdqa	xmm6, XMMWORD [wk(9)]	; xmm6=col3
-	movdqa	xmm3, XMMWORD [wk(8)]	; xmm3=col1
-	movdqa	xmm1, XMMWORD [wk(11)]	; xmm1=col7
-	movdqa	xmm2, XMMWORD [wk(10)]	; xmm2=col5
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm4,xmm3
-	paddw	xmm5,xmm1		; xmm5=z3
-	paddw	xmm4,xmm2		; xmm4=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm0,xmm5
-	movdqa    xmm7,xmm5
-	punpcklwd xmm0,xmm4
-	punpckhwd xmm7,xmm4
-	movdqa    xmm5,xmm0
-	movdqa    xmm4,xmm7
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3L
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3H
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F117_F078)]	; xmm4=z4H
-
-	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=z3L
-	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movdqa    xmm0,xmm1
-	movdqa    xmm7,xmm1
-	punpcklwd xmm0,xmm3
-	punpckhwd xmm7,xmm3
-	movdqa    xmm1,xmm0
-	movdqa    xmm3,xmm7
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0L
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp0H
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp3L
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3H
-
-	paddd	xmm0, XMMWORD [wk(10)]	; xmm0=tmp0L
-	paddd	xmm7, XMMWORD [wk(11)]	; xmm7=tmp0H
-	paddd	xmm1,xmm5		; xmm1=tmp3L
-	paddd	xmm3,xmm4		; xmm3=tmp3H
-
-	movdqa	XMMWORD [wk(8)], xmm0	; wk(8)=tmp0L
-	movdqa	XMMWORD [wk(9)], xmm7	; wk(9)=tmp0H
-
-	movdqa    xmm0,xmm2
-	movdqa    xmm7,xmm2
-	punpcklwd xmm0,xmm6
-	punpckhwd xmm7,xmm6
-	movdqa    xmm2,xmm0
-	movdqa    xmm6,xmm7
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1L
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm7=tmp1H
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF256_F050)]	; xmm2=tmp2L
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
-
-	paddd	xmm0,xmm5		; xmm0=tmp1L
-	paddd	xmm7,xmm4		; xmm7=tmp1H
-	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp2L
-	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
-
-	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=tmp1L
-	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
-	movdqa	xmm4, XMMWORD [wk(1)]	; xmm4=tmp10H
-
-	movdqa	xmm0,xmm5
-	movdqa	xmm7,xmm4
-	paddd	xmm5,xmm1		; xmm5=data0L
-	paddd	xmm4,xmm3		; xmm4=data0H
-	psubd	xmm0,xmm1		; xmm0=data7L
-	psubd	xmm7,xmm3		; xmm7=data7H
-
-	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm1=[PD_DESCALE_P2]
-
-	paddd	xmm5,xmm1
-	paddd	xmm4,xmm1
-	psrad	xmm5,DESCALE_P2
-	psrad	xmm4,DESCALE_P2
-	paddd	xmm0,xmm1
-	paddd	xmm7,xmm1
-	psrad	xmm0,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm5,xmm4		; xmm5=data0=(00 10 20 30 40 50 60 70)
-	packssdw  xmm0,xmm7		; xmm0=data7=(07 17 27 37 47 57 67 77)
-
-	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=tmp11L
-	movdqa	xmm1, XMMWORD [wk(5)]	; xmm1=tmp11H
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm7,xmm1
-	paddd	xmm3,xmm2		; xmm3=data1L
-	paddd	xmm1,xmm6		; xmm1=data1H
-	psubd	xmm4,xmm2		; xmm4=data6L
-	psubd	xmm7,xmm6		; xmm7=data6H
-
-	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm2=[PD_DESCALE_P2]
-
-	paddd	xmm3,xmm2
-	paddd	xmm1,xmm2
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm4,xmm2
-	paddd	xmm7,xmm2
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm3,xmm1		; xmm3=data1=(01 11 21 31 41 51 61 71)
-	packssdw  xmm4,xmm7		; xmm4=data6=(06 16 26 36 46 56 66 76)
-
-	packsswb  xmm5,xmm4		; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	packsswb  xmm3,xmm0		; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm6, XMMWORD [wk(6)]	; xmm6=tmp12L
-	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=tmp12H
-	movdqa	xmm1, XMMWORD [wk(10)]	; xmm1=tmp1L
-	movdqa	xmm7, XMMWORD [wk(11)]	; xmm7=tmp1H
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm4,xmm6
-	movdqa	xmm0,xmm2
-	paddd	xmm6,xmm1		; xmm6=data2L
-	paddd	xmm2,xmm7		; xmm2=data2H
-	psubd	xmm4,xmm1		; xmm4=data5L
-	psubd	xmm0,xmm7		; xmm0=data5H
-
-	movdqa	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm5=[PD_DESCALE_P2]
-
-	paddd	xmm6,xmm5
-	paddd	xmm2,xmm5
-	psrad	xmm6,DESCALE_P2
-	psrad	xmm2,DESCALE_P2
-	paddd	xmm4,xmm5
-	paddd	xmm0,xmm5
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm0,DESCALE_P2
-
-	packssdw  xmm6,xmm2		; xmm6=data2=(02 12 22 32 42 52 62 72)
-	packssdw  xmm4,xmm0		; xmm4=data5=(05 15 25 35 45 55 65 75)
-
-	movdqa	xmm3, XMMWORD [wk(2)]	; xmm3=tmp13L
-	movdqa	xmm1, XMMWORD [wk(3)]	; xmm1=tmp13H
-	movdqa	xmm7, XMMWORD [wk(8)]	; xmm7=tmp0L
-	movdqa	xmm5, XMMWORD [wk(9)]	; xmm5=tmp0H
-
-	movdqa	xmm2,xmm3
-	movdqa	xmm0,xmm1
-	paddd	xmm3,xmm7		; xmm3=data3L
-	paddd	xmm1,xmm5		; xmm1=data3H
-	psubd	xmm2,xmm7		; xmm2=data4L
-	psubd	xmm0,xmm5		; xmm0=data4H
-
-	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm7=[PD_DESCALE_P2]
-
-	paddd	xmm3,xmm7
-	paddd	xmm1,xmm7
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm2,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm2,DESCALE_P2
-	psrad	xmm0,DESCALE_P2
-
-	movdqa    xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm5=[PB_CENTERJSAMP]
-
-	packssdw  xmm3,xmm1		; xmm3=data3=(03 13 23 33 43 53 63 73)
-	packssdw  xmm2,xmm0		; xmm2=data4=(04 14 24 34 44 54 64 74)
-
-	movdqa    xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	packsswb  xmm6,xmm2		; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-	packsswb  xmm3,xmm4		; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-	paddb     xmm7,xmm5
-	paddb     xmm1,xmm5
-	paddb     xmm6,xmm5
-	paddb     xmm3,xmm5
-
-	movdqa    xmm0,xmm7	; transpose coefficients(phase 1)
-	punpcklbw xmm7,xmm1	; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-	punpckhbw xmm0,xmm1	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-	movdqa    xmm2,xmm6	; transpose coefficients(phase 1)
-	punpcklbw xmm6,xmm3	; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-	punpckhbw xmm2,xmm3	; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-	movdqa    xmm4,xmm7	; transpose coefficients(phase 2)
-	punpcklwd xmm7,xmm6	; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm6	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-	movdqa    xmm5,xmm2	; transpose coefficients(phase 2)
-	punpcklwd xmm2,xmm0	; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-	punpckhwd xmm5,xmm0	; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-	movdqa    xmm1,xmm7	; transpose coefficients(phase 3)
-	punpckldq xmm7,xmm2	; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm1,xmm2	; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-	movdqa    xmm3,xmm4	; transpose coefficients(phase 3)
-	punpckldq xmm4,xmm5	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-	punpckhdq xmm3,xmm5	; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-	pshufd	xmm6,xmm7,0x4E	; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm0,xmm1,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-	pshufd	xmm2,xmm4,0x4E	; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-	pshufd	xmm5,xmm3,0x4E	; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
-	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
-	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
-	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2red-64.asm b/simd/jiss2red-64.asm
deleted file mode 100644
index 637339e..0000000
--- a/simd/jiss2red-64.asm
+++ /dev/null
@@ -1,576 +0,0 @@
-;
-; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211	equ	 1730		; FIX(0.211164243)
-F_0_509	equ	 4176		; FIX(0.509795579)
-F_0_601	equ	 4926		; FIX(0.601344887)
-F_0_720	equ	 5906		; FIX(0.720959822)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_850	equ	 6967		; FIX(0.850430095)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_061	equ	 8697		; FIX(1.061594337)
-F_1_272	equ	10426		; FIX(1.272758580)
-F_1_451	equ	11893		; FIX(1.451774981)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_2_172	equ	17799		; FIX(2.172734803)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_624	equ	29692		; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
-F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
-F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
-F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
-F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
-F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_red_sse2) PRIVATE
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076	times 4 dw  F_1_847,-F_0_765
-PW_F256_F089	times 4 dw  F_2_562, F_0_899
-PW_F106_MF217	times 4 dw  F_1_061,-F_2_172
-PW_MF060_MF050	times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021	times 4 dw  F_1_451,-F_0_211
-PW_F362_MF127	times 4 dw  F_3_624,-F_1_272
-PW_F085_MF072	times 4 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4	times 4 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4	times 4 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2	times 4 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2	times 4 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void * dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp	rbp+0
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_idct_4x4_sse2) PRIVATE
-
-EXTN(jsimd_idct_4x4_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process columns from input.
-
-	mov	rdx, r10	; quantptr
-	mov	rsi, r11		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0,xmm1
-	packsswb xmm0,xmm0
-	packsswb xmm0,xmm0
-	movd	eax,xmm0
-	test	rax,rax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	xmm0,PASS1_BITS
-
-	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
-	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
-	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
-	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
-	jmp	near .column_end
-%endif
-.columnDCT:
-
-	; -- Odd part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa    xmm4,xmm0
-	movdqa    xmm5,xmm0
-	punpcklwd xmm4,xmm1
-	punpckhwd xmm5,xmm1
-	movdqa    xmm0,xmm4
-	movdqa    xmm1,xmm5
-	pmaddwd   xmm4,[rel PW_F256_F089]	; xmm4=(tmp2L)
-	pmaddwd   xmm5,[rel PW_F256_F089]	; xmm5=(tmp2H)
-	pmaddwd   xmm0,[rel PW_F106_MF217]	; xmm0=(tmp0L)
-	pmaddwd   xmm1,[rel PW_F106_MF217]	; xmm1=(tmp0H)
-
-	movdqa    xmm6,xmm2
-	movdqa    xmm7,xmm2
-	punpcklwd xmm6,xmm3
-	punpckhwd xmm7,xmm3
-	movdqa    xmm2,xmm6
-	movdqa    xmm3,xmm7
-	pmaddwd   xmm6,[rel PW_MF060_MF050]	; xmm6=(tmp2L)
-	pmaddwd   xmm7,[rel PW_MF060_MF050]	; xmm7=(tmp2H)
-	pmaddwd   xmm2,[rel PW_F145_MF021]	; xmm2=(tmp0L)
-	pmaddwd   xmm3,[rel PW_F145_MF021]	; xmm3=(tmp0H)
-
-	paddd	xmm6,xmm4		; xmm6=tmp2L
-	paddd	xmm7,xmm5		; xmm7=tmp2H
-	paddd	xmm2,xmm0		; xmm2=tmp0L
-	paddd	xmm3,xmm1		; xmm3=tmp0H
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H
-
-	; -- Even part
-
-	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	pxor      xmm1,xmm1
-	pxor      xmm2,xmm2
-	punpcklwd xmm1,xmm4		; xmm1=tmp0L
-	punpckhwd xmm2,xmm4		; xmm2=tmp0H
-	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
-	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
-	movdqa    xmm3,xmm5		; xmm5=in2=z2
-	punpcklwd xmm5,xmm0		; xmm0=in6=z3
-	punpckhwd xmm3,xmm0
-	pmaddwd   xmm5,[rel PW_F184_MF076]	; xmm5=tmp2L
-	pmaddwd   xmm3,[rel PW_F184_MF076]	; xmm3=tmp2H
-
-	movdqa	xmm4,xmm1
-	movdqa	xmm0,xmm2
-	paddd	xmm1,xmm5		; xmm1=tmp10L
-	paddd	xmm2,xmm3		; xmm2=tmp10H
-	psubd	xmm4,xmm5		; xmm4=tmp12L
-	psubd	xmm0,xmm3		; xmm0=tmp12H
-
-	; -- Final output stage
-
-	movdqa	xmm5,xmm1
-	movdqa	xmm3,xmm2
-	paddd	xmm1,xmm6		; xmm1=data0L
-	paddd	xmm2,xmm7		; xmm2=data0H
-	psubd	xmm5,xmm6		; xmm5=data3L
-	psubd	xmm3,xmm7		; xmm3=data3H
-
-	movdqa	xmm6,[rel PD_DESCALE_P1_4]	; xmm6=[rel PD_DESCALE_P1_4]
-
-	paddd	xmm1,xmm6
-	paddd	xmm2,xmm6
-	psrad	xmm1,DESCALE_P1_4
-	psrad	xmm2,DESCALE_P1_4
-	paddd	xmm5,xmm6
-	paddd	xmm3,xmm6
-	psrad	xmm5,DESCALE_P1_4
-	psrad	xmm3,DESCALE_P1_4
-
-	packssdw  xmm1,xmm2		; xmm1=data0=(00 01 02 03 04 05 06 07)
-	packssdw  xmm5,xmm3		; xmm5=data3=(30 31 32 33 34 35 36 37)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H
-
-	movdqa	xmm2,xmm4
-	movdqa	xmm3,xmm0
-	paddd	xmm4,xmm7		; xmm4=data1L
-	paddd	xmm0,xmm6		; xmm0=data1H
-	psubd	xmm2,xmm7		; xmm2=data2L
-	psubd	xmm3,xmm6		; xmm3=data2H
-
-	movdqa	xmm7,[rel PD_DESCALE_P1_4]	; xmm7=[rel PD_DESCALE_P1_4]
-
-	paddd	xmm4,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm4,DESCALE_P1_4
-	psrad	xmm0,DESCALE_P1_4
-	paddd	xmm2,xmm7
-	paddd	xmm3,xmm7
-	psrad	xmm2,DESCALE_P1_4
-	psrad	xmm3,DESCALE_P1_4
-
-	packssdw  xmm4,xmm0		; xmm4=data1=(10 11 12 13 14 15 16 17)
-	packssdw  xmm2,xmm3		; xmm2=data2=(20 21 22 23 24 25 26 27)
-
-	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
-	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
-	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows, store into output array.
-
-	mov	rax, [original_rbp]
-	mov	rdi, r12	; (JSAMPROW *)
-	mov	rax, r13
-
-	; -- Even part
-
-	pxor      xmm4,xmm4
-	punpcklwd xmm4,xmm1		; xmm4=tmp0
-	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
-	; -- Odd part
-
-	punpckhwd xmm1,xmm0
-	punpckhwd xmm6,xmm3
-	movdqa    xmm5,xmm1
-	movdqa    xmm2,xmm6
-	pmaddwd   xmm1,[rel PW_F256_F089]	; xmm1=(tmp2)
-	pmaddwd   xmm6,[rel PW_MF060_MF050]	; xmm6=(tmp2)
-	pmaddwd   xmm5,[rel PW_F106_MF217]	; xmm5=(tmp0)
-	pmaddwd   xmm2,[rel PW_F145_MF021]	; xmm2=(tmp0)
-
-	paddd     xmm6,xmm1		; xmm6=tmp2
-	paddd     xmm2,xmm5		; xmm2=tmp0
-
-	; -- Even part
-
-	punpcklwd xmm0,xmm3
-	pmaddwd   xmm0,[rel PW_F184_MF076]	; xmm0=tmp2
-
-	movdqa    xmm7,xmm4
-	paddd     xmm4,xmm0		; xmm4=tmp10
-	psubd     xmm7,xmm0		; xmm7=tmp12
-
-	; -- Final output stage
-
-	movdqa	xmm1,[rel PD_DESCALE_P2_4]	; xmm1=[rel PD_DESCALE_P2_4]
-
-	movdqa	xmm5,xmm4
-	movdqa	xmm3,xmm7
-	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
-	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
-	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
-	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
-
-	paddd	xmm4,xmm1
-	paddd	xmm7,xmm1
-	psrad	xmm4,DESCALE_P2_4
-	psrad	xmm7,DESCALE_P2_4
-	paddd	xmm5,xmm1
-	paddd	xmm3,xmm1
-	psrad	xmm5,DESCALE_P2_4
-	psrad	xmm3,DESCALE_P2_4
-
-	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
-	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)
-
-	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
-	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)
-
-	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)
-
-	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
-	paddb     xmm4,[rel PB_CENTERJSAMP]
-
-	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
-	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
-	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)
-
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-	movd	XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-	movd	XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-	mov	rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-	movd	XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
-	movd	XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void * dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-	align	16
-	global	EXTN(jsimd_idct_2x2_sse2) PRIVATE
-
-EXTN(jsimd_idct_2x2_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-	push	rbx
-
-	; ---- Pass 1: process columns from input.
-
-	mov	rdx, r10	; quantptr
-	mov	rsi, r11		; inptr
-
-	; | input:                  | result:        |
-	; | 00 01 ** 03 ** 05 ** 07 |                |
-	; | 10 11 ** 13 ** 15 ** 17 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-	; | 50 51 ** 53 ** 55 ** 57 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 70 71 ** 73 ** 75 ** 77 |                |
-
-	; -- Odd part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
-	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
-	pcmpeqd   xmm7,xmm7
-	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
-	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
-	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
-	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
-	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
-	pmaddwd   xmm4,[rel PW_F362_MF127]
-	pmaddwd   xmm5,[rel PW_F085_MF072]
-
-	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
-	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
-	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
-	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
-	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
-	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
-	pmaddwd	xmm0,[rel PW_F362_MF127]
-	pmaddwd	xmm2,[rel PW_F085_MF072]
-
-	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
-	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]
-
-	; -- Even part
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; xmm6=(00 01 ** 03 ** 05 ** 07)
-
-	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
-	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
-	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
-	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
-	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
-	; -- Final output stage
-
-	movdqa	xmm3,xmm6
-	movdqa	xmm5,xmm1
-	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
-	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
-	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
-	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
-	movdqa	xmm2,[rel PD_DESCALE_P1_2]	; xmm2=[rel PD_DESCALE_P1_2]
-
-	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)
-
-	movdqa     xmm7,xmm1
-	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
-	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)
-
-	paddd	xmm6,xmm2
-	psrad	xmm6,DESCALE_P1_2
-
-	paddd	xmm1,xmm2
-	paddd	xmm7,xmm2
-	psrad	xmm1,DESCALE_P1_2
-	psrad	xmm7,DESCALE_P1_2
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows, store into output array.
-
-	mov	rdi, r12	; (JSAMPROW *)
-	mov	rax, r13
-
-	; | input:| result:|
-	; | A0 B0 |        |
-	; | A1 B1 | C0 C1  |
-	; | A3 B3 | D0 D1  |
-	; | A5 B5 |        |
-	; | A7 B7 |        |
-
-	; -- Odd part
-
-	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
-	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
-	pmaddwd   xmm1,[rel PW_F362_MF127]
-	pmaddwd   xmm7,[rel PW_F085_MF072]
-
-	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]
-
-	; -- Even part
-
-	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]
-
-	; -- Final output stage
-
-	movdqa    xmm4,xmm6
-	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
-	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
-	punpckldq xmm6,xmm4	; xmm6=(C0 D0 C1 D1)
-
-	paddd     xmm6,[rel PD_DESCALE_P2_2]
-	psrad     xmm6,DESCALE_P2_2
-
-	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
-	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
-	paddb     xmm6,[rel PB_CENTERJSAMP]
-
-	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --)
-	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)
-
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-	mov	WORD [rdx+rax*SIZEOF_JSAMPLE], bx
-	mov	WORD [rsi+rax*SIZEOF_JSAMPLE], cx
-
-	pop	rbx
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2red.asm b/simd/jiss2red.asm
deleted file mode 100644
index 0e15ea8..0000000
--- a/simd/jiss2red.asm
+++ /dev/null
@@ -1,594 +0,0 @@
-;
-; jiss2red.asm - reduced-size IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211	equ	 1730		; FIX(0.211164243)
-F_0_509	equ	 4176		; FIX(0.509795579)
-F_0_601	equ	 4926		; FIX(0.601344887)
-F_0_720	equ	 5906		; FIX(0.720959822)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_850	equ	 6967		; FIX(0.850430095)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_061	equ	 8697		; FIX(1.061594337)
-F_1_272	equ	10426		; FIX(1.272758580)
-F_1_451	equ	11893		; FIX(1.451774981)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_2_172	equ	17799		; FIX(2.172734803)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_624	equ	29692		; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
-F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
-F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
-F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
-F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
-F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_red_sse2) PRIVATE
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076	times 4 dw  F_1_847,-F_0_765
-PW_F256_F089	times 4 dw  F_2_562, F_0_899
-PW_F106_MF217	times 4 dw  F_1_061,-F_2_172
-PW_MF060_MF050	times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021	times 4 dw  F_1_451,-F_0_211
-PW_F362_MF127	times 4 dw  F_3_624,-F_1_272
-PW_F085_MF072	times 4 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4	times 4 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4	times 4 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2	times 4 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2	times 4 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_idct_4x4_sse2) PRIVATE
-
-EXTN(jsimd_idct_4x4_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	xmm0,xmm1
-	packsswb xmm0,xmm0
-	packsswb xmm0,xmm0
-	movd	eax,xmm0
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	xmm0,PASS1_BITS
-
-	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
-	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
-	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
-	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
-	jmp	near .column_end
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Odd part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa    xmm4,xmm0
-	movdqa    xmm5,xmm0
-	punpcklwd xmm4,xmm1
-	punpckhwd xmm5,xmm1
-	movdqa    xmm0,xmm4
-	movdqa    xmm1,xmm5
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F256_F089)]	; xmm4=(tmp2L)
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F256_F089)]	; xmm5=(tmp2H)
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_F106_MF217)]	; xmm0=(tmp0L)
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F106_MF217)]	; xmm1=(tmp0H)
-
-	movdqa    xmm6,xmm2
-	movdqa    xmm7,xmm2
-	punpcklwd xmm6,xmm3
-	punpckhwd xmm7,xmm3
-	movdqa    xmm2,xmm6
-	movdqa    xmm3,xmm7
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2L)
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm7=(tmp2H)
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0L)
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_F145_MF021)]	; xmm3=(tmp0H)
-
-	paddd	xmm6,xmm4		; xmm6=tmp2L
-	paddd	xmm7,xmm5		; xmm7=tmp2H
-	paddd	xmm2,xmm0		; xmm2=tmp0L
-	paddd	xmm3,xmm1		; xmm3=tmp0H
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H
-
-	; -- Even part
-
-	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	pxor      xmm1,xmm1
-	pxor      xmm2,xmm2
-	punpcklwd xmm1,xmm4		; xmm1=tmp0L
-	punpckhwd xmm2,xmm4		; xmm2=tmp0H
-	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
-	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
-	movdqa    xmm3,xmm5		; xmm5=in2=z2
-	punpcklwd xmm5,xmm0		; xmm0=in6=z3
-	punpckhwd xmm3,xmm0
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F184_MF076)]	; xmm5=tmp2L
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_F184_MF076)]	; xmm3=tmp2H
-
-	movdqa	xmm4,xmm1
-	movdqa	xmm0,xmm2
-	paddd	xmm1,xmm5		; xmm1=tmp10L
-	paddd	xmm2,xmm3		; xmm2=tmp10H
-	psubd	xmm4,xmm5		; xmm4=tmp12L
-	psubd	xmm0,xmm3		; xmm0=tmp12H
-
-	; -- Final output stage
-
-	movdqa	xmm5,xmm1
-	movdqa	xmm3,xmm2
-	paddd	xmm1,xmm6		; xmm1=data0L
-	paddd	xmm2,xmm7		; xmm2=data0H
-	psubd	xmm5,xmm6		; xmm5=data3L
-	psubd	xmm3,xmm7		; xmm3=data3H
-
-	movdqa	xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm6=[PD_DESCALE_P1_4]
-
-	paddd	xmm1,xmm6
-	paddd	xmm2,xmm6
-	psrad	xmm1,DESCALE_P1_4
-	psrad	xmm2,DESCALE_P1_4
-	paddd	xmm5,xmm6
-	paddd	xmm3,xmm6
-	psrad	xmm5,DESCALE_P1_4
-	psrad	xmm3,DESCALE_P1_4
-
-	packssdw  xmm1,xmm2		; xmm1=data0=(00 01 02 03 04 05 06 07)
-	packssdw  xmm5,xmm3		; xmm5=data3=(30 31 32 33 34 35 36 37)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H
-
-	movdqa	xmm2,xmm4
-	movdqa	xmm3,xmm0
-	paddd	xmm4,xmm7		; xmm4=data1L
-	paddd	xmm0,xmm6		; xmm0=data1H
-	psubd	xmm2,xmm7		; xmm2=data2L
-	psubd	xmm3,xmm6		; xmm3=data2H
-
-	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm7=[PD_DESCALE_P1_4]
-
-	paddd	xmm4,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm4,DESCALE_P1_4
-	psrad	xmm0,DESCALE_P1_4
-	paddd	xmm2,xmm7
-	paddd	xmm3,xmm7
-	psrad	xmm2,DESCALE_P1_4
-	psrad	xmm3,DESCALE_P1_4
-
-	packssdw  xmm4,xmm0		; xmm4=data1=(10 11 12 13 14 15 16 17)
-	packssdw  xmm2,xmm3		; xmm2=data2=(20 21 22 23 24 25 26 27)
-
-	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
-	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
-	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows, store into output array.
-
-	mov	eax, [original_ebp]
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-
-	; -- Even part
-
-	pxor      xmm4,xmm4
-	punpcklwd xmm4,xmm1		; xmm4=tmp0
-	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
-	; -- Odd part
-
-	punpckhwd xmm1,xmm0
-	punpckhwd xmm6,xmm3
-	movdqa    xmm5,xmm1
-	movdqa    xmm2,xmm6
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F256_F089)]	; xmm1=(tmp2)
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2)
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F106_MF217)]	; xmm5=(tmp0)
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0)
-
-	paddd     xmm6,xmm1		; xmm6=tmp2
-	paddd     xmm2,xmm5		; xmm2=tmp0
-
-	; -- Even part
-
-	punpcklwd xmm0,xmm3
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_F184_MF076)]	; xmm0=tmp2
-
-	movdqa    xmm7,xmm4
-	paddd     xmm4,xmm0		; xmm4=tmp10
-	psubd     xmm7,xmm0		; xmm7=tmp12
-
-	; -- Final output stage
-
-	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; xmm1=[PD_DESCALE_P2_4]
-
-	movdqa	xmm5,xmm4
-	movdqa	xmm3,xmm7
-	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
-	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
-	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
-	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
-
-	paddd	xmm4,xmm1
-	paddd	xmm7,xmm1
-	psrad	xmm4,DESCALE_P2_4
-	psrad	xmm7,DESCALE_P2_4
-	paddd	xmm5,xmm1
-	paddd	xmm3,xmm1
-	psrad	xmm5,DESCALE_P2_4
-	psrad	xmm3,DESCALE_P2_4
-
-	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
-	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)
-
-	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
-	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)
-
-	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)
-
-	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
-	paddb     xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
-	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
-	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	movd	XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-	movd	XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movd	XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
-	movd	XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-	align	16
-	global	EXTN(jsimd_idct_2x2_sse2) PRIVATE
-
-EXTN(jsimd_idct_2x2_sse2):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input.
-
-	mov	edx, POINTER [dct_table(ebp)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
-
-	; | input:                  | result:        |
-	; | 00 01 ** 03 ** 05 ** 07 |                |
-	; | 10 11 ** 13 ** 15 ** 17 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-	; | 50 51 ** 53 ** 55 ** 57 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 70 71 ** 73 ** 75 ** 77 |                |
-
-	; -- Odd part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
-	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
-	pcmpeqd   xmm7,xmm7
-	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
-	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
-	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
-	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
-	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
-	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
-	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
-	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
-	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
-	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
-	pmaddwd	xmm0,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd	xmm2,[GOTOFF(ebx,PW_F085_MF072)]
-
-	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
-	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]
-
-	; -- Even part
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; xmm6=(00 01 ** 03 ** 05 ** 07)
-
-	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
-	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
-	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
-	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
-	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
-	; -- Final output stage
-
-	movdqa	xmm3,xmm6
-	movdqa	xmm5,xmm1
-	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
-	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
-	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
-	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
-	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; xmm2=[PD_DESCALE_P1_2]
-
-	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)
-
-	movdqa     xmm7,xmm1
-	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
-	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)
-
-	paddd	xmm6,xmm2
-	psrad	xmm6,DESCALE_P1_2
-
-	paddd	xmm1,xmm2
-	paddd	xmm7,xmm2
-	psrad	xmm1,DESCALE_P1_2
-	psrad	xmm7,DESCALE_P1_2
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows, store into output array.
-
-	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(ebp)]
-
-	; | input:| result:|
-	; | A0 B0 |        |
-	; | A1 B1 | C0 C1  |
-	; | A3 B3 | D0 D1  |
-	; | A5 B5 |        |
-	; | A7 B7 |        |
-
-	; -- Odd part
-
-	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
-	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_F085_MF072)]
-
-	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]
-
-	; -- Even part
-
-	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]
-
-	; -- Final output stage
-
-	movdqa    xmm4,xmm6
-	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
-	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
-	punpckldq xmm6,xmm4	; xmm6=(C0 D0 C1 D1)
-
-	paddd     xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
-	psrad     xmm6,DESCALE_P2_2
-
-	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
-	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
-	paddb     xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --)
-	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx
-	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jisseflt.asm b/simd/jisseflt.asm
deleted file mode 100644
index 8faa749..0000000
--- a/simd/jisseflt.asm
+++ /dev/null
@@ -1,572 +0,0 @@
-;
-; jisseflt.asm - floating-point IDCT (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-	shufps	%1,%2,0x44
-%endmacro
-
-%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-	shufps	%1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_float_sse) PRIVATE
-
-EXTN(jconst_idct_float_sse):
-
-PD_1_414	times 4 dd  1.414213562373095048801689
-PD_1_847	times 4 dd  1.847759065022573512256366
-PD_1_082	times 4 dd  1.082392200292393968799446
-PD_M2_613	times 4 dd -2.613125929752753055713286
-PD_0_125	times 4 dd  0.125	; 1/8
-PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse (void * dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-					; FAST_FLOAT workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_float_sse) PRIVATE
-
-EXTN(jsimd_idct_float_sse):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; FAST_FLOAT * wsptr
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	mm1,mm0
-	packsswb mm1,mm1
-	movd	eax,mm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-	punpckhwd mm1,mm0			; mm1=(** 02 ** 03)
-	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
-	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in0H=(02 03)
-	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
-	cvtpi2ps  xmm3,mm1			; xmm3=(02 03 ** **)
-	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
-	movlhps   xmm0,xmm3			; xmm0=in0=(00 01 02 03)
-
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm1,xmm0
-	movaps	xmm2,xmm0
-	movaps	xmm3,xmm0
-
-	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
-	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
-	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
-	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	punpckhwd mm4,mm0			; mm4=(** 02 ** 03)
-	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
-	punpckhwd mm5,mm1			; mm5=(** 22 ** 23)
-	punpcklwd mm1,mm1			; mm1=(20 20 21 21)
-
-	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in0H=(02 03)
-	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
-	cvtpi2ps  xmm4,mm4			; xmm4=(02 03 ** **)
-	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
-	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in2H=(22 23)
-	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in2L=(20 21)
-	cvtpi2ps  xmm5,mm5			; xmm5=(22 23 ** **)
-	cvtpi2ps  xmm1,mm1			; xmm1=(20 21 ** **)
-
-	punpckhwd mm6,mm2			; mm6=(** 42 ** 43)
-	punpcklwd mm2,mm2			; mm2=(40 40 41 41)
-	punpckhwd mm7,mm3			; mm7=(** 62 ** 63)
-	punpcklwd mm3,mm3			; mm3=(60 60 61 61)
-
-	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in4H=(42 43)
-	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in4L=(40 41)
-	cvtpi2ps  xmm6,mm6			; xmm6=(42 43 ** **)
-	cvtpi2ps  xmm2,mm2			; xmm2=(40 41 ** **)
-	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in6H=(62 63)
-	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in6L=(60 61)
-	cvtpi2ps  xmm7,mm7			; xmm7=(62 63 ** **)
-	cvtpi2ps  xmm3,mm3			; xmm3=(60 61 ** **)
-
-	movlhps   xmm0,xmm4			; xmm0=in0=(00 01 02 03)
-	movlhps   xmm1,xmm5			; xmm1=in2=(20 21 22 23)
-	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movlhps   xmm2,xmm6			; xmm2=in4=(40 41 42 43)
-	movlhps   xmm3,xmm7			; xmm3=in6=(60 61 62 63)
-	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	punpckhwd mm6,mm4			; mm6=(** 12 ** 13)
-	punpcklwd mm4,mm4			; mm4=(10 10 11 11)
-	punpckhwd mm2,mm0			; mm2=(** 32 ** 33)
-	punpcklwd mm0,mm0			; mm0=(30 30 31 31)
-
-	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in1H=(12 13)
-	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in1L=(10 11)
-	cvtpi2ps  xmm4,mm6			; xmm4=(12 13 ** **)
-	cvtpi2ps  xmm2,mm4			; xmm2=(10 11 ** **)
-	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in3H=(32 33)
-	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in3L=(30 31)
-	cvtpi2ps  xmm0,mm2			; xmm0=(32 33 ** **)
-	cvtpi2ps  xmm3,mm0			; xmm3=(30 31 ** **)
-
-	punpckhwd mm7,mm5			; mm7=(** 52 ** 53)
-	punpcklwd mm5,mm5			; mm5=(50 50 51 51)
-	punpckhwd mm3,mm1			; mm3=(** 72 ** 73)
-	punpcklwd mm1,mm1			; mm1=(70 70 71 71)
-
-	movlhps   xmm2,xmm4			; xmm2=in1=(10 11 12 13)
-	movlhps   xmm3,xmm0			; xmm3=in3=(30 31 32 33)
-
-	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in5H=(52 53)
-	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in5L=(50 51)
-	cvtpi2ps  xmm4,mm7			; xmm4=(52 53 ** **)
-	cvtpi2ps  xmm5,mm5			; xmm5=(50 51 ** **)
-	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in7H=(72 73)
-	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in7L=(70 71)
-	cvtpi2ps  xmm0,mm3			; xmm0=(72 73 ** **)
-	cvtpi2ps  xmm1,mm1			; xmm1=(70 71 ** **)
-
-	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movlhps   xmm5,xmm4			; xmm5=in5=(50 51 52 53)
-	movlhps   xmm1,xmm0			; xmm1=in7=(70 71 72 73)
-	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4
-	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
-	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
-	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
-	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
-	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
-	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
-	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
-	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
-	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
-	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
-	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
-
-	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
-	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm0,xmm7
-	movaps	xmm3,xmm5
-	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
-	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
-	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
-	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
-
-	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
-	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
-	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
-	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
-	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
-	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
-
-	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
-	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
-	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
-	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
-	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
-	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
-	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
-	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
-	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
-	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
-	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
-	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	lea	esi, [workspace]			; FAST_FLOAT * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.rowloop:
-
-	; -- Even part
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4
-	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
-	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
-	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
-	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
-	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps	xmm1,[GOTOFF(ebx,PD_0_125)]	; xmm1=[PD_0_125]
-
-	mulps	xmm6,xmm1		; descale(1/8)
-	mulps	xmm7,xmm1		; descale(1/8)
-	mulps	xmm5,xmm1		; descale(1/8)
-	mulps	xmm0,xmm1		; descale(1/8)
-
-	movhlps   xmm3,xmm6
-	movhlps   xmm1,xmm7
-	cvtps2pi  mm0,xmm6		; round to int32, mm0=data0L=(00 10)
-	cvtps2pi  mm1,xmm7		; round to int32, mm1=data1L=(01 11)
-	cvtps2pi  mm2,xmm3		; round to int32, mm2=data0H=(20 30)
-	cvtps2pi  mm3,xmm1		; round to int32, mm3=data1H=(21 31)
-	packssdw  mm0,mm2		; mm0=data0=(00 10 20 30)
-	packssdw  mm1,mm3		; mm1=data1=(01 11 21 31)
-
-	movhlps   xmm6,xmm5
-	movhlps   xmm7,xmm0
-	cvtps2pi  mm4,xmm5		; round to int32, mm4=data7L=(07 17)
-	cvtps2pi  mm5,xmm0		; round to int32, mm5=data6L=(06 16)
-	cvtps2pi  mm6,xmm6		; round to int32, mm6=data7H=(27 37)
-	cvtps2pi  mm7,xmm7		; round to int32, mm7=data6H=(26 36)
-	packssdw  mm4,mm6		; mm4=data7=(07 17 27 37)
-	packssdw  mm5,mm7		; mm5=data6=(06 16 26 36)
-
-	packsswb  mm0,mm5		; mm0=(00 10 20 30 06 16 26 36)
-	packsswb  mm1,mm4		; mm1=(01 11 21 31 07 17 27 37)
-
-	movaps	xmm3, XMMWORD [wk(0)]	; xmm3=tmp2
-	movaps	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
-
-	movaps	xmm6,[GOTOFF(ebx,PD_0_125)]	; xmm6=[PD_0_125]
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm5,xmm3
-	movaps	xmm0,xmm1
-	addps	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
-	addps	xmm1,xmm4		; xmm1=data4=(04 14 24 34)
-	subps	xmm5,xmm2		; xmm5=data5=(05 15 25 35)
-	subps	xmm0,xmm4		; xmm0=data3=(03 13 23 33)
-
-	mulps	xmm3,xmm6		; descale(1/8)
-	mulps	xmm1,xmm6		; descale(1/8)
-	mulps	xmm5,xmm6		; descale(1/8)
-	mulps	xmm0,xmm6		; descale(1/8)
-
-	movhlps   xmm7,xmm3
-	movhlps   xmm2,xmm1
-	cvtps2pi  mm2,xmm3		; round to int32, mm2=data2L=(02 12)
-	cvtps2pi  mm3,xmm1		; round to int32, mm3=data4L=(04 14)
-	cvtps2pi  mm6,xmm7		; round to int32, mm6=data2H=(22 32)
-	cvtps2pi  mm7,xmm2		; round to int32, mm7=data4H=(24 34)
-	packssdw  mm2,mm6		; mm2=data2=(02 12 22 32)
-	packssdw  mm3,mm7		; mm3=data4=(04 14 24 34)
-
-	movhlps   xmm4,xmm5
-	movhlps   xmm6,xmm0
-	cvtps2pi  mm5,xmm5		; round to int32, mm5=data5L=(05 15)
-	cvtps2pi  mm4,xmm0		; round to int32, mm4=data3L=(03 13)
-	cvtps2pi  mm6,xmm4		; round to int32, mm6=data5H=(25 35)
-	cvtps2pi  mm7,xmm6		; round to int32, mm7=data3H=(23 33)
-	packssdw  mm5,mm6		; mm5=data5=(05 15 25 35)
-	packssdw  mm4,mm7		; mm4=data3=(03 13 23 33)
-
-	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
-
-	packsswb  mm2,mm3		; mm2=(02 12 22 32 04 14 24 34)
-	packsswb  mm4,mm5		; mm4=(03 13 23 33 05 15 25 35)
-
-	paddb     mm0,mm6
-	paddb     mm1,mm6
-	paddb     mm2,mm6
-	paddb     mm4,mm6
-
-	movq      mm7,mm0		; transpose coefficients(phase 1)
-	punpcklbw mm0,mm1		; mm0=(00 01 10 11 20 21 30 31)
-	punpckhbw mm7,mm1		; mm7=(06 07 16 17 26 27 36 37)
-	movq      mm3,mm2		; transpose coefficients(phase 1)
-	punpcklbw mm2,mm4		; mm2=(02 03 12 13 22 23 32 33)
-	punpckhbw mm3,mm4		; mm3=(04 05 14 15 24 25 34 35)
-
-	movq      mm5,mm0		; transpose coefficients(phase 2)
-	punpcklwd mm0,mm2		; mm0=(00 01 02 03 10 11 12 13)
-	punpckhwd mm5,mm2		; mm5=(20 21 22 23 30 31 32 33)
-	movq      mm6,mm3		; transpose coefficients(phase 2)
-	punpcklwd mm3,mm7		; mm3=(04 05 06 07 14 15 16 17)
-	punpckhwd mm6,mm7		; mm6=(24 25 26 27 34 35 36 37)
-
-	movq      mm1,mm0		; transpose coefficients(phase 3)
-	punpckldq mm0,mm3		; mm0=(00 01 02 03 04 05 06 07)
-	punpckhdq mm1,mm3		; mm1=(10 11 12 13 14 15 16 17)
-	movq      mm4,mm5		; transpose coefficients(phase 3)
-	punpckldq mm5,mm6		; mm5=(20 21 22 23 24 25 26 27)
-	punpckhdq mm4,mm6		; mm4=(30 31 32 33 34 35 36 37)
-
-	pushpic	ebx			; save GOT address
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
-	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
-	poppic	ebx			; restore GOT address
-
-	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
-	add	edi, byte 4*SIZEOF_JSAMPROW
-	dec	ecx				; ctr
-	jnz	near .rowloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jsimdcfg.inc.h b/simd/jsimdcfg.inc.h
deleted file mode 100644
index 583b7e3..0000000
--- a/simd/jsimdcfg.inc.h
+++ /dev/null
@@ -1,196 +0,0 @@
-// This file generates the include file for the assembly
-// implementations by abusing the C preprocessor.
-//
-// Note: Some things are manually defined as they need to
-// be mapped to NASM types.
-
-;
-; Automatically generated include file from jsimdcfg.inc.h
-;
-
-#define JPEG_INTERNALS
-
-#include "../jpeglib.h"
-#include "../jconfig.h"
-#include "../jmorecfg.h"
-#include "jsimd.h"
-
-;
-; -- jpeglib.h
-;
-
-%define _cpp_protection_DCTSIZE DCTSIZE
-%define _cpp_protection_DCTSIZE2 DCTSIZE2
-
-;
-; -- jmorecfg.h
-;
-
-%define _cpp_protection_RGB_RED RGB_RED
-%define _cpp_protection_RGB_GREEN RGB_GREEN
-%define _cpp_protection_RGB_BLUE RGB_BLUE
-%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
-
-%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
-%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
-%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
-%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-
-%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
-%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
-%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
-%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
-
-%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
-%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
-%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
-%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
-
-%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
-%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
-%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
-%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
-
-%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
-%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
-%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
-%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
-
-%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
-%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
-%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
-%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-
-%define RGBX_FILLER_0XFF        1
-
-; Representation of a single sample (pixel element value).
-; On this SIMD implementation, this must be 'unsigned char'.
-;
-
-%define JSAMPLE                 byte          ; unsigned char
-%define SIZEOF_JSAMPLE          SIZEOF_BYTE   ; sizeof(JSAMPLE)
-
-%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
-
-; Representation of a DCT frequency coefficient.
-; On this SIMD implementation, this must be 'short'.
-;
-%define JCOEF                   word          ; short
-%define SIZEOF_JCOEF            SIZEOF_WORD   ; sizeof(JCOEF)
-
-; Datatype used for image dimensions.
-; On this SIMD implementation, this must be 'unsigned int'.
-;
-%define JDIMENSION              dword         ; unsigned int
-%define SIZEOF_JDIMENSION       SIZEOF_DWORD  ; sizeof(JDIMENSION)
-
-%define JSAMPROW                POINTER       ; JSAMPLE FAR * (jpeglib.h)
-%define JSAMPARRAY              POINTER       ; JSAMPROW *    (jpeglib.h)
-%define JSAMPIMAGE              POINTER       ; JSAMPARRAY *  (jpeglib.h)
-%define JCOEFPTR                POINTER       ; JCOEF FAR *   (jpeglib.h)
-%define SIZEOF_JSAMPROW         SIZEOF_POINTER  ; sizeof(JSAMPROW)
-%define SIZEOF_JSAMPARRAY       SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
-%define SIZEOF_JSAMPIMAGE       SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
-%define SIZEOF_JCOEFPTR         SIZEOF_POINTER  ; sizeof(JCOEFPTR)
-
-;
-; -- jdct.h
-;
-
-; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
-; the DCT is to be performed in-place in that buffer.
-; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
-;
-%define DCTELEM                 word          ; short
-%define SIZEOF_DCTELEM          SIZEOF_WORD   ; sizeof(DCTELEM)
-
-%define FAST_FLOAT              FP32            ; float
-%define SIZEOF_FAST_FLOAT       SIZEOF_FP32     ; sizeof(FAST_FLOAT)
-
-; To maximize parallelism, Type MULTIPLIER is changed to short.
-;
-%define ISLOW_MULT_TYPE         word          ; must be short
-%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD   ; sizeof(ISLOW_MULT_TYPE)
-
-%define IFAST_MULT_TYPE         word          ; must be short
-%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD   ; sizeof(IFAST_MULT_TYPE)
-%define IFAST_SCALE_BITS        2             ; fractional bits in scale factors
-
-%define FLOAT_MULT_TYPE         FP32          ; must be float
-%define SIZEOF_FLOAT_MULT_TYPE  SIZEOF_FP32   ; sizeof(FLOAT_MULT_TYPE)
-
-;
-; -- jsimd.h
-;
-
-%define _cpp_protection_JSIMD_NONE JSIMD_NONE
-%define _cpp_protection_JSIMD_MMX JSIMD_MMX
-%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
-%define _cpp_protection_JSIMD_SSE JSIMD_SSE
-%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
-
-; Short forms of external names for systems with brain-damaged linkers.
-;
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-%define _cpp_protection_jpeg_simd_cpu_support jpeg_simd_cpu_support
-%define _cpp_protection_jsimd_rgb_ycc_convert_mmx jsimd_rgb_ycc_convert_mmx
-%define _cpp_protection_jsimd_ycc_rgb_convert_mmx jsimd_ycc_rgb_convert_mmx
-%define _cpp_protection_jconst_rgb_ycc_convert_sse2 jconst_rgb_ycc_convert_sse2
-%define _cpp_protection_jsimd_rgb_ycc_convert_sse2 jsimd_rgb_ycc_convert_sse2
-%define _cpp_protection_jconst_ycc_rgb_convert_sse2 jconst_ycc_rgb_convert_sse2
-%define _cpp_protection_jsimd_ycc_rgb_convert_sse2 jsimd_ycc_rgb_convert_sse2
-%define _cpp_protection_jsimd_h2v2_downsample_mmx jsimd_h2v2_downsample_mmx
-%define _cpp_protection_jsimd_h2v1_downsample_mmx jsimd_h2v1_downsample_mmx
-%define _cpp_protection_jsimd_h2v2_downsample_sse2 jsimd_h2v2_downsample_sse2
-%define _cpp_protection_jsimd_h2v1_downsample_sse2 jsimd_h2v1_downsample_sse2
-%define _cpp_protection_jsimd_h2v2_upsample_mmx jsimd_h2v2_upsample_mmx
-%define _cpp_protection_jsimd_h2v1_upsample_mmx jsimd_h2v1_upsample_mmx
-%define _cpp_protection_jsimd_h2v1_fancy_upsample_mmx jsimd_h2v1_fancy_upsample_mmx
-%define _cpp_protection_jsimd_h2v2_fancy_upsample_mmx jsimd_h2v2_fancy_upsample_mmx
-%define _cpp_protection_jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_merged_upsample_mmx
-%define _cpp_protection_jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_merged_upsample_mmx
-%define _cpp_protection_jsimd_h2v2_upsample_sse2 jsimd_h2v2_upsample_sse2
-%define _cpp_protection_jsimd_h2v1_upsample_sse2 jsimd_h2v1_upsample_sse2
-%define _cpp_protection_jconst_fancy_upsample_sse2 jconst_fancy_upsample_sse2
-%define _cpp_protection_jsimd_h2v1_fancy_upsample_sse2 jsimd_h2v1_fancy_upsample_sse2
-%define _cpp_protection_jsimd_h2v2_fancy_upsample_sse2 jsimd_h2v2_fancy_upsample_sse2
-%define _cpp_protection_jconst_merged_upsample_sse2 jconst_merged_upsample_sse2
-%define _cpp_protection_jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_merged_upsample_sse2
-%define _cpp_protection_jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_merged_upsample_sse2
-%define _cpp_protection_jsimd_convsamp_mmx jsimd_convsamp_mmx
-%define _cpp_protection_jsimd_convsamp_sse2 jsimd_convsamp_sse2
-%define _cpp_protection_jsimd_convsamp_float_3dnow jsimd_convsamp_float_3dnow
-%define _cpp_protection_jsimd_convsamp_float_sse jsimd_convsamp_float_sse
-%define _cpp_protection_jsimd_convsamp_float_sse2 jsimd_convsamp_float_sse2
-%define _cpp_protection_jsimd_fdct_islow_mmx jsimd_fdct_islow_mmx
-%define _cpp_protection_jsimd_fdct_ifast_mmx jsimd_fdct_ifast_mmx
-%define _cpp_protection_jconst_fdct_islow_sse2 jconst_fdct_islow_sse2
-%define _cpp_protection_jsimd_fdct_islow_sse2 jsimd_fdct_islow_sse2
-%define _cpp_protection_jconst_fdct_ifast_sse2 jconst_fdct_ifast_sse2
-%define _cpp_protection_jsimd_fdct_ifast_sse2 jsimd_fdct_ifast_sse2
-%define _cpp_protection_jsimd_fdct_float_3dnow jsimd_fdct_float_3dnow
-%define _cpp_protection_jconst_fdct_float_sse jconst_fdct_float_sse
-%define _cpp_protection_jsimd_fdct_float_sse jsimd_fdct_float_sse
-%define _cpp_protection_jsimd_quantize_mmx jsimd_quantize_mmx
-%define _cpp_protection_jsimd_quantize_sse2 jsimd_quantize_sse2
-%define _cpp_protection_jsimd_quantize_float_3dnow jsimd_quantize_float_3dnow
-%define _cpp_protection_jsimd_quantize_float_sse jsimd_quantize_float_sse
-%define _cpp_protection_jsimd_quantize_float_sse2 jsimd_quantize_float_sse2
-%define _cpp_protection_jsimd_idct_2x2_mmx jsimd_idct_2x2_mmx
-%define _cpp_protection_jsimd_idct_4x4_mmx jsimd_idct_4x4_mmx
-%define _cpp_protection_jconst_idct_red_sse2 jconst_idct_red_sse2
-%define _cpp_protection_jsimd_idct_2x2_sse2 jsimd_idct_2x2_sse2
-%define _cpp_protection_jsimd_idct_4x4_sse2 jsimd_idct_4x4_sse2
-%define _cpp_protection_jsimd_idct_islow_mmx jsimd_idct_islow_mmx
-%define _cpp_protection_jsimd_idct_ifast_mmx jsimd_idct_ifast_mmx
-%define _cpp_protection_jconst_idct_islow_sse2 jconst_idct_islow_sse2
-%define _cpp_protection_jsimd_idct_islow_sse2 jsimd_idct_islow_sse2
-%define _cpp_protection_jconst_idct_ifast_sse2 jconst_idct_ifast_sse2
-%define _cpp_protection_jsimd_idct_ifast_sse2 jsimd_idct_ifast_sse2
-%define _cpp_protection_jsimd_idct_float_3dnow jsimd_idct_float_3dnow
-%define _cpp_protection_jconst_idct_float_sse jconst_idct_float_sse
-%define _cpp_protection_jsimd_idct_float_sse jsimd_idct_float_sse
-%define _cpp_protection_jconst_idct_float_sse2 jconst_idct_float_sse2
-%define _cpp_protection_jsimd_idct_float_sse2 jsimd_idct_float_sse2
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
diff --git a/turbojpegl.c b/turbojpegl.c
deleted file mode 100644
index 2150a2d..0000000
--- a/turbojpegl.c
+++ /dev/null
@@ -1,363 +0,0 @@
-/* Copyright (C)2004 Landmark Graphics Corporation
- * Copyright (C)2005 Sun Microsystems, Inc.
- * Copyright (C)2009 D. R. Commander
- *
- * This library is free software and may be redistributed and/or modified under
- * the terms of the wxWindows Library License, Version 3.1 or (at your option)
- * any later version.  The full license is in the LICENSE.txt file included
- * with this distribution.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * wxWindows Library License for more details.
- */
-
-// This implements a JPEG compressor/decompressor using the libjpeg API
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <jpeglib.h>
-#include <jerror.h>
-#include <setjmp.h>
-#include "./turbojpeg.h"
-
-
-// Error handling
-
-static char lasterror[JMSG_LENGTH_MAX]="No error";
-
-typedef struct _error_mgr
-{
-	struct jpeg_error_mgr pub;
-	jmp_buf jb;
-} error_mgr;
-
-static void my_error_exit(j_common_ptr cinfo)
-{
-	error_mgr *myerr = (error_mgr *)cinfo->err;
-	(*cinfo->err->output_message)(cinfo);
-	longjmp(myerr->jb, 1);
-}
-
-static void my_output_message(j_common_ptr cinfo)
-{
-	(*cinfo->err->format_message)(cinfo, lasterror);
-}
-
-
-// Global structures, macros, etc.
-
-typedef struct _jpgstruct
-{
-	struct jpeg_compress_struct cinfo;
-	struct jpeg_decompress_struct dinfo;
-	struct jpeg_destination_mgr jdms;
-	struct jpeg_source_mgr jsms;
-	error_mgr jerr;
-	int initc, initd;
-} jpgstruct;
-
-static const int hsampfactor[NUMSUBOPT]={1, 2, 2, 1};
-static const int vsampfactor[NUMSUBOPT]={1, 1, 2, 1};
-
-#define _throw(c) {sprintf(lasterror, "%s", c);  return -1;}
-#define _catch(f) {if((f)==-1) return -1;}
-#define checkhandle(h) jpgstruct *j=(jpgstruct *)h; \
-	if(!j) _throw("Invalid handle");
-
-
-// CO
-
-static boolean empty_output_buffer(struct jpeg_compress_struct *cinfo)
-{
-	ERREXIT(cinfo, JERR_BUFFER_SIZE);
-	return TRUE;
-}
-
-static void destination_noop(struct jpeg_compress_struct *cinfo)
-{
-}
-
-DLLEXPORT tjhandle DLLCALL tjInitCompress(void)
-{
-	jpgstruct *j=NULL;
-	if((j=(jpgstruct *)malloc(sizeof(jpgstruct)))==NULL)
-		{sprintf(lasterror, "Memory allocation failure");  return NULL;}
-	memset(j, 0, sizeof(jpgstruct));
-	j->cinfo.err=jpeg_std_error(&j->jerr.pub);
-	j->jerr.pub.error_exit=my_error_exit;
-	j->jerr.pub.output_message=my_output_message;
-
-	if(setjmp(j->jerr.jb))
-	{ // this will execute if LIBJPEG has an error
-		if(j) free(j);  return NULL;
-  }
-
-	jpeg_create_compress(&j->cinfo);
-	j->cinfo.dest=&j->jdms;
-	j->jdms.init_destination=destination_noop;
-	j->jdms.empty_output_buffer=empty_output_buffer;
-	j->jdms.term_destination=destination_noop;
-
-	j->initc=1;
-	return (tjhandle)j;
-}
-
-DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height)
-{
-	// This allows enough room in case the image doesn't compress
-	return ((width+15)&(~15)) * ((height+15)&(~15)) * 6 + 2048;
-}
-
-DLLEXPORT int DLLCALL tjCompress(tjhandle h,
-	unsigned char *srcbuf, int width, int pitch, int height, int ps,
-	unsigned char *dstbuf, unsigned long *size,
-	int jpegsub, int qual, int flags)
-{
-	int i;  JSAMPROW *row_pointer=NULL;
-
-	checkhandle(h);
-
-	if(srcbuf==NULL || width<=0 || pitch<0 || height<=0
-		|| dstbuf==NULL || size==NULL
-		|| jpegsub<0 || jpegsub>=NUMSUBOPT || qual<0 || qual>100)
-		_throw("Invalid argument in tjCompress()");
-	if(ps!=3 && ps!=4) _throw("This compressor can only take 24-bit or 32-bit RGB input");
-	if(!j->initc) _throw("Instance has not been initialized for compression");
-
-	if(pitch==0) pitch=width*ps;
-
-	j->cinfo.image_width = width;
-	j->cinfo.image_height = height;
-	j->cinfo.input_components = ps;
-
-	#if JCS_EXTENSIONS==1
-	j->cinfo.in_color_space = JCS_EXT_RGB;
-	if(ps==3 && (flags&TJ_BGR))
-		j->cinfo.in_color_space = JCS_EXT_BGR;
-	else if(ps==4 && !(flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
-		j->cinfo.in_color_space = JCS_EXT_RGBX;
-	else if(ps==4 && (flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
-		j->cinfo.in_color_space = JCS_EXT_BGRX;
-	else if(ps==4 && (flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
-		j->cinfo.in_color_space = JCS_EXT_XBGR;
-	else if(ps==4 && !(flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
-		j->cinfo.in_color_space = JCS_EXT_XRGB;
-	#else
-	#error "TurboJPEG requires JPEG colorspace extensions"
-	#endif
-
-	if(flags&TJ_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJ_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJ_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
-
-	if(setjmp(j->jerr.jb))
-	{  // this will execute if LIBJPEG has an error
-		if(row_pointer) free(row_pointer);
-		return -1;
-  }
-
-	jpeg_set_defaults(&j->cinfo);
-
-	jpeg_set_quality(&j->cinfo, qual, TRUE);
-	if(jpegsub==TJ_GRAYSCALE)
-		jpeg_set_colorspace(&j->cinfo, JCS_GRAYSCALE);
-	else
-		jpeg_set_colorspace(&j->cinfo, JCS_YCbCr);
-	j->cinfo.dct_method = JDCT_FASTEST;
-
-	j->cinfo.comp_info[0].h_samp_factor=hsampfactor[jpegsub];
-	j->cinfo.comp_info[1].h_samp_factor=1;
-	j->cinfo.comp_info[2].h_samp_factor=1;
-	j->cinfo.comp_info[0].v_samp_factor=vsampfactor[jpegsub];
-	j->cinfo.comp_info[1].v_samp_factor=1;
-	j->cinfo.comp_info[2].v_samp_factor=1;
-
-	j->jdms.next_output_byte = dstbuf;
-	j->jdms.free_in_buffer = TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height);
-
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
-		_throw("Memory allocation failed in tjInitCompress()");
-	for(i=0; i<height; i++)
-	{
-		if(flags&TJ_BOTTOMUP) row_pointer[i]= &srcbuf[(height-i-1)*pitch];
-		else row_pointer[i]= &srcbuf[i*pitch];
-	}
-	jpeg_start_compress(&j->cinfo, TRUE);
-	while(j->cinfo.next_scanline<j->cinfo.image_height)
-	{
-		jpeg_write_scanlines(&j->cinfo, &row_pointer[j->cinfo.next_scanline],
-			j->cinfo.image_height-j->cinfo.next_scanline);
-	}
-	jpeg_finish_compress(&j->cinfo);
-	*size=TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height)
-		-(unsigned long)(j->jdms.free_in_buffer);
-
-	if(row_pointer) free(row_pointer);
-	return 0;
-}
-
-
-// DEC
-
-static boolean fill_input_buffer (struct jpeg_decompress_struct *dinfo)
-{
-	ERREXIT(dinfo, JERR_BUFFER_SIZE);
-	return TRUE;
-}
-
-static void skip_input_data (struct jpeg_decompress_struct *dinfo, long num_bytes)
-{
-	dinfo->src->next_input_byte += (size_t) num_bytes;
-	dinfo->src->bytes_in_buffer -= (size_t) num_bytes;
-}
-
-static void source_noop (struct jpeg_decompress_struct *dinfo)
-{
-}
-
-DLLEXPORT tjhandle DLLCALL tjInitDecompress(void)
-{
-	jpgstruct *j;
-	if((j=(jpgstruct *)malloc(sizeof(jpgstruct)))==NULL)
-		{sprintf(lasterror, "Memory allocation failure");  return NULL;}
-	memset(j, 0, sizeof(jpgstruct));
-	j->dinfo.err=jpeg_std_error(&j->jerr.pub);
-	j->jerr.pub.error_exit=my_error_exit;
-	j->jerr.pub.output_message=my_output_message;
-
-	if(setjmp(j->jerr.jb))
-	{ // this will execute if LIBJPEG has an error
-		free(j);  return NULL;
-  }
-
-	jpeg_create_decompress(&j->dinfo);
-	j->dinfo.src=&j->jsms;
-	j->jsms.init_source=source_noop;
-	j->jsms.fill_input_buffer = fill_input_buffer;
-	j->jsms.skip_input_data = skip_input_data;
-	j->jsms.resync_to_restart = jpeg_resync_to_restart;
-	j->jsms.term_source = source_noop;
-
-	j->initd=1;
-	return (tjhandle)j;
-}
-
-
-DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle h,
-	unsigned char *srcbuf, unsigned long size,
-	int *width, int *height)
-{
-	checkhandle(h);
-
-	if(srcbuf==NULL || size<=0 || width==NULL || height==NULL)
-		_throw("Invalid argument in tjDecompressHeader()");
-	if(!j->initd) _throw("Instance has not been initialized for decompression");
-
-	if(setjmp(j->jerr.jb))
-	{  // this will execute if LIBJPEG has an error
-		return -1;
-	}
-
-	j->jsms.bytes_in_buffer = size;
-	j->jsms.next_input_byte = srcbuf;
-
-	jpeg_read_header(&j->dinfo, TRUE);
-
-	*width=j->dinfo.image_width;  *height=j->dinfo.image_height;
-
-	jpeg_abort_decompress(&j->dinfo);
-
-	if(*width<1 || *height<1) _throw("Invalid data returned in header");
-	return 0;
-}
-
-
-DLLEXPORT int DLLCALL tjDecompress(tjhandle h,
-	unsigned char *srcbuf, unsigned long size,
-	unsigned char *dstbuf, int width, int pitch, int height, int ps,
-	int flags)
-{
-	int i;  JSAMPROW *row_pointer=NULL;
-
-	checkhandle(h);
-
-	if(srcbuf==NULL || size<=0
-		|| dstbuf==NULL || width<=0 || pitch<0 || height<=0)
-		_throw("Invalid argument in tjDecompress()");
-	if(ps!=3 && ps!=4) _throw("This compressor can only take 24-bit or 32-bit RGB input");
-	if(!j->initd) _throw("Instance has not been initialized for decompression");
-
-	if(pitch==0) pitch=width*ps;
-
-	if(flags&TJ_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJ_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJ_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
-
-	if(setjmp(j->jerr.jb))
-	{  // this will execute if LIBJPEG has an error
-		if(row_pointer) free(row_pointer);
-		return -1;
-  }
-
-	j->jsms.bytes_in_buffer = size;
-	j->jsms.next_input_byte = srcbuf;
-
-	jpeg_read_header(&j->dinfo, TRUE);
-
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
-		_throw("Memory allocation failed in tjInitDecompress()");
-	for(i=0; i<height; i++)
-	{
-		if(flags&TJ_BOTTOMUP) row_pointer[i]= &dstbuf[(height-i-1)*pitch];
-		else row_pointer[i]= &dstbuf[i*pitch];
-	}
-
-	#if JCS_EXTENSIONS==1
-	j->dinfo.out_color_space = JCS_EXT_RGB;
-	if(ps==3 && (flags&TJ_BGR))
-		j->dinfo.out_color_space = JCS_EXT_BGR;
-	else if(ps==4 && !(flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
-		j->dinfo.out_color_space = JCS_EXT_RGBX;
-	else if(ps==4 && (flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
-		j->dinfo.out_color_space = JCS_EXT_BGRX;
-	else if(ps==4 && (flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
-		j->dinfo.out_color_space = JCS_EXT_XBGR;
-	else if(ps==4 && !(flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
-		j->dinfo.out_color_space = JCS_EXT_XRGB;
-	#else
-	#error "TurboJPEG requires JPEG colorspace extensions"
-	#endif
-	if(flags&TJ_FASTUPSAMPLE) j->dinfo.do_fancy_upsampling=FALSE;
-
-	jpeg_start_decompress(&j->dinfo);
-	while(j->dinfo.output_scanline<j->dinfo.output_height)
-	{
-		jpeg_read_scanlines(&j->dinfo, &row_pointer[j->dinfo.output_scanline],
-			j->dinfo.output_height-j->dinfo.output_scanline);
-	}
-	jpeg_finish_decompress(&j->dinfo);
-
-	if(row_pointer) free(row_pointer);
-	return 0;
-}
-
-
-// General
-
-DLLEXPORT char* DLLCALL tjGetErrorStr(void)
-{
-	return lasterror;
-}
-
-DLLEXPORT int DLLCALL tjDestroy(tjhandle h)
-{
-	checkhandle(h);
-	if(setjmp(j->jerr.jb)) return -1;
-	if(j->initc) jpeg_destroy_compress(&j->cinfo);
-	if(j->initd) jpeg_destroy_decompress(&j->dinfo);
-	free(j);
-	return 0;
-}
diff --git a/win/jsimdcfg.inc b/win/jsimdcfg.inc
old mode 100644
new mode 100755