blob: 7dbbf532a92a8dd0804c93e0dd4bceb8031ccb08 [file] [log] [blame]
/*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3 *
4 * This code is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License version 2 only, as
6 * published by the Free Software Foundation. Sun designates this
7 * particular file as subject to the "Classpath" exception as provided
8 * by Sun in the LICENSE file that accompanied this code.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 */
24
25/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
26 *
27 * This file is available under and governed by the GNU General Public
28 * License version 2 only, as published by the Free Software Foundation.
29 * However, the following notice accompanied the original version of this
30 * file and, per its terms, should not be removed:
31 *
32 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
33 *
34 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
35 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
36 * for Intel's performance analysis of the MMX vs. non-MMX code.
37 *
38 * Last changed in libpng 1.2.15 January 5, 2007
39 * For conditions of distribution and use, see copyright notice in png.h
40 * Copyright (c) 1998-2007 Glenn Randers-Pehrson
41 * Copyright (c) 1998, Intel Corporation
42 *
43 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
44 * Interface to libpng contributed by Gilles Vollant, 1999.
45 * GNU C port by Greg Roelofs, 1999-2001.
46 *
47 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
48 *
49 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
50 *
51 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
52 *
53 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
54 * is required to assemble the newer MMX instructions such as movq.
55 * For djgpp, see
56 *
57 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
58 *
59 * (or a later version in the same directory). For Linux, check your
60 * distribution's web site(s) or try these links:
61 *
62 * http://rufus.w3.org/linux/RPM/binutils.html
63 * http://www.debian.org/Packages/stable/devel/binutils.html
64 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
65 * binutils.tgz
66 *
67 * For other platforms, see the main GNU site:
68 *
69 * ftp://ftp.gnu.org/pub/gnu/binutils/
70 *
71 * Version 2.5.2l.15 is definitely too old...
72 */
73
74/*
75 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
76 * =====================================
77 *
78 * 19991006:
79 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
80 *
81 * 19991007:
82 * - additional optimizations (possible or definite):
83 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
84 * - write MMX code for 48-bit case (pixel_bytes == 6)
85 * - figure out what's up with 24-bit case (pixel_bytes == 3):
86 * why subtract 8 from width_mmx in the pass 4/5 case?
87 * (only width_mmx case) (near line 1606)
88 * x [DONE] replace pixel_bytes within each block with the true
89 * constant value (or are compilers smart enough to do that?)
90 * - rewrite all MMX interlacing code so it's aligned with
91 * the *beginning* of the row buffer, not the end. This
92 * would not only allow one to eliminate half of the memory
93 * writes for odd passes (that is, pass == odd), it may also
94 * eliminate some unaligned-data-access exceptions (assuming
95 * there's a penalty for not aligning 64-bit accesses on
96 * 64-bit boundaries). The only catch is that the "leftover"
97 * pixel(s) at the end of the row would have to be saved,
98 * but there are enough unused MMX registers in every case,
99 * so this is not a problem. A further benefit is that the
100 * post-MMX cleanup code (C code) in at least some of the
101 * cases could be done within the assembler block.
102 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
103 * inconsistent, and don't match the MMX Programmer's Reference
104 * Manual conventions anyway. They should be changed to
105 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
106 * was lowest in memory (e.g., corresponding to a left pixel)
107 * and b7 is the byte that was highest (e.g., a right pixel).
108 *
109 * 19991016:
110 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
111 * want globals prefixed by underscores when referencing them--
112 * i.e., if the variable is const4, then refer to it as const4,
113 * not _const4. This seems to be a djgpp-specific requirement.
114 * Also, such variables apparently *must* be declared outside
115 * of functions; neither static nor automatic variables work if
116 * defined within the scope of a single function, but both
117 * static and truly global (multi-module) variables work fine.
118 *
119 * 19991023:
120 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
121 * - switched from string-concatenation-with-macros to cleaner method of
122 * renaming global variables for djgpp--i.e., always use prefixes in
123 * inlined assembler code (== strings) and conditionally rename the
124 * variables, not the other way around. Hence _const4, _mask8_0, etc.
125 *
126 * 19991024:
127 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
128 * This one was severely weird: even though mmxsupport() doesn't touch
129 * ebx (where "row" pointer was stored), it nevertheless managed to zero
130 * the register (even in static/non-fPIC code--see below), which in turn
131 * caused png_do_read_interlace() to return prematurely on the first row of
132 * interlaced images (i.e., without expanding the interlaced pixels).
133 * Inspection of the generated assembly code didn't turn up any clues,
134 * although it did point at a minor optimization (i.e., get rid of
135 * mmx_supported_local variable and just use eax). Possibly the CPUID
136 * instruction is more destructive than it looks? (Not yet checked.)
137 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
138 * listings... Apparently register spillage has to do with ebx, since
139 * it's used to index the global offset table. Commenting it out of the
140 * input-reg lists in png_combine_row() eliminated compiler barfage, so
141 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
142 *
143 * 19991107:
144 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
145 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
146 *
147 * 19991120:
148 * - made "diff" variable (now "_dif") global to simplify conversion of
149 * filtering routines (running out of regs, sigh). "diff" is still used
150 * in interlacing routines, however.
151 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
152 * macro determines which is used); original not yet tested.
153 *
154 * 20000213:
155 * - when compiling with gcc, be sure to use -fomit-frame-pointer
156 *
157 * 20000319:
158 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
159 * pass == 4 or 5, that caused visible corruption of interlaced images
160 *
161 * 20000623:
162 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
163 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
164 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
165 * Chuck Wilson supplied a patch involving dummy output registers. See
166 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
167 * for the original (anonymous) SourceForge bug report.
168 *
169 * 20000706:
170 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
171 * pnggccrd.c: In function `png_combine_row':
172 * pnggccrd.c:525: more than 10 operands in `asm'
173 * pnggccrd.c:669: more than 10 operands in `asm'
174 * pnggccrd.c:828: more than 10 operands in `asm'
175 * pnggccrd.c:994: more than 10 operands in `asm'
176 * pnggccrd.c:1177: more than 10 operands in `asm'
177 * They are all the same problem and can be worked around by using the
178 * global _unmask variable unconditionally, not just in the -fPIC case.
179 * Reportedly earlier versions of gcc also have the problem with more than
180 * 10 operands; they just don't report it. Much strangeness ensues, etc.
181 *
182 * 20000729:
183 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
184 * MMX routine); began converting png_read_filter_row_mmx_sub()
185 * - to finish remaining sections:
186 * - clean up indentation and comments
187 * - preload local variables
188 * - add output and input regs (order of former determines numerical
189 * mapping of latter)
190 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
191 * - remove "$" from addressing of Shift and Mask variables [20000823]
192 *
193 * 20000731:
194 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
195 *
196 * 20000822:
197 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
198 * shared-library (-fPIC) version! Code works just fine as part of static
199 * library. Damn damn damn damn damn, should have tested that sooner.
200 * ebx is getting clobbered again (explicitly this time); need to save it
201 * on stack or rewrite asm code to avoid using it altogether. Blargh!
202 *
203 * 20000823:
204 * - first section was trickiest; all remaining sections have ebx -> edx now.
205 * (-fPIC works again.) Also added missing underscores to various Shift*
206 * and *Mask* globals and got rid of leading "$" signs.
207 *
208 * 20000826:
209 * - added visual separators to help navigate microscopic printed copies
210 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
211 * on png_read_filter_row_mmx_avg()
212 *
213 * 20000828:
214 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
215 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
216 * cleaned up/shortened in either routine, but functionality is complete
217 * and seems to be working fine.
218 *
219 * 20000829:
220 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
221 * as an input reg (with dummy output variables, etc.), then it *cannot*
222 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
223 * is simple enough...
224 *
225 * 20000914:
226 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
227 * correctly (but 48-bit RGB just fine)
228 *
229 * 20000916:
230 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
231 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
232 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
233 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
234 *
235 * 20010101:
236 * - added new png_init_mmx_flags() function (here only because it needs to
237 * call mmxsupport(), which should probably become global png_mmxsupport());
238 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
239 *
240 * 20010103:
241 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
242 * and made it public; moved png_init_mmx_flags() to png.c as internal func
243 *
244 * 20010104:
245 * - removed dependency on png_read_filter_row_c() (C code already duplicated
246 * within MMX version of png_read_filter_row()) so no longer necessary to
247 * compile it into pngrutil.o
248 *
249 * 20010310:
250 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
251 *
252 * 20020304:
253 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
254 *
255 * 20040724:
256 * - more tinkering with clobber list at lines 4529 and 5033, to get
257 * it to compile on gcc-3.4.
258 *
259 * STILL TO DO:
260 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
261 * - write MMX code for 48-bit case (pixel_bytes == 6)
262 * - figure out what's up with 24-bit case (pixel_bytes == 3):
263 * why subtract 8 from width_mmx in the pass 4/5 case?
264 * (only width_mmx case) (near line 1606)
265 * - rewrite all MMX interlacing code so it's aligned with beginning
266 * of the row buffer, not the end (see 19991007 for details)
267 * x pick one version of mmxsupport() and get rid of the other
268 * - add error messages to any remaining bogus default cases
269 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
270 * x add support for runtime enable/disable/query of various MMX routines
271 */
272
273#define PNG_INTERNAL
274#include "png.h"
275
276#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
277
278int PNGAPI png_mmx_support(void);
279
280#ifdef PNG_USE_LOCAL_ARRAYS
281const static int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
282const static int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
283const static int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
284#endif
285
286#if defined(PNG_MMX_CODE_SUPPORTED)
287/* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
288 * so define them without: */
289#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
290 defined(__OS2__)
291# define _mmx_supported mmx_supported
292# define _const4 const4
293# define _const6 const6
294# define _mask8_0 mask8_0
295# define _mask16_1 mask16_1
296# define _mask16_0 mask16_0
297# define _mask24_2 mask24_2
298# define _mask24_1 mask24_1
299# define _mask24_0 mask24_0
300# define _mask32_3 mask32_3
301# define _mask32_2 mask32_2
302# define _mask32_1 mask32_1
303# define _mask32_0 mask32_0
304# define _mask48_5 mask48_5
305# define _mask48_4 mask48_4
306# define _mask48_3 mask48_3
307# define _mask48_2 mask48_2
308# define _mask48_1 mask48_1
309# define _mask48_0 mask48_0
310# define _LBCarryMask LBCarryMask
311# define _HBClearMask HBClearMask
312# define _ActiveMask ActiveMask
313# define _ActiveMask2 ActiveMask2
314# define _ActiveMaskEnd ActiveMaskEnd
315# define _ShiftBpp ShiftBpp
316# define _ShiftRem ShiftRem
317#ifdef PNG_THREAD_UNSAFE_OK
318# define _unmask unmask
319# define _FullLength FullLength
320# define _MMXLength MMXLength
321# define _dif dif
322# define _patemp patemp
323# define _pbtemp pbtemp
324# define _pctemp pctemp
325#endif
326#endif
327
328
329/* These constants are used in the inlined MMX assembly code.
330 Ignore gcc's "At top level: defined but not used" warnings. */
331
332/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
333 * since that case uses the %ebx register for indexing the Global Offset Table
334 * and there were no other registers available. But gcc 2.95 and later emit
335 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
336 * in the non-PIC case, so we'll just use the global unconditionally now.
337 */
#ifdef PNG_THREAD_UNSAFE_OK
/* Holds ~mask for the current png_combine_row() call.  Kept as a global
 * (rather than an asm input operand) so the inline assembly can address it
 * by name; see the GRR 20000706 note above for why this is unconditional.
 * Being a shared static, it makes the MMX combine path thread-unsafe. */
static int _unmask;
#endif
341
/* Bit-selection masks referenced by name from the inline MMX assembly in
 * png_combine_row().  Each 64-bit constant spreads the 8 bits of the row
 * mask across the bytes of one 8-byte group, so pand/pcmpeqb can turn the
 * per-pixel mask bits into per-byte keep/skip masks for 8-, 16-, 24-,
 * 32-, and 48-bit pixels.
 * NOTE: `static const` replaces the obsolescent `const static` ordering
 * (C99 6.11.5 deprecates storage-class specifiers after other specifiers). */
static const unsigned long long _mask8_0 = 0x0102040810204080LL;

static const unsigned long long _mask16_1 = 0x0101020204040808LL;
static const unsigned long long _mask16_0 = 0x1010202040408080LL;

static const unsigned long long _mask24_2 = 0x0101010202020404LL;
static const unsigned long long _mask24_1 = 0x0408080810101020LL;
static const unsigned long long _mask24_0 = 0x2020404040808080LL;

static const unsigned long long _mask32_3 = 0x0101010102020202LL;
static const unsigned long long _mask32_2 = 0x0404040408080808LL;
static const unsigned long long _mask32_1 = 0x1010101020202020LL;
static const unsigned long long _mask32_0 = 0x4040404080808080LL;

static const unsigned long long _mask48_5 = 0x0101010101010202LL;
static const unsigned long long _mask48_4 = 0x0202020204040404LL;
static const unsigned long long _mask48_3 = 0x0404080808080808LL;
static const unsigned long long _mask48_2 = 0x1010101010102020LL;
static const unsigned long long _mask48_1 = 0x2020202040404040LL;
static const unsigned long long _mask48_0 = 0x4040808080808080LL;

static const unsigned long long _const4 = 0x0000000000FFFFFFLL;
//static const unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
static const unsigned long long _const6 = 0x00000000000000FFLL;
366
367// These are used in the row-filter routines and should/would be local
368// variables if not for gcc addressing limitations.
369// WARNING: Their presence probably defeats the thread safety of libpng.
370
#ifdef PNG_THREAD_UNSAFE_OK
/* Scratch storage shared with the inline-asm row-filter routines.  Per the
 * note above, these would be locals if not for gcc asm-operand/addressing
 * limits; as shared statics they defeat thread safety of the MMX paths. */
static png_uint_32 _FullLength;   // total bytes in the row being filtered
static png_uint_32 _MMXLength;    // bytes handled by the MMX main loop
static int _dif;                  // leftover-byte count (see 19991120 note)
static int _patemp; // temp variables for Paeth routine
static int _pbtemp;
static int _pctemp;
#endif
379
380void /* PRIVATE */
381png_squelch_warnings(void)
382{
383#ifdef PNG_THREAD_UNSAFE_OK
384 _dif = _dif;
385 _patemp = _patemp;
386 _pbtemp = _pbtemp;
387 _pctemp = _pctemp;
388 _MMXLength = _MMXLength;
389#endif
390 _const4 = _const4;
391 _const6 = _const6;
392 _mask8_0 = _mask8_0;
393 _mask16_1 = _mask16_1;
394 _mask16_0 = _mask16_0;
395 _mask24_2 = _mask24_2;
396 _mask24_1 = _mask24_1;
397 _mask24_0 = _mask24_0;
398 _mask32_3 = _mask32_3;
399 _mask32_2 = _mask32_2;
400 _mask32_1 = _mask32_1;
401 _mask32_0 = _mask32_0;
402 _mask48_5 = _mask48_5;
403 _mask48_4 = _mask48_4;
404 _mask48_3 = _mask48_3;
405 _mask48_2 = _mask48_2;
406 _mask48_1 = _mask48_1;
407 _mask48_0 = _mask48_0;
408}
409#endif /* PNG_MMX_CODE_SUPPORTED */
410
411
/* MMX-availability flag: 2 = "not yet determined" sentinel; callers such as
 * png_combine_row() test for 2 and invoke png_mmx_support() to resolve it. */
static int _mmx_supported = 2;
413
414/*===========================================================================*/
415/* */
416/* P N G _ C O M B I N E _ R O W */
417/* */
418/*===========================================================================*/
419
420#if defined(PNG_HAVE_MMX_COMBINE_ROW)
421
422#define BPP2 2
423#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
424#define BPP4 4
425#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
426#define BPP8 8
427
428/* Combines the row recently read in with the previous row.
429 This routine takes care of alpha and transparency if requested.
430 This routine also handles the two methods of progressive display
431 of interlaced images, depending on the mask value.
432 The mask value describes which pixels are to be combined with
433 the row. The pattern always repeats every 8 pixels, so just 8
434 bits are needed. A one indicates the pixel is to be combined; a
435 zero indicates the pixel is to be skipped. This is in addition
436 to any alpha or transparency value associated with the pixel.
437 If you want all pixels to be combined, pass 0xff (255) in mask. */
438
439/* Use this routine for the x86 platform - it uses a faster MMX routine
440 if the machine supports MMX. */
441
442void /* PRIVATE */
443png_combine_row(png_structp png_ptr, png_bytep row, int mask)
444{
445 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
446
447#if defined(PNG_MMX_CODE_SUPPORTED)
448 if (_mmx_supported == 2) {
449#if !defined(PNG_1_0_X)
450 /* this should have happened in png_init_mmx_flags() already */
451 png_warning(png_ptr, "asm_flags may not have been initialized");
452#endif
453 png_mmx_support();
454 }
455#endif
456
457 if (mask == 0xff)
458 {
459 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
460 png_memcpy(row, png_ptr->row_buf + 1,
461 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
462 }
463 else /* (png_combine_row() is never called with mask == 0) */
464 {
465 switch (png_ptr->row_info.pixel_depth)
466 {
467 case 1: /* png_ptr->row_info.pixel_depth */
468 {
469 png_bytep sp;
470 png_bytep dp;
471 int s_inc, s_start, s_end;
472 int m;
473 int shift;
474 png_uint_32 i;
475
476 sp = png_ptr->row_buf + 1;
477 dp = row;
478 m = 0x80;
479#if defined(PNG_READ_PACKSWAP_SUPPORTED)
480 if (png_ptr->transformations & PNG_PACKSWAP)
481 {
482 s_start = 0;
483 s_end = 7;
484 s_inc = 1;
485 }
486 else
487#endif
488 {
489 s_start = 7;
490 s_end = 0;
491 s_inc = -1;
492 }
493
494 shift = s_start;
495
496 for (i = 0; i < png_ptr->width; i++)
497 {
498 if (m & mask)
499 {
500 int value;
501
502 value = (*sp >> shift) & 0x1;
503 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
504 *dp |= (png_byte)(value << shift);
505 }
506
507 if (shift == s_end)
508 {
509 shift = s_start;
510 sp++;
511 dp++;
512 }
513 else
514 shift += s_inc;
515
516 if (m == 1)
517 m = 0x80;
518 else
519 m >>= 1;
520 }
521 break;
522 }
523
524 case 2: /* png_ptr->row_info.pixel_depth */
525 {
526 png_bytep sp;
527 png_bytep dp;
528 int s_start, s_end, s_inc;
529 int m;
530 int shift;
531 png_uint_32 i;
532 int value;
533
534 sp = png_ptr->row_buf + 1;
535 dp = row;
536 m = 0x80;
537#if defined(PNG_READ_PACKSWAP_SUPPORTED)
538 if (png_ptr->transformations & PNG_PACKSWAP)
539 {
540 s_start = 0;
541 s_end = 6;
542 s_inc = 2;
543 }
544 else
545#endif
546 {
547 s_start = 6;
548 s_end = 0;
549 s_inc = -2;
550 }
551
552 shift = s_start;
553
554 for (i = 0; i < png_ptr->width; i++)
555 {
556 if (m & mask)
557 {
558 value = (*sp >> shift) & 0x3;
559 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
560 *dp |= (png_byte)(value << shift);
561 }
562
563 if (shift == s_end)
564 {
565 shift = s_start;
566 sp++;
567 dp++;
568 }
569 else
570 shift += s_inc;
571 if (m == 1)
572 m = 0x80;
573 else
574 m >>= 1;
575 }
576 break;
577 }
578
579 case 4: /* png_ptr->row_info.pixel_depth */
580 {
581 png_bytep sp;
582 png_bytep dp;
583 int s_start, s_end, s_inc;
584 int m;
585 int shift;
586 png_uint_32 i;
587 int value;
588
589 sp = png_ptr->row_buf + 1;
590 dp = row;
591 m = 0x80;
592#if defined(PNG_READ_PACKSWAP_SUPPORTED)
593 if (png_ptr->transformations & PNG_PACKSWAP)
594 {
595 s_start = 0;
596 s_end = 4;
597 s_inc = 4;
598 }
599 else
600#endif
601 {
602 s_start = 4;
603 s_end = 0;
604 s_inc = -4;
605 }
606 shift = s_start;
607
608 for (i = 0; i < png_ptr->width; i++)
609 {
610 if (m & mask)
611 {
612 value = (*sp >> shift) & 0xf;
613 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
614 *dp |= (png_byte)(value << shift);
615 }
616
617 if (shift == s_end)
618 {
619 shift = s_start;
620 sp++;
621 dp++;
622 }
623 else
624 shift += s_inc;
625 if (m == 1)
626 m = 0x80;
627 else
628 m >>= 1;
629 }
630 break;
631 }
632
633 case 8: /* png_ptr->row_info.pixel_depth */
634 {
635 png_bytep srcptr;
636 png_bytep dstptr;
637
638#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
639#if !defined(PNG_1_0_X)
640 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
641 /* && _mmx_supported */ )
642#else
643 if (_mmx_supported)
644#endif
645 {
646 png_uint_32 len;
647 int diff;
648 int dummy_value_a; // fix 'forbidden register spilled' error
649 int dummy_value_d;
650 int dummy_value_c;
651 int dummy_value_S;
652 int dummy_value_D;
653 _unmask = ~mask; // global variable for -fPIC version
654 srcptr = png_ptr->row_buf + 1;
655 dstptr = row;
656 len = png_ptr->width &~7; // reduce to multiple of 8
657 diff = (int) (png_ptr->width & 7); // amount lost
658
659 __asm__ __volatile__ (
660 "movd _unmask, %%mm7 \n\t" // load bit pattern
661 "psubb %%mm6, %%mm6 \n\t" // zero mm6
662 "punpcklbw %%mm7, %%mm7 \n\t"
663 "punpcklwd %%mm7, %%mm7 \n\t"
664 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
665
666 "movq _mask8_0, %%mm0 \n\t"
667 "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
668 "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
669
670// preload "movl len, %%ecx \n\t" // load length of line
671// preload "movl srcptr, %%esi \n\t" // load source
672// preload "movl dstptr, %%edi \n\t" // load dest
673
674 "cmpl $0, %%ecx \n\t" // len == 0 ?
675 "je mainloop8end \n\t"
676
677 "mainloop8: \n\t"
678 "movq (%%esi), %%mm4 \n\t" // *srcptr
679 "pand %%mm0, %%mm4 \n\t"
680 "movq %%mm0, %%mm6 \n\t"
681 "pandn (%%edi), %%mm6 \n\t" // *dstptr
682 "por %%mm6, %%mm4 \n\t"
683 "movq %%mm4, (%%edi) \n\t"
684 "addl $8, %%esi \n\t" // inc by 8 bytes processed
685 "addl $8, %%edi \n\t"
686 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
687 "ja mainloop8 \n\t"
688
689 "mainloop8end: \n\t"
690// preload "movl diff, %%ecx \n\t" // (diff is in eax)
691 "movl %%eax, %%ecx \n\t"
692 "cmpl $0, %%ecx \n\t"
693 "jz end8 \n\t"
694// preload "movl mask, %%edx \n\t"
695 "sall $24, %%edx \n\t" // make low byte, high byte
696
697 "secondloop8: \n\t"
698 "sall %%edx \n\t" // move high bit to CF
699 "jnc skip8 \n\t" // if CF = 0
700 "movb (%%esi), %%al \n\t"
701 "movb %%al, (%%edi) \n\t"
702
703 "skip8: \n\t"
704 "incl %%esi \n\t"
705 "incl %%edi \n\t"
706 "decl %%ecx \n\t"
707 "jnz secondloop8 \n\t"
708
709 "end8: \n\t"
710 "EMMS \n\t" // DONE
711
712 : "=a" (dummy_value_a), // output regs (dummy)
713 "=d" (dummy_value_d),
714 "=c" (dummy_value_c),
715 "=S" (dummy_value_S),
716 "=D" (dummy_value_D)
717
718 : "3" (srcptr), // esi // input regs
719 "4" (dstptr), // edi
720 "0" (diff), // eax
721// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
722 "2" (len), // ecx
723 "1" (mask) // edx
724
725#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
726 : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
727#endif
728 );
729 }
730 else /* mmx _not supported - Use modified C routine */
731#endif /* PNG_MMX_CODE_SUPPORTED */
732 {
733 register png_uint_32 i;
734 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
735 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
736 register int stride = png_pass_inc[png_ptr->pass];
737 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
738 register int rep_bytes = png_pass_width[png_ptr->pass];
739 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
740 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
741 int diff = (int) (png_ptr->width & 7); /* amount lost */
742 register png_uint_32 final_val = len; /* GRR bugfix */
743
744 srcptr = png_ptr->row_buf + 1 + initial_val;
745 dstptr = row + initial_val;
746
747 for (i = initial_val; i < final_val; i += stride)
748 {
749 png_memcpy(dstptr, srcptr, rep_bytes);
750 srcptr += stride;
751 dstptr += stride;
752 }
753 if (diff) /* number of leftover pixels: 3 for pngtest */
754 {
755 final_val+=diff /* *BPP1 */ ;
756 for (; i < final_val; i += stride)
757 {
758 if (rep_bytes > (int)(final_val-i))
759 rep_bytes = (int)(final_val-i);
760 png_memcpy(dstptr, srcptr, rep_bytes);
761 srcptr += stride;
762 dstptr += stride;
763 }
764 }
765
766 } /* end of else (_mmx_supported) */
767
768 break;
769 } /* end 8 bpp */
770
771 case 16: /* png_ptr->row_info.pixel_depth */
772 {
773 png_bytep srcptr;
774 png_bytep dstptr;
775
776#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
777#if !defined(PNG_1_0_X)
778 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
779 /* && _mmx_supported */ )
780#else
781 if (_mmx_supported)
782#endif
783 {
784 png_uint_32 len;
785 int diff;
786 int dummy_value_a; // fix 'forbidden register spilled' error
787 int dummy_value_d;
788 int dummy_value_c;
789 int dummy_value_S;
790 int dummy_value_D;
791 _unmask = ~mask; // global variable for -fPIC version
792 srcptr = png_ptr->row_buf + 1;
793 dstptr = row;
794 len = png_ptr->width &~7; // reduce to multiple of 8
795 diff = (int) (png_ptr->width & 7); // amount lost //
796
797 __asm__ __volatile__ (
798 "movd _unmask, %%mm7 \n\t" // load bit pattern
799 "psubb %%mm6, %%mm6 \n\t" // zero mm6
800 "punpcklbw %%mm7, %%mm7 \n\t"
801 "punpcklwd %%mm7, %%mm7 \n\t"
802 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
803
804 "movq _mask16_0, %%mm0 \n\t"
805 "movq _mask16_1, %%mm1 \n\t"
806
807 "pand %%mm7, %%mm0 \n\t"
808 "pand %%mm7, %%mm1 \n\t"
809
810 "pcmpeqb %%mm6, %%mm0 \n\t"
811 "pcmpeqb %%mm6, %%mm1 \n\t"
812
813// preload "movl len, %%ecx \n\t" // load length of line
814// preload "movl srcptr, %%esi \n\t" // load source
815// preload "movl dstptr, %%edi \n\t" // load dest
816
817 "cmpl $0, %%ecx \n\t"
818 "jz mainloop16end \n\t"
819
820 "mainloop16: \n\t"
821 "movq (%%esi), %%mm4 \n\t"
822 "pand %%mm0, %%mm4 \n\t"
823 "movq %%mm0, %%mm6 \n\t"
824 "movq (%%edi), %%mm7 \n\t"
825 "pandn %%mm7, %%mm6 \n\t"
826 "por %%mm6, %%mm4 \n\t"
827 "movq %%mm4, (%%edi) \n\t"
828
829 "movq 8(%%esi), %%mm5 \n\t"
830 "pand %%mm1, %%mm5 \n\t"
831 "movq %%mm1, %%mm7 \n\t"
832 "movq 8(%%edi), %%mm6 \n\t"
833 "pandn %%mm6, %%mm7 \n\t"
834 "por %%mm7, %%mm5 \n\t"
835 "movq %%mm5, 8(%%edi) \n\t"
836
837 "addl $16, %%esi \n\t" // inc by 16 bytes processed
838 "addl $16, %%edi \n\t"
839 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
840 "ja mainloop16 \n\t"
841
842 "mainloop16end: \n\t"
843// preload "movl diff, %%ecx \n\t" // (diff is in eax)
844 "movl %%eax, %%ecx \n\t"
845 "cmpl $0, %%ecx \n\t"
846 "jz end16 \n\t"
847// preload "movl mask, %%edx \n\t"
848 "sall $24, %%edx \n\t" // make low byte, high byte
849
850 "secondloop16: \n\t"
851 "sall %%edx \n\t" // move high bit to CF
852 "jnc skip16 \n\t" // if CF = 0
853 "movw (%%esi), %%ax \n\t"
854 "movw %%ax, (%%edi) \n\t"
855
856 "skip16: \n\t"
857 "addl $2, %%esi \n\t"
858 "addl $2, %%edi \n\t"
859 "decl %%ecx \n\t"
860 "jnz secondloop16 \n\t"
861
862 "end16: \n\t"
863 "EMMS \n\t" // DONE
864
865 : "=a" (dummy_value_a), // output regs (dummy)
866 "=c" (dummy_value_c),
867 "=d" (dummy_value_d),
868 "=S" (dummy_value_S),
869 "=D" (dummy_value_D)
870
871 : "0" (diff), // eax // input regs
872// was (unmask) " " RESERVED // ebx // Global Offset Table idx
873 "1" (len), // ecx
874 "2" (mask), // edx
875 "3" (srcptr), // esi
876 "4" (dstptr) // edi
877
878#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
879 : "%mm0", "%mm1", "%mm4" // clobber list
880 , "%mm5", "%mm6", "%mm7"
881#endif
882 );
883 }
884 else /* mmx _not supported - Use modified C routine */
885#endif /* PNG_MMX_CODE_SUPPORTED */
886 {
887 register png_uint_32 i;
888 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
889 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
890 register int stride = BPP2 * png_pass_inc[png_ptr->pass];
891 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
892 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
893 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
894 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
895 int diff = (int) (png_ptr->width & 7); /* amount lost */
896 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
897
898 srcptr = png_ptr->row_buf + 1 + initial_val;
899 dstptr = row + initial_val;
900
901 for (i = initial_val; i < final_val; i += stride)
902 {
903 png_memcpy(dstptr, srcptr, rep_bytes);
904 srcptr += stride;
905 dstptr += stride;
906 }
907 if (diff) /* number of leftover pixels: 3 for pngtest */
908 {
909 final_val+=diff*BPP2;
910 for (; i < final_val; i += stride)
911 {
912 if (rep_bytes > (int)(final_val-i))
913 rep_bytes = (int)(final_val-i);
914 png_memcpy(dstptr, srcptr, rep_bytes);
915 srcptr += stride;
916 dstptr += stride;
917 }
918 }
919 } /* end of else (_mmx_supported) */
920
921 break;
922 } /* end 16 bpp */
923
      /* 24 bpp (e.g. 8-bit RGB): merge the pixels selected by the current
       * interlace-pass mask from png_ptr->row_buf into the destination row.
       * The MMX path processes 8 pixels (24 bytes) per iteration using three
       * precomputed 8-byte masks (_mask24_0.._mask24_2); the C fallback walks
       * the row with the png_pass_* tables.
       */
      case 24:       /* png_ptr->row_info.pixel_depth */
      {
         png_bytep srcptr;
         png_bytep dstptr;

#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         /* runtime flag set up by png_init_mmx_flags(); implies _mmx_supported */
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
             /* && _mmx_supported */ )
#else
         if (_mmx_supported)
#endif
         {
            png_uint_32 len;
            int diff;
            /* dummies bind eax/edx/ecx/esi/edi as asm outputs so gcc does not
             * try to spill them ("forbidden register spilled" workaround) */
            int dummy_value_a;   // fix 'forbidden register spilled' error
            int dummy_value_d;
            int dummy_value_c;
            int dummy_value_S;
            int dummy_value_D;
            _unmask = ~mask;            // global variable for -fPIC version
            srcptr = png_ptr->row_buf + 1;
            dstptr = row;
            len  = png_ptr->width &~7;  // reduce to multiple of 8
            diff = (int) (png_ptr->width & 7); // amount lost //

            __asm__ __volatile__ (
               /* broadcast the 8-bit pass mask into all 8 bytes of mm7 */
               "movd      _unmask, %%mm7       \n\t" // load bit pattern
               "psubb     %%mm6, %%mm6         \n\t" // zero mm6
               "punpcklbw %%mm7, %%mm7         \n\t"
               "punpcklwd %%mm7, %%mm7         \n\t"
               "punpckldq %%mm7, %%mm7         \n\t" // fill reg with 8 masks

               /* derive three byte-granular select masks, one per 8-byte
                * third of the 24-byte / 8-pixel group */
               "movq      _mask24_0, %%mm0     \n\t"
               "movq      _mask24_1, %%mm1     \n\t"
               "movq      _mask24_2, %%mm2     \n\t"

               "pand      %%mm7, %%mm0         \n\t"
               "pand      %%mm7, %%mm1         \n\t"
               "pand      %%mm7, %%mm2         \n\t"

               /* after pcmpeqb: 0x00 where the source pixel is taken,
                * 0xff where the destination byte is preserved */
               "pcmpeqb   %%mm6, %%mm0         \n\t"
               "pcmpeqb   %%mm6, %%mm1         \n\t"
               "pcmpeqb   %%mm6, %%mm2         \n\t"

// preload     "movl      len, %%ecx           \n\t" // load length of line
// preload     "movl      srcptr, %%esi        \n\t" // load source
// preload     "movl      dstptr, %%edi        \n\t" // load dest

               "cmpl      $0, %%ecx            \n\t"
               "jz        mainloop24end        \n\t"

            /* 8 pixels (24 bytes) per iteration: src AND mask, dst AND NOT
             * mask, OR together, store */
            "mainloop24:                        \n\t"
               "movq      (%%esi), %%mm4       \n\t"
               "pand      %%mm0, %%mm4         \n\t"
               "movq      %%mm0, %%mm6         \n\t"
               "movq      (%%edi), %%mm7       \n\t"
               "pandn     %%mm7, %%mm6         \n\t"
               "por       %%mm6, %%mm4         \n\t"
               "movq      %%mm4, (%%edi)       \n\t"

               "movq      8(%%esi), %%mm5      \n\t"
               "pand      %%mm1, %%mm5         \n\t"
               "movq      %%mm1, %%mm7         \n\t"
               "movq      8(%%edi), %%mm6      \n\t"
               "pandn     %%mm6, %%mm7         \n\t"
               "por       %%mm7, %%mm5         \n\t"
               "movq      %%mm5, 8(%%edi)      \n\t"

               "movq      16(%%esi), %%mm6     \n\t"
               "pand      %%mm2, %%mm6         \n\t"
               "movq      %%mm2, %%mm4         \n\t"
               "movq      16(%%edi), %%mm7     \n\t"
               "pandn     %%mm7, %%mm4         \n\t"
               "por       %%mm4, %%mm6         \n\t"
               "movq      %%mm6, 16(%%edi)     \n\t"

               "addl      $24, %%esi           \n\t" // inc by 24 bytes processed
               "addl      $24, %%edi           \n\t"
               "subl      $8, %%ecx            \n\t" // dec by 8 pixels processed

               "ja        mainloop24           \n\t"

            /* leftover (width % 8) pixels: walk the mask bit-by-bit through
             * CF and copy 3 bytes per selected pixel */
            "mainloop24end:                     \n\t"
// preload     "movl      diff, %%ecx          \n\t" // (diff is in eax)
               "movl      %%eax, %%ecx         \n\t"
               "cmpl      $0, %%ecx            \n\t"
               "jz        end24                \n\t"
// preload     "movl      mask, %%edx          \n\t"
               "sall      $24, %%edx           \n\t" // make low byte, high byte

            "secondloop24:                      \n\t"
               "sall      %%edx                \n\t" // move high bit to CF
               "jnc       skip24               \n\t" // if CF = 0
               "movw      (%%esi), %%ax        \n\t"
               "movw      %%ax, (%%edi)        \n\t"
               "xorl      %%eax, %%eax         \n\t"
               "movb      2(%%esi), %%al       \n\t"
               "movb      %%al, 2(%%edi)       \n\t"

            "skip24:                            \n\t"
               "addl      $3, %%esi            \n\t"
               "addl      $3, %%edi            \n\t"
               "decl      %%ecx                \n\t"
               "jnz       secondloop24         \n\t"

            "end24:                             \n\t"
               "EMMS                           \n\t" // DONE

               : "=a" (dummy_value_a),           // output regs (dummy)
                 "=d" (dummy_value_d),
                 "=c" (dummy_value_c),
                 "=S" (dummy_value_S),
                 "=D" (dummy_value_D)

               : "3" (srcptr),      // esi       // input regs
                 "4" (dstptr),      // edi
                 "0" (diff),        // eax
// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
                 "2" (len),         // ecx
                 "1" (mask)         // edx

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
               : "%mm0", "%mm1", "%mm2"          // clobber list
               , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
            );
         }
         else /* mmx _not supported - Use modified C routine */
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            register png_uint_32 i;
            /* byte offset of the first pixel belonging to this pass */
            png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
            /* byte distance between consecutive pixels of this pass */
            register int stride = BPP3 * png_pass_inc[png_ptr->pass];
              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
            /* bytes actually copied at each step */
            register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
            int diff = (int) (png_ptr->width & 7); /* amount lost */
            register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */

            srcptr = png_ptr->row_buf + 1 + initial_val;
            dstptr = row + initial_val;

            for (i = initial_val; i < final_val; i += stride)
            {
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
            if (diff)  /* number of leftover pixels:  3 for pngtest */
            {
               final_val+=diff*BPP3;
               for (; i < final_val; i += stride)
               {
                  /* clamp the last copy so it cannot run past row end */
                  if (rep_bytes > (int)(final_val-i))
                     rep_bytes = (int)(final_val-i);
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
            }
         } /* end of else (_mmx_supported) */

         break;
      }       /* end 24 bpp */
1091
      /* 32 bpp (e.g. 8-bit RGBA): merge masked pixels of the current
       * interlace pass from png_ptr->row_buf into the destination row.
       * MMX path: 8 pixels (32 bytes) per iteration with four precomputed
       * masks (_mask32_0.._mask32_3); C fallback uses the png_pass_* tables.
       */
      case 32:       /* png_ptr->row_info.pixel_depth */
      {
         png_bytep srcptr;
         png_bytep dstptr;

#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         /* runtime flag set up by png_init_mmx_flags(); implies _mmx_supported */
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
             /* && _mmx_supported */ )
#else
         if (_mmx_supported)
#endif
         {
            png_uint_32 len;
            int diff;
            /* dummies bind eax/edx/ecx/esi/edi as asm outputs so gcc does not
             * try to spill them ("forbidden register spilled" workaround) */
            int dummy_value_a;   // fix 'forbidden register spilled' error
            int dummy_value_d;
            int dummy_value_c;
            int dummy_value_S;
            int dummy_value_D;
            _unmask = ~mask;            // global variable for -fPIC version
            srcptr = png_ptr->row_buf + 1;
            dstptr = row;
            len  = png_ptr->width &~7;  // reduce to multiple of 8
            diff = (int) (png_ptr->width & 7); // amount lost //

            __asm__ __volatile__ (
               /* broadcast the 8-bit pass mask into all 8 bytes of mm7 */
               "movd      _unmask, %%mm7       \n\t" // load bit pattern
               "psubb     %%mm6, %%mm6         \n\t" // zero mm6
               "punpcklbw %%mm7, %%mm7         \n\t"
               "punpcklwd %%mm7, %%mm7         \n\t"
               "punpckldq %%mm7, %%mm7         \n\t" // fill reg with 8 masks

               /* four byte-granular select masks, one per 8-byte quarter of
                * the 32-byte / 8-pixel group */
               "movq      _mask32_0, %%mm0     \n\t"
               "movq      _mask32_1, %%mm1     \n\t"
               "movq      _mask32_2, %%mm2     \n\t"
               "movq      _mask32_3, %%mm3     \n\t"

               "pand      %%mm7, %%mm0         \n\t"
               "pand      %%mm7, %%mm1         \n\t"
               "pand      %%mm7, %%mm2         \n\t"
               "pand      %%mm7, %%mm3         \n\t"

               /* 0x00 = take source byte, 0xff = keep destination byte */
               "pcmpeqb   %%mm6, %%mm0         \n\t"
               "pcmpeqb   %%mm6, %%mm1         \n\t"
               "pcmpeqb   %%mm6, %%mm2         \n\t"
               "pcmpeqb   %%mm6, %%mm3         \n\t"

// preload     "movl      len, %%ecx           \n\t" // load length of line
// preload     "movl      srcptr, %%esi        \n\t" // load source
// preload     "movl      dstptr, %%edi        \n\t" // load dest

               "cmpl      $0, %%ecx            \n\t" // lcr
               "jz        mainloop32end        \n\t"

            /* 8 pixels (32 bytes) per iteration: (src AND mask) OR
             * (dst AND NOT mask) for each 8-byte quarter */
            "mainloop32:                        \n\t"
               "movq      (%%esi), %%mm4       \n\t"
               "pand      %%mm0, %%mm4         \n\t"
               "movq      %%mm0, %%mm6         \n\t"
               "movq      (%%edi), %%mm7       \n\t"
               "pandn     %%mm7, %%mm6         \n\t"
               "por       %%mm6, %%mm4         \n\t"
               "movq      %%mm4, (%%edi)       \n\t"

               "movq      8(%%esi), %%mm5      \n\t"
               "pand      %%mm1, %%mm5         \n\t"
               "movq      %%mm1, %%mm7         \n\t"
               "movq      8(%%edi), %%mm6      \n\t"
               "pandn     %%mm6, %%mm7         \n\t"
               "por       %%mm7, %%mm5         \n\t"
               "movq      %%mm5, 8(%%edi)      \n\t"

               "movq      16(%%esi), %%mm6     \n\t"
               "pand      %%mm2, %%mm6         \n\t"
               "movq      %%mm2, %%mm4         \n\t"
               "movq      16(%%edi), %%mm7     \n\t"
               "pandn     %%mm7, %%mm4         \n\t"
               "por       %%mm4, %%mm6         \n\t"
               "movq      %%mm6, 16(%%edi)     \n\t"

               "movq      24(%%esi), %%mm7     \n\t"
               "pand      %%mm3, %%mm7         \n\t"
               "movq      %%mm3, %%mm5         \n\t"
               "movq      24(%%edi), %%mm4     \n\t"
               "pandn     %%mm4, %%mm5         \n\t"
               "por       %%mm5, %%mm7         \n\t"
               "movq      %%mm7, 24(%%edi)     \n\t"

               "addl      $32, %%esi           \n\t" // inc by 32 bytes processed
               "addl      $32, %%edi           \n\t"
               "subl      $8, %%ecx            \n\t" // dec by 8 pixels processed
               "ja        mainloop32           \n\t"

            /* leftover (width % 8) pixels: shift mask bit into CF, copy one
             * 4-byte pixel per set bit */
            "mainloop32end:                     \n\t"
// preload     "movl      diff, %%ecx          \n\t" // (diff is in eax)
               "movl      %%eax, %%ecx         \n\t"
               "cmpl      $0, %%ecx            \n\t"
               "jz        end32                \n\t"
// preload     "movl      mask, %%edx          \n\t"
               "sall      $24, %%edx           \n\t" // low byte => high byte

            "secondloop32:                      \n\t"
               "sall      %%edx                \n\t" // move high bit to CF
               "jnc       skip32               \n\t" // if CF = 0
               "movl      (%%esi), %%eax       \n\t"
               "movl      %%eax, (%%edi)       \n\t"

            "skip32:                            \n\t"
               "addl      $4, %%esi            \n\t"
               "addl      $4, %%edi            \n\t"
               "decl      %%ecx                \n\t"
               "jnz       secondloop32         \n\t"

            "end32:                             \n\t"
               "EMMS                           \n\t" // DONE

               : "=a" (dummy_value_a),           // output regs (dummy)
                 "=d" (dummy_value_d),
                 "=c" (dummy_value_c),
                 "=S" (dummy_value_S),
                 "=D" (dummy_value_D)

               : "3" (srcptr),      // esi       // input regs
                 "4" (dstptr),      // edi
                 "0" (diff),        // eax
// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
                 "2" (len),         // ecx
                 "1" (mask)         // edx

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
               : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
               , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
            );
         }
         else /* mmx _not supported - Use modified C routine */
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            register png_uint_32 i;
            /* byte offset of the first pixel belonging to this pass */
            png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
            /* byte distance between consecutive pixels of this pass */
            register int stride = BPP4 * png_pass_inc[png_ptr->pass];
              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
            /* bytes actually copied at each step */
            register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
            int diff = (int) (png_ptr->width & 7); /* amount lost */
            register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */

            srcptr = png_ptr->row_buf + 1 + initial_val;
            dstptr = row + initial_val;

            for (i = initial_val; i < final_val; i += stride)
            {
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
            if (diff)  /* number of leftover pixels:  3 for pngtest */
            {
               final_val+=diff*BPP4;
               for (; i < final_val; i += stride)
               {
                  /* clamp the last copy so it cannot run past row end */
                  if (rep_bytes > (int)(final_val-i))
                     rep_bytes = (int)(final_val-i);
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
            }
         } /* end of else (_mmx_supported) */

         break;
      }       /* end 32 bpp */
1266
      /* 48 bpp (16-bit RGB): merge masked pixels of the current interlace
       * pass from png_ptr->row_buf into the destination row.  MMX path:
       * 8 pixels (48 bytes) per iteration with six precomputed masks
       * (_mask48_0.._mask48_5); C fallback uses the png_pass_* tables.
       */
      case 48:       /* png_ptr->row_info.pixel_depth */
      {
         png_bytep srcptr;
         png_bytep dstptr;

#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         /* runtime flag set up by png_init_mmx_flags(); implies _mmx_supported */
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
             /* && _mmx_supported */ )
#else
         if (_mmx_supported)
#endif
         {
            png_uint_32 len;
            int diff;
            /* dummies bind eax/edx/ecx/esi/edi as asm outputs so gcc does not
             * try to spill them ("forbidden register spilled" workaround) */
            int dummy_value_a;   // fix 'forbidden register spilled' error
            int dummy_value_d;
            int dummy_value_c;
            int dummy_value_S;
            int dummy_value_D;
            _unmask = ~mask;            // global variable for -fPIC version
            srcptr = png_ptr->row_buf + 1;
            dstptr = row;
            len  = png_ptr->width &~7;  // reduce to multiple of 8
            diff = (int) (png_ptr->width & 7); // amount lost //

            __asm__ __volatile__ (
               /* broadcast the 8-bit pass mask into all 8 bytes of mm7 */
               "movd      _unmask, %%mm7       \n\t" // load bit pattern
               "psubb     %%mm6, %%mm6         \n\t" // zero mm6
               "punpcklbw %%mm7, %%mm7         \n\t"
               "punpcklwd %%mm7, %%mm7         \n\t"
               "punpckldq %%mm7, %%mm7         \n\t" // fill reg with 8 masks

               /* six byte-granular select masks, one per 8-byte sixth of the
                * 48-byte / 8-pixel group */
               "movq      _mask48_0, %%mm0     \n\t"
               "movq      _mask48_1, %%mm1     \n\t"
               "movq      _mask48_2, %%mm2     \n\t"
               "movq      _mask48_3, %%mm3     \n\t"
               "movq      _mask48_4, %%mm4     \n\t"
               "movq      _mask48_5, %%mm5     \n\t"

               "pand      %%mm7, %%mm0         \n\t"
               "pand      %%mm7, %%mm1         \n\t"
               "pand      %%mm7, %%mm2         \n\t"
               "pand      %%mm7, %%mm3         \n\t"
               "pand      %%mm7, %%mm4         \n\t"
               "pand      %%mm7, %%mm5         \n\t"

               /* 0x00 = take source byte, 0xff = keep destination byte */
               "pcmpeqb   %%mm6, %%mm0         \n\t"
               "pcmpeqb   %%mm6, %%mm1         \n\t"
               "pcmpeqb   %%mm6, %%mm2         \n\t"
               "pcmpeqb   %%mm6, %%mm3         \n\t"
               "pcmpeqb   %%mm6, %%mm4         \n\t"
               "pcmpeqb   %%mm6, %%mm5         \n\t"

// preload     "movl      len, %%ecx           \n\t" // load length of line
// preload     "movl      srcptr, %%esi        \n\t" // load source
// preload     "movl      dstptr, %%edi        \n\t" // load dest

               "cmpl      $0, %%ecx            \n\t"
               "jz        mainloop48end        \n\t"

            /* 8 pixels (48 bytes) per iteration; note pandn here takes its
             * second operand directly from memory */
            "mainloop48:                        \n\t"
               "movq      (%%esi), %%mm7       \n\t"
               "pand      %%mm0, %%mm7         \n\t"
               "movq      %%mm0, %%mm6         \n\t"
               "pandn     (%%edi), %%mm6       \n\t"
               "por       %%mm6, %%mm7         \n\t"
               "movq      %%mm7, (%%edi)       \n\t"

               "movq      8(%%esi), %%mm6      \n\t"
               "pand      %%mm1, %%mm6         \n\t"
               "movq      %%mm1, %%mm7         \n\t"
               "pandn     8(%%edi), %%mm7      \n\t"
               "por       %%mm7, %%mm6         \n\t"
               "movq      %%mm6, 8(%%edi)      \n\t"

               "movq      16(%%esi), %%mm6     \n\t"
               "pand      %%mm2, %%mm6         \n\t"
               "movq      %%mm2, %%mm7         \n\t"
               "pandn     16(%%edi), %%mm7     \n\t"
               "por       %%mm7, %%mm6         \n\t"
               "movq      %%mm6, 16(%%edi)     \n\t"

               "movq      24(%%esi), %%mm7     \n\t"
               "pand      %%mm3, %%mm7         \n\t"
               "movq      %%mm3, %%mm6         \n\t"
               "pandn     24(%%edi), %%mm6     \n\t"
               "por       %%mm6, %%mm7         \n\t"
               "movq      %%mm7, 24(%%edi)     \n\t"

               "movq      32(%%esi), %%mm6     \n\t"
               "pand      %%mm4, %%mm6         \n\t"
               "movq      %%mm4, %%mm7         \n\t"
               "pandn     32(%%edi), %%mm7     \n\t"
               "por       %%mm7, %%mm6         \n\t"
               "movq      %%mm6, 32(%%edi)     \n\t"

               "movq      40(%%esi), %%mm7     \n\t"
               "pand      %%mm5, %%mm7         \n\t"
               "movq      %%mm5, %%mm6         \n\t"
               "pandn     40(%%edi), %%mm6     \n\t"
               "por       %%mm6, %%mm7         \n\t"
               "movq      %%mm7, 40(%%edi)     \n\t"

               "addl      $48, %%esi           \n\t" // inc by 48 bytes processed
               "addl      $48, %%edi           \n\t"
               "subl      $8, %%ecx            \n\t" // dec by 8 pixels processed

               "ja        mainloop48           \n\t"

            /* NOTE(review): this leftover-pixel loop copies 4 bytes and
             * advances esi/edi by 4 per mask bit, but a 48-bpp pixel is
             * 6 bytes -- looks wrong for widths not divisible by 8; compare
             * the 3-byte handling in secondloop24.  Verify against upstream
             * pnggccrd.c before changing. */
            "mainloop48end:                     \n\t"
// preload     "movl      diff, %%ecx          \n\t" // (diff is in eax)
               "movl      %%eax, %%ecx         \n\t"
               "cmpl      $0, %%ecx            \n\t"
               "jz        end48                \n\t"
// preload     "movl      mask, %%edx          \n\t"
               "sall      $24, %%edx           \n\t" // make low byte, high byte

            "secondloop48:                      \n\t"
               "sall      %%edx                \n\t" // move high bit to CF
               "jnc       skip48               \n\t" // if CF = 0
               "movl      (%%esi), %%eax       \n\t"
               "movl      %%eax, (%%edi)       \n\t"

            "skip48:                            \n\t"
               "addl      $4, %%esi            \n\t"
               "addl      $4, %%edi            \n\t"
               "decl      %%ecx                \n\t"
               "jnz       secondloop48         \n\t"

            "end48:                             \n\t"
               "EMMS                           \n\t" // DONE

               : "=a" (dummy_value_a),           // output regs (dummy)
                 "=d" (dummy_value_d),
                 "=c" (dummy_value_c),
                 "=S" (dummy_value_S),
                 "=D" (dummy_value_D)

               : "3" (srcptr),      // esi       // input regs
                 "4" (dstptr),      // edi
                 "0" (diff),        // eax
// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
                 "2" (len),         // ecx
                 "1" (mask)         // edx

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
               : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
               , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
            );
         }
         else /* mmx _not supported - Use modified C routine */
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            register png_uint_32 i;
            /* byte offset of the first pixel belonging to this pass */
            png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
            /* byte distance between consecutive pixels of this pass */
            register int stride = BPP6 * png_pass_inc[png_ptr->pass];
              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
            /* bytes actually copied at each step */
            register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
            int diff = (int) (png_ptr->width & 7); /* amount lost */
            register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */

            srcptr = png_ptr->row_buf + 1 + initial_val;
            dstptr = row + initial_val;

            for (i = initial_val; i < final_val; i += stride)
            {
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
            if (diff)  /* number of leftover pixels:  3 for pngtest */
            {
               final_val+=diff*BPP6;
               for (; i < final_val; i += stride)
               {
                  /* clamp the last copy so it cannot run past row end */
                  if (rep_bytes > (int)(final_val-i))
                     rep_bytes = (int)(final_val-i);
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
            }
         } /* end of else (_mmx_supported) */

         break;
      }       /* end 48 bpp */
1458
      /* 64 bpp (16-bit RGBA): C-only path (no MMX variant was written for
       * this depth).  Copies the pixels belonging to the current interlace
       * pass from png_ptr->row_buf into the destination row, using the
       * Adam7 tables from png.c. */
      case 64:       /* png_ptr->row_info.pixel_depth */
      {
         png_bytep srcptr;
         png_bytep dstptr;
         register png_uint_32 i;
         /* byte offset of the first pixel belonging to this pass */
         png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
           /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
         /* byte distance between consecutive pixels of this pass */
         register int stride = BPP8 * png_pass_inc[png_ptr->pass];
           /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
         /* bytes actually copied at each step */
         register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
           /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
         png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
         int diff = (int) (png_ptr->width & 7); /* amount lost */
         register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */

         srcptr = png_ptr->row_buf + 1 + initial_val;
         dstptr = row + initial_val;

         for (i = initial_val; i < final_val; i += stride)
         {
            png_memcpy(dstptr, srcptr, rep_bytes);
            srcptr += stride;
            dstptr += stride;
         }
         if (diff)  /* number of leftover pixels:  3 for pngtest */
         {
            final_val+=diff*BPP8;
            for (; i < final_val; i += stride)
            {
               /* clamp the last copy so it cannot run past row end */
               if (rep_bytes > (int)(final_val-i))
                  rep_bytes = (int)(final_val-i);
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
         }

         break;
      }       /* end 64 bpp */
1498
      /* unreachable for well-formed PNG data: pixel_depth is validated
       * upstream, so warn rather than crash */
      default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
      {
         /* this should never happen */
         png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
         break;
      }
1505 } /* end switch (png_ptr->row_info.pixel_depth) */
1506
1507 } /* end if (non-trivial mask) */
1508
1509} /* end png_combine_row() */
1510
1511#endif /* PNG_HAVE_MMX_COMBINE_ROW */
1512
1513
1514
1515
1516/*===========================================================================*/
1517/* */
1518/* P N G _ D O _ R E A D _ I N T E R L A C E */
1519/* */
1520/*===========================================================================*/
1521
1522#if defined(PNG_READ_INTERLACING_SUPPORTED)
1523#if defined(PNG_HAVE_MMX_READ_INTERLACE)
1524
1525/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1526 * has taken place. [GRR: what other steps come before and/or after?]
1527 */
1528
1529void /* PRIVATE */
1530png_do_read_interlace(png_structp png_ptr)
1531{
1532 png_row_infop row_info = &(png_ptr->row_info);
1533 png_bytep row = png_ptr->row_buf + 1;
1534 int pass = png_ptr->pass;
1535#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1536 png_uint_32 transformations = png_ptr->transformations;
1537#endif
1538
1539 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1540
1541#if defined(PNG_MMX_CODE_SUPPORTED)
1542 if (_mmx_supported == 2) {
1543#if !defined(PNG_1_0_X)
1544 /* this should have happened in png_init_mmx_flags() already */
1545 png_warning(png_ptr, "asm_flags may not have been initialized");
1546#endif
1547 png_mmx_support();
1548 }
1549#endif
1550
1551 if (row != NULL && row_info != NULL)
1552 {
1553 png_uint_32 final_width;
1554
1555 final_width = row_info->width * png_pass_inc[pass];
1556
1557 switch (row_info->pixel_depth)
1558 {
         /* 1 bpp: expand the packed row in place, right to left, replicating
          * each source bit png_pass_inc[pass] times so the row reaches
          * final_width.  sp/dp walk backwards so source bits are never
          * overwritten before they are read. */
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            /* start at the byte holding the last source / dest pixel */
            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               /* packswap: bit order within each byte is reversed */
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  /* clear the target bit, then OR in the replicated value */
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
1614
         /* 2 bpp: same right-to-left in-place expansion as the 1-bpp case,
          * but moving 2-bit samples (shift steps of 2). */
         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            /* start at the byte holding the last source / dest pixel */
            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               /* packswap: sample order within each byte is reversed */
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  /* clear the 2-bit target field, then OR in the value */
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
1671
         /* 4 bpp: same right-to-left in-place expansion, moving 4-bit
          * samples (two per byte, shift steps of 4). */
         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            /* start at the byte holding the last source / dest pixel */
            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               /* packswap: nibble order within each byte is reversed */
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  /* clear the target nibble, then OR in the value */
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
1728
1729 /*====================================================================*/
1730
1731 default: /* 8-bit or larger (this is where the routine is modified) */
1732 {
1733#if 0
1734// static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1735// static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1736// unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1737// unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1738#endif
1739 png_bytep sptr, dp;
1740 png_uint_32 i;
1741 png_size_t pixel_bytes;
1742 int width = (int)row_info->width;
1743
1744 pixel_bytes = (row_info->pixel_depth >> 3);
1745
1746 /* point sptr at the last pixel in the pre-expanded row: */
1747 sptr = row + (width - 1) * pixel_bytes;
1748
1749 /* point dp at the last pixel position in the expanded row: */
1750 dp = row + (final_width - 1) * pixel_bytes;
1751
1752 /* New code by Nirav Chhatrapati - Intel Corporation */
1753
1754#if defined(PNG_MMX_CODE_SUPPORTED)
1755#if !defined(PNG_1_0_X)
1756 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1757 /* && _mmx_supported */ )
1758#else
1759 if (_mmx_supported)
1760#endif
1761 {
1762 //--------------------------------------------------------------
1763 if (pixel_bytes == 3)
1764 {
1765 if (((pass == 0) || (pass == 1)) && width)
1766 {
1767 int dummy_value_c; // fix 'forbidden register spilled'
1768 int dummy_value_S;
1769 int dummy_value_D;
1770 int dummy_value_a;
1771
1772 __asm__ __volatile__ (
1773 "subl $21, %%edi \n\t"
1774 // (png_pass_inc[pass] - 1)*pixel_bytes
1775
1776 ".loop3_pass0: \n\t"
1777 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1778 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1779 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1780 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1781 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1782 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1783 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1784 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1785 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1786 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1787 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1788 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1789 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1790 "movq %%mm4, 16(%%edi) \n\t"
1791 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1792 "movq %%mm3, 8(%%edi) \n\t"
1793 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1794 "subl $3, %%esi \n\t"
1795 "movq %%mm0, (%%edi) \n\t"
1796 "subl $24, %%edi \n\t"
1797 "decl %%ecx \n\t"
1798 "jnz .loop3_pass0 \n\t"
1799 "EMMS \n\t" // DONE
1800
1801 : "=c" (dummy_value_c), // output regs (dummy)
1802 "=S" (dummy_value_S),
1803 "=D" (dummy_value_D),
1804 "=a" (dummy_value_a)
1805
1806
1807 : "1" (sptr), // esi // input regs
1808 "2" (dp), // edi
1809 "0" (width), // ecx
1810 "3" (&_const4) // %1(?) (0x0000000000FFFFFFLL)
1811
1812#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1813 : "%mm0", "%mm1", "%mm2" // clobber list
1814 , "%mm3", "%mm4"
1815#endif
1816 );
1817 }
1818 else if (((pass == 2) || (pass == 3)) && width)
1819 {
1820 int dummy_value_c; // fix 'forbidden register spilled'
1821 int dummy_value_S;
1822 int dummy_value_D;
1823 int dummy_value_a;
1824
1825 __asm__ __volatile__ (
1826 "subl $9, %%edi \n\t"
1827 // (png_pass_inc[pass] - 1)*pixel_bytes
1828
1829 ".loop3_pass2: \n\t"
1830 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1831 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1832 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1833 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1834 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1835 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1836 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1837 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1838 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1839 "movq %%mm0, 4(%%edi) \n\t"
1840 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1841 "subl $3, %%esi \n\t"
1842 "movd %%mm0, (%%edi) \n\t"
1843 "subl $12, %%edi \n\t"
1844 "decl %%ecx \n\t"
1845 "jnz .loop3_pass2 \n\t"
1846 "EMMS \n\t" // DONE
1847
1848 : "=c" (dummy_value_c), // output regs (dummy)
1849 "=S" (dummy_value_S),
1850 "=D" (dummy_value_D),
1851 "=a" (dummy_value_a)
1852
1853 : "1" (sptr), // esi // input regs
1854 "2" (dp), // edi
1855 "0" (width), // ecx
1856 "3" (&_const4) // (0x0000000000FFFFFFLL)
1857
1858#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1859 : "%mm0", "%mm1", "%mm2" // clobber list
1860#endif
1861 );
1862 }
1863 else if (width) /* && ((pass == 4) || (pass == 5)) */
1864 {
1865 int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1866 if (width_mmx < 0)
1867 width_mmx = 0;
1868 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1869 if (width_mmx)
1870 {
1871 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1872 // sptr points at last pixel in pre-expanded row
1873 // dp points at last pixel position in expanded row
1874 int dummy_value_c; // fix 'forbidden register spilled'
1875 int dummy_value_S;
1876 int dummy_value_D;
1877 int dummy_value_a;
1878 int dummy_value_d;
1879
1880 __asm__ __volatile__ (
1881 "subl $3, %%esi \n\t"
1882 "subl $9, %%edi \n\t"
1883 // (png_pass_inc[pass] + 1)*pixel_bytes
1884
1885 ".loop3_pass4: \n\t"
1886 "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1887 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1888 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1889 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1890 "pand (%3), %%mm1 \n\t" // z z z z z 2 1 0
1891 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1892 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1893 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1894 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1895 "movq %%mm0, (%%edi) \n\t"
1896 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1897 "pand (%4), %%mm3 \n\t" // z z z z z z z 5
1898 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1899 "subl $6, %%esi \n\t"
1900 "movd %%mm2, 8(%%edi) \n\t"
1901 "subl $12, %%edi \n\t"
1902 "subl $2, %%ecx \n\t"
1903 "jnz .loop3_pass4 \n\t"
1904 "EMMS \n\t" // DONE
1905
1906 : "=c" (dummy_value_c), // output regs (dummy)
1907 "=S" (dummy_value_S),
1908 "=D" (dummy_value_D),
1909 "=a" (dummy_value_a),
1910 "=d" (dummy_value_d)
1911
1912 : "1" (sptr), // esi // input regs
1913 "2" (dp), // edi
1914 "0" (width_mmx), // ecx
1915 "3" (&_const4), // 0x0000000000FFFFFFLL
1916 "4" (&_const6) // 0x00000000000000FFLL
1917
1918#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1919 : "%mm0", "%mm1" // clobber list
1920 , "%mm2", "%mm3"
1921#endif
1922 );
1923 }
1924
1925 sptr -= width_mmx*3;
1926 dp -= width_mmx*6;
1927 for (i = width; i; i--)
1928 {
1929 png_byte v[8];
1930 int j;
1931
1932 png_memcpy(v, sptr, 3);
1933 for (j = 0; j < png_pass_inc[pass]; j++)
1934 {
1935 png_memcpy(dp, v, 3);
1936 dp -= 3;
1937 }
1938 sptr -= 3;
1939 }
1940 }
1941 } /* end of pixel_bytes == 3 */
1942
1943 //--------------------------------------------------------------
1944 else if (pixel_bytes == 1)
1945 {
1946 if (((pass == 0) || (pass == 1)) && width)
1947 {
1948 int width_mmx = ((width >> 2) << 2);
1949 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1950 if (width_mmx)
1951 {
1952 int dummy_value_c; // fix 'forbidden register spilled'
1953 int dummy_value_S;
1954 int dummy_value_D;
1955
1956 __asm__ __volatile__ (
1957 "subl $3, %%esi \n\t"
1958 "subl $31, %%edi \n\t"
1959
1960 ".loop1_pass0: \n\t"
1961 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1962 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1963 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1964 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1965 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1966 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1967 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1968 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1969 "movq %%mm0, (%%edi) \n\t"
1970 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1971 "movq %%mm3, 8(%%edi) \n\t"
1972 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1973 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1974 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1975 "movq %%mm2, 16(%%edi) \n\t"
1976 "subl $4, %%esi \n\t"
1977 "movq %%mm4, 24(%%edi) \n\t"
1978 "subl $32, %%edi \n\t"
1979 "subl $4, %%ecx \n\t"
1980 "jnz .loop1_pass0 \n\t"
1981 "EMMS \n\t" // DONE
1982
1983 : "=c" (dummy_value_c), // output regs (dummy)
1984 "=S" (dummy_value_S),
1985 "=D" (dummy_value_D)
1986
1987 : "1" (sptr), // esi // input regs
1988 "2" (dp), // edi
1989 "0" (width_mmx) // ecx
1990
1991#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1992 : "%mm0", "%mm1", "%mm2" // clobber list
1993 , "%mm3", "%mm4"
1994#endif
1995 );
1996 }
1997
1998 sptr -= width_mmx;
1999 dp -= width_mmx*8;
2000 for (i = width; i; i--)
2001 {
2002 int j;
2003
2004 /* I simplified this part in version 1.0.4e
2005 * here and in several other instances where
2006 * pixel_bytes == 1 -- GR-P
2007 *
2008 * Original code:
2009 *
2010 * png_byte v[8];
2011 * png_memcpy(v, sptr, pixel_bytes);
2012 * for (j = 0; j < png_pass_inc[pass]; j++)
2013 * {
2014 * png_memcpy(dp, v, pixel_bytes);
2015 * dp -= pixel_bytes;
2016 * }
2017 * sptr -= pixel_bytes;
2018 *
2019 * Replacement code is in the next three lines:
2020 */
2021
2022 for (j = 0; j < png_pass_inc[pass]; j++)
2023 {
2024 *dp-- = *sptr;
2025 }
2026 --sptr;
2027 }
2028 }
2029 else if (((pass == 2) || (pass == 3)) && width)
2030 {
2031 int width_mmx = ((width >> 2) << 2);
2032 width -= width_mmx; // 0-3 pixels => 0-3 bytes
2033 if (width_mmx)
2034 {
2035 int dummy_value_c; // fix 'forbidden register spilled'
2036 int dummy_value_S;
2037 int dummy_value_D;
2038
2039 __asm__ __volatile__ (
2040 "subl $3, %%esi \n\t"
2041 "subl $15, %%edi \n\t"
2042
2043 ".loop1_pass2: \n\t"
2044 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2045 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2046 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
2047 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
2048 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
2049 "movq %%mm0, (%%edi) \n\t"
2050 "subl $4, %%esi \n\t"
2051 "movq %%mm1, 8(%%edi) \n\t"
2052 "subl $16, %%edi \n\t"
2053 "subl $4, %%ecx \n\t"
2054 "jnz .loop1_pass2 \n\t"
2055 "EMMS \n\t" // DONE
2056
2057 : "=c" (dummy_value_c), // output regs (dummy)
2058 "=S" (dummy_value_S),
2059 "=D" (dummy_value_D)
2060
2061 : "1" (sptr), // esi // input regs
2062 "2" (dp), // edi
2063 "0" (width_mmx) // ecx
2064
2065#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2066 : "%mm0", "%mm1" // clobber list
2067#endif
2068 );
2069 }
2070
2071 sptr -= width_mmx;
2072 dp -= width_mmx*4;
2073 for (i = width; i; i--)
2074 {
2075 int j;
2076
2077 for (j = 0; j < png_pass_inc[pass]; j++)
2078 {
2079 *dp-- = *sptr;
2080 }
2081 --sptr;
2082 }
2083 }
2084 else if (width) /* && ((pass == 4) || (pass == 5)) */
2085 {
2086 int width_mmx = ((width >> 3) << 3);
            width -= width_mmx;        // 0-7 pixels => 0-7 bytes
2088 if (width_mmx)
2089 {
2090 int dummy_value_c; // fix 'forbidden register spilled'
2091 int dummy_value_S;
2092 int dummy_value_D;
2093
2094 __asm__ __volatile__ (
2095 "subl $7, %%esi \n\t"
2096 "subl $15, %%edi \n\t"
2097
2098 ".loop1_pass4: \n\t"
2099 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2100 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2101 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2102 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
2103 "movq %%mm1, 8(%%edi) \n\t"
2104 "subl $8, %%esi \n\t"
2105 "movq %%mm0, (%%edi) \n\t"
2106 "subl $16, %%edi \n\t"
2107 "subl $8, %%ecx \n\t"
2108 "jnz .loop1_pass4 \n\t"
2109 "EMMS \n\t" // DONE
2110
2111 : "=c" (dummy_value_c), // output regs (none)
2112 "=S" (dummy_value_S),
2113 "=D" (dummy_value_D)
2114
2115 : "1" (sptr), // esi // input regs
2116 "2" (dp), // edi
2117 "0" (width_mmx) // ecx
2118
2119#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2120 : "%mm0", "%mm1" // clobber list
2121#endif
2122 );
2123 }
2124
2125 sptr -= width_mmx;
2126 dp -= width_mmx*2;
2127 for (i = width; i; i--)
2128 {
2129 int j;
2130
2131 for (j = 0; j < png_pass_inc[pass]; j++)
2132 {
2133 *dp-- = *sptr;
2134 }
2135 --sptr;
2136 }
2137 }
2138 } /* end of pixel_bytes == 1 */
2139
2140 //--------------------------------------------------------------
2141 else if (pixel_bytes == 2)
2142 {
2143 if (((pass == 0) || (pass == 1)) && width)
2144 {
2145 int width_mmx = ((width >> 1) << 1);
2146 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2147 if (width_mmx)
2148 {
2149 int dummy_value_c; // fix 'forbidden register spilled'
2150 int dummy_value_S;
2151 int dummy_value_D;
2152
2153 __asm__ __volatile__ (
2154 "subl $2, %%esi \n\t"
2155 "subl $30, %%edi \n\t"
2156
2157 ".loop2_pass0: \n\t"
2158 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2159 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2160 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2161 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2162 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2163 "movq %%mm0, (%%edi) \n\t"
2164 "movq %%mm0, 8(%%edi) \n\t"
2165 "movq %%mm1, 16(%%edi) \n\t"
2166 "subl $4, %%esi \n\t"
2167 "movq %%mm1, 24(%%edi) \n\t"
2168 "subl $32, %%edi \n\t"
2169 "subl $2, %%ecx \n\t"
2170 "jnz .loop2_pass0 \n\t"
2171 "EMMS \n\t" // DONE
2172
2173 : "=c" (dummy_value_c), // output regs (dummy)
2174 "=S" (dummy_value_S),
2175 "=D" (dummy_value_D)
2176
2177 : "1" (sptr), // esi // input regs
2178 "2" (dp), // edi
2179 "0" (width_mmx) // ecx
2180
2181#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2182 : "%mm0", "%mm1" // clobber list
2183#endif
2184 );
2185 }
2186
2187 sptr -= (width_mmx*2 - 2); // sign fixed
2188 dp -= (width_mmx*16 - 2); // sign fixed
2189 for (i = width; i; i--)
2190 {
2191 png_byte v[8];
2192 int j;
2193 sptr -= 2;
2194 png_memcpy(v, sptr, 2);
2195 for (j = 0; j < png_pass_inc[pass]; j++)
2196 {
2197 dp -= 2;
2198 png_memcpy(dp, v, 2);
2199 }
2200 }
2201 }
2202 else if (((pass == 2) || (pass == 3)) && width)
2203 {
2204 int width_mmx = ((width >> 1) << 1) ;
2205 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2206 if (width_mmx)
2207 {
2208 int dummy_value_c; // fix 'forbidden register spilled'
2209 int dummy_value_S;
2210 int dummy_value_D;
2211
2212 __asm__ __volatile__ (
2213 "subl $2, %%esi \n\t"
2214 "subl $14, %%edi \n\t"
2215
2216 ".loop2_pass2: \n\t"
2217 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2218 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2219 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2220 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2221 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2222 "movq %%mm0, (%%edi) \n\t"
2223 "subl $4, %%esi \n\t"
2224 "movq %%mm1, 8(%%edi) \n\t"
2225 "subl $16, %%edi \n\t"
2226 "subl $2, %%ecx \n\t"
2227 "jnz .loop2_pass2 \n\t"
2228 "EMMS \n\t" // DONE
2229
2230 : "=c" (dummy_value_c), // output regs (dummy)
2231 "=S" (dummy_value_S),
2232 "=D" (dummy_value_D)
2233
2234 : "1" (sptr), // esi // input regs
2235 "2" (dp), // edi
2236 "0" (width_mmx) // ecx
2237
2238#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2239 : "%mm0", "%mm1" // clobber list
2240#endif
2241 );
2242 }
2243
2244 sptr -= (width_mmx*2 - 2); // sign fixed
2245 dp -= (width_mmx*8 - 2); // sign fixed
2246 for (i = width; i; i--)
2247 {
2248 png_byte v[8];
2249 int j;
2250 sptr -= 2;
2251 png_memcpy(v, sptr, 2);
2252 for (j = 0; j < png_pass_inc[pass]; j++)
2253 {
2254 dp -= 2;
2255 png_memcpy(dp, v, 2);
2256 }
2257 }
2258 }
2259 else if (width) // pass == 4 or 5
2260 {
2261 int width_mmx = ((width >> 1) << 1) ;
2262 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2263 if (width_mmx)
2264 {
2265 int dummy_value_c; // fix 'forbidden register spilled'
2266 int dummy_value_S;
2267 int dummy_value_D;
2268
2269 __asm__ __volatile__ (
2270 "subl $2, %%esi \n\t"
2271 "subl $6, %%edi \n\t"
2272
2273 ".loop2_pass4: \n\t"
2274 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2275 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2276 "subl $4, %%esi \n\t"
2277 "movq %%mm0, (%%edi) \n\t"
2278 "subl $8, %%edi \n\t"
2279 "subl $2, %%ecx \n\t"
2280 "jnz .loop2_pass4 \n\t"
2281 "EMMS \n\t" // DONE
2282
2283 : "=c" (dummy_value_c), // output regs (dummy)
2284 "=S" (dummy_value_S),
2285 "=D" (dummy_value_D)
2286
2287 : "1" (sptr), // esi // input regs
2288 "2" (dp), // edi
2289 "0" (width_mmx) // ecx
2290
2291#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2292 : "%mm0" // clobber list
2293#endif
2294 );
2295 }
2296
2297 sptr -= (width_mmx*2 - 2); // sign fixed
2298 dp -= (width_mmx*4 - 2); // sign fixed
2299 for (i = width; i; i--)
2300 {
2301 png_byte v[8];
2302 int j;
2303 sptr -= 2;
2304 png_memcpy(v, sptr, 2);
2305 for (j = 0; j < png_pass_inc[pass]; j++)
2306 {
2307 dp -= 2;
2308 png_memcpy(dp, v, 2);
2309 }
2310 }
2311 }
2312 } /* end of pixel_bytes == 2 */
2313
2314 //--------------------------------------------------------------
2315 else if (pixel_bytes == 4)
2316 {
2317 if (((pass == 0) || (pass == 1)) && width)
2318 {
2319 int width_mmx = ((width >> 1) << 1);
2320 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2321 if (width_mmx)
2322 {
2323 int dummy_value_c; // fix 'forbidden register spilled'
2324 int dummy_value_S;
2325 int dummy_value_D;
2326
2327 __asm__ __volatile__ (
2328 "subl $4, %%esi \n\t"
2329 "subl $60, %%edi \n\t"
2330
2331 ".loop4_pass0: \n\t"
2332 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2333 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2334 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2335 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2336 "movq %%mm0, (%%edi) \n\t"
2337 "movq %%mm0, 8(%%edi) \n\t"
2338 "movq %%mm0, 16(%%edi) \n\t"
2339 "movq %%mm0, 24(%%edi) \n\t"
2340 "movq %%mm1, 32(%%edi) \n\t"
2341 "movq %%mm1, 40(%%edi) \n\t"
2342 "movq %%mm1, 48(%%edi) \n\t"
2343 "subl $8, %%esi \n\t"
2344 "movq %%mm1, 56(%%edi) \n\t"
2345 "subl $64, %%edi \n\t"
2346 "subl $2, %%ecx \n\t"
2347 "jnz .loop4_pass0 \n\t"
2348 "EMMS \n\t" // DONE
2349
2350 : "=c" (dummy_value_c), // output regs (dummy)
2351 "=S" (dummy_value_S),
2352 "=D" (dummy_value_D)
2353
2354 : "1" (sptr), // esi // input regs
2355 "2" (dp), // edi
2356 "0" (width_mmx) // ecx
2357
2358#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2359 : "%mm0", "%mm1" // clobber list
2360#endif
2361 );
2362 }
2363
2364 sptr -= (width_mmx*4 - 4); // sign fixed
2365 dp -= (width_mmx*32 - 4); // sign fixed
2366 for (i = width; i; i--)
2367 {
2368 png_byte v[8];
2369 int j;
2370 sptr -= 4;
2371 png_memcpy(v, sptr, 4);
2372 for (j = 0; j < png_pass_inc[pass]; j++)
2373 {
2374 dp -= 4;
2375 png_memcpy(dp, v, 4);
2376 }
2377 }
2378 }
2379 else if (((pass == 2) || (pass == 3)) && width)
2380 {
2381 int width_mmx = ((width >> 1) << 1);
2382 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2383 if (width_mmx)
2384 {
2385 int dummy_value_c; // fix 'forbidden register spilled'
2386 int dummy_value_S;
2387 int dummy_value_D;
2388
2389 __asm__ __volatile__ (
2390 "subl $4, %%esi \n\t"
2391 "subl $28, %%edi \n\t"
2392
2393 ".loop4_pass2: \n\t"
2394 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2395 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2396 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2397 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2398 "movq %%mm0, (%%edi) \n\t"
2399 "movq %%mm0, 8(%%edi) \n\t"
2400 "movq %%mm1, 16(%%edi) \n\t"
2401 "movq %%mm1, 24(%%edi) \n\t"
2402 "subl $8, %%esi \n\t"
2403 "subl $32, %%edi \n\t"
2404 "subl $2, %%ecx \n\t"
2405 "jnz .loop4_pass2 \n\t"
2406 "EMMS \n\t" // DONE
2407
2408 : "=c" (dummy_value_c), // output regs (dummy)
2409 "=S" (dummy_value_S),
2410 "=D" (dummy_value_D)
2411
2412 : "1" (sptr), // esi // input regs
2413 "2" (dp), // edi
2414 "0" (width_mmx) // ecx
2415
2416#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2417 : "%mm0", "%mm1" // clobber list
2418#endif
2419 );
2420 }
2421
2422 sptr -= (width_mmx*4 - 4); // sign fixed
2423 dp -= (width_mmx*16 - 4); // sign fixed
2424 for (i = width; i; i--)
2425 {
2426 png_byte v[8];
2427 int j;
2428 sptr -= 4;
2429 png_memcpy(v, sptr, 4);
2430 for (j = 0; j < png_pass_inc[pass]; j++)
2431 {
2432 dp -= 4;
2433 png_memcpy(dp, v, 4);
2434 }
2435 }
2436 }
2437 else if (width) // pass == 4 or 5
2438 {
2439 int width_mmx = ((width >> 1) << 1) ;
2440 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2441 if (width_mmx)
2442 {
2443 int dummy_value_c; // fix 'forbidden register spilled'
2444 int dummy_value_S;
2445 int dummy_value_D;
2446
2447 __asm__ __volatile__ (
2448 "subl $4, %%esi \n\t"
2449 "subl $12, %%edi \n\t"
2450
2451 ".loop4_pass4: \n\t"
2452 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2453 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2454 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2455 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2456 "movq %%mm0, (%%edi) \n\t"
2457 "subl $8, %%esi \n\t"
2458 "movq %%mm1, 8(%%edi) \n\t"
2459 "subl $16, %%edi \n\t"
2460 "subl $2, %%ecx \n\t"
2461 "jnz .loop4_pass4 \n\t"
2462 "EMMS \n\t" // DONE
2463
2464 : "=c" (dummy_value_c), // output regs (dummy)
2465 "=S" (dummy_value_S),
2466 "=D" (dummy_value_D)
2467
2468 : "1" (sptr), // esi // input regs
2469 "2" (dp), // edi
2470 "0" (width_mmx) // ecx
2471
2472#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2473 : "%mm0", "%mm1" // clobber list
2474#endif
2475 );
2476 }
2477
2478 sptr -= (width_mmx*4 - 4); // sign fixed
2479 dp -= (width_mmx*8 - 4); // sign fixed
2480 for (i = width; i; i--)
2481 {
2482 png_byte v[8];
2483 int j;
2484 sptr -= 4;
2485 png_memcpy(v, sptr, 4);
2486 for (j = 0; j < png_pass_inc[pass]; j++)
2487 {
2488 dp -= 4;
2489 png_memcpy(dp, v, 4);
2490 }
2491 }
2492 }
2493 } /* end of pixel_bytes == 4 */
2494
2495 //--------------------------------------------------------------
2496 else if (pixel_bytes == 8)
2497 {
2498// GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2499 // GRR NOTE: no need to combine passes here!
2500 if (((pass == 0) || (pass == 1)) && width)
2501 {
2502 int dummy_value_c; // fix 'forbidden register spilled'
2503 int dummy_value_S;
2504 int dummy_value_D;
2505
2506 // source is 8-byte RRGGBBAA
2507 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2508 __asm__ __volatile__ (
2509 "subl $56, %%edi \n\t" // start of last block
2510
2511 ".loop8_pass0: \n\t"
2512 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2513 "movq %%mm0, (%%edi) \n\t"
2514 "movq %%mm0, 8(%%edi) \n\t"
2515 "movq %%mm0, 16(%%edi) \n\t"
2516 "movq %%mm0, 24(%%edi) \n\t"
2517 "movq %%mm0, 32(%%edi) \n\t"
2518 "movq %%mm0, 40(%%edi) \n\t"
2519 "movq %%mm0, 48(%%edi) \n\t"
2520 "subl $8, %%esi \n\t"
2521 "movq %%mm0, 56(%%edi) \n\t"
2522 "subl $64, %%edi \n\t"
2523 "decl %%ecx \n\t"
2524 "jnz .loop8_pass0 \n\t"
2525 "EMMS \n\t" // DONE
2526
2527 : "=c" (dummy_value_c), // output regs (dummy)
2528 "=S" (dummy_value_S),
2529 "=D" (dummy_value_D)
2530
2531 : "1" (sptr), // esi // input regs
2532 "2" (dp), // edi
2533 "0" (width) // ecx
2534
2535#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2536 : "%mm0" // clobber list
2537#endif
2538 );
2539 }
2540 else if (((pass == 2) || (pass == 3)) && width)
2541 {
2542 // source is 8-byte RRGGBBAA
2543 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2544 // (recall that expansion is _in place_: sptr and dp
2545 // both point at locations within same row buffer)
2546 {
2547 int dummy_value_c; // fix 'forbidden register spilled'
2548 int dummy_value_S;
2549 int dummy_value_D;
2550
2551 __asm__ __volatile__ (
2552 "subl $24, %%edi \n\t" // start of last block
2553
2554 ".loop8_pass2: \n\t"
2555 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2556 "movq %%mm0, (%%edi) \n\t"
2557 "movq %%mm0, 8(%%edi) \n\t"
2558 "movq %%mm0, 16(%%edi) \n\t"
2559 "subl $8, %%esi \n\t"
2560 "movq %%mm0, 24(%%edi) \n\t"
2561 "subl $32, %%edi \n\t"
2562 "decl %%ecx \n\t"
2563 "jnz .loop8_pass2 \n\t"
2564 "EMMS \n\t" // DONE
2565
2566 : "=c" (dummy_value_c), // output regs (dummy)
2567 "=S" (dummy_value_S),
2568 "=D" (dummy_value_D)
2569
2570 : "1" (sptr), // esi // input regs
2571 "2" (dp), // edi
2572 "0" (width) // ecx
2573
2574#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2575 : "%mm0" // clobber list
2576#endif
2577 );
2578 }
2579 }
2580 else if (width) // pass == 4 or 5
2581 {
2582 // source is 8-byte RRGGBBAA
2583 // dest is 16-byte RRGGBBAA RRGGBBAA
2584 {
2585 int dummy_value_c; // fix 'forbidden register spilled'
2586 int dummy_value_S;
2587 int dummy_value_D;
2588
2589 __asm__ __volatile__ (
2590 "subl $8, %%edi \n\t" // start of last block
2591
2592 ".loop8_pass4: \n\t"
2593 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2594 "movq %%mm0, (%%edi) \n\t"
2595 "subl $8, %%esi \n\t"
2596 "movq %%mm0, 8(%%edi) \n\t"
2597 "subl $16, %%edi \n\t"
2598 "decl %%ecx \n\t"
2599 "jnz .loop8_pass4 \n\t"
2600 "EMMS \n\t" // DONE
2601
2602 : "=c" (dummy_value_c), // output regs (dummy)
2603 "=S" (dummy_value_S),
2604 "=D" (dummy_value_D)
2605
2606 : "1" (sptr), // esi // input regs
2607 "2" (dp), // edi
2608 "0" (width) // ecx
2609
2610#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2611 : "%mm0" // clobber list
2612#endif
2613 );
2614 }
2615 }
2616
2617 } /* end of pixel_bytes == 8 */
2618
2619 //--------------------------------------------------------------
2620 else if (pixel_bytes == 6)
2621 {
2622 for (i = width; i; i--)
2623 {
2624 png_byte v[8];
2625 int j;
2626 png_memcpy(v, sptr, 6);
2627 for (j = 0; j < png_pass_inc[pass]; j++)
2628 {
2629 png_memcpy(dp, v, 6);
2630 dp -= 6;
2631 }
2632 sptr -= 6;
2633 }
2634 } /* end of pixel_bytes == 6 */
2635
2636 //--------------------------------------------------------------
2637 else
2638 {
2639 for (i = width; i; i--)
2640 {
2641 png_byte v[8];
2642 int j;
2643 png_memcpy(v, sptr, pixel_bytes);
2644 for (j = 0; j < png_pass_inc[pass]; j++)
2645 {
2646 png_memcpy(dp, v, pixel_bytes);
2647 dp -= pixel_bytes;
2648 }
2649 sptr-= pixel_bytes;
2650 }
2651 }
2652 } // end of _mmx_supported ========================================
2653
2654 else /* MMX not supported: use modified C code - takes advantage
2655 * of inlining of png_memcpy for a constant */
2656 /* GRR 19991007: does it? or should pixel_bytes in each
2657 * block be replaced with immediate value (e.g., 1)? */
2658 /* GRR 19991017: replaced with constants in each case */
2659#endif /* PNG_MMX_CODE_SUPPORTED */
2660 {
2661 if (pixel_bytes == 1)
2662 {
2663 for (i = width; i; i--)
2664 {
2665 int j;
2666 for (j = 0; j < png_pass_inc[pass]; j++)
2667 {
2668 *dp-- = *sptr;
2669 }
2670 --sptr;
2671 }
2672 }
2673 else if (pixel_bytes == 3)
2674 {
2675 for (i = width; i; i--)
2676 {
2677 png_byte v[8];
2678 int j;
2679 png_memcpy(v, sptr, 3);
2680 for (j = 0; j < png_pass_inc[pass]; j++)
2681 {
2682 png_memcpy(dp, v, 3);
2683 dp -= 3;
2684 }
2685 sptr -= 3;
2686 }
2687 }
2688 else if (pixel_bytes == 2)
2689 {
2690 for (i = width; i; i--)
2691 {
2692 png_byte v[8];
2693 int j;
2694 png_memcpy(v, sptr, 2);
2695 for (j = 0; j < png_pass_inc[pass]; j++)
2696 {
2697 png_memcpy(dp, v, 2);
2698 dp -= 2;
2699 }
2700 sptr -= 2;
2701 }
2702 }
2703 else if (pixel_bytes == 4)
2704 {
2705 for (i = width; i; i--)
2706 {
2707 png_byte v[8];
2708 int j;
2709 png_memcpy(v, sptr, 4);
2710 for (j = 0; j < png_pass_inc[pass]; j++)
2711 {
2712#ifdef PNG_DEBUG
2713 if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2714 {
2715 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2716 row, dp, row+png_ptr->row_buf_size);
2717 printf("row_buf=%d\n",png_ptr->row_buf_size);
2718 }
2719#endif
2720 png_memcpy(dp, v, 4);
2721 dp -= 4;
2722 }
2723 sptr -= 4;
2724 }
2725 }
2726 else if (pixel_bytes == 6)
2727 {
2728 for (i = width; i; i--)
2729 {
2730 png_byte v[8];
2731 int j;
2732 png_memcpy(v, sptr, 6);
2733 for (j = 0; j < png_pass_inc[pass]; j++)
2734 {
2735 png_memcpy(dp, v, 6);
2736 dp -= 6;
2737 }
2738 sptr -= 6;
2739 }
2740 }
2741 else if (pixel_bytes == 8)
2742 {
2743 for (i = width; i; i--)
2744 {
2745 png_byte v[8];
2746 int j;
2747 png_memcpy(v, sptr, 8);
2748 for (j = 0; j < png_pass_inc[pass]; j++)
2749 {
2750 png_memcpy(dp, v, 8);
2751 dp -= 8;
2752 }
2753 sptr -= 8;
2754 }
2755 }
2756 else /* GRR: should never be reached */
2757 {
2758 for (i = width; i; i--)
2759 {
2760 png_byte v[8];
2761 int j;
2762 png_memcpy(v, sptr, pixel_bytes);
2763 for (j = 0; j < png_pass_inc[pass]; j++)
2764 {
2765 png_memcpy(dp, v, pixel_bytes);
2766 dp -= pixel_bytes;
2767 }
2768 sptr -= pixel_bytes;
2769 }
2770 }
2771
2772 } /* end if (MMX not supported) */
2773 break;
2774 }
2775 } /* end switch (row_info->pixel_depth) */
2776
2777 row_info->width = final_width;
2778
2779 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
2780 }
2781
2782} /* end png_do_read_interlace() */
2783
2784#endif /* PNG_HAVE_MMX_READ_INTERLACE */
2785#endif /* PNG_READ_INTERLACING_SUPPORTED */
2786
2787
2788
2789#if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
2790#if defined(PNG_MMX_CODE_SUPPORTED)
2791
2792// These variables are utilized in the functions below. They are declared
2793// globally here to ensure alignment on 8-byte boundaries.
2794
2795union uAll {
2796 long long use;
2797 double align;
2798} _LBCarryMask = {0x0101010101010101LL},
2799 _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2800 _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2801
2802#ifdef PNG_THREAD_UNSAFE_OK
2803//===========================================================================//
2804// //
2805// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2806// //
2807//===========================================================================//
2808
2809// Optimized code for PNG Average filter decoder
2810
2811static void /* PRIVATE */
2812png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2813 png_bytep prev_row)
2814{
2815 int bpp;
2816 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2817 int dummy_value_S;
2818 int dummy_value_D;
2819
2820 bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
2821 _FullLength = row_info->rowbytes; // # of bytes to filter
2822
2823 __asm__ __volatile__ (
2824 // initialize address pointers and offset
2825#ifdef __PIC__
2826 "pushl %%ebx \n\t" // save index to Global Offset Table
2827#endif
2828//pre "movl row, %%edi \n\t" // edi: Avg(x)
2829 "xorl %%ebx, %%ebx \n\t" // ebx: x
2830 "movl %%edi, %%edx \n\t"
2831//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2832//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2833 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2834
2835 "xorl %%eax,%%eax \n\t"
2836
2837 // Compute the Raw value for the first bpp bytes
2838 // Raw(x) = Avg(x) + (Prior(x)/2)
2839 "avg_rlp: \n\t"
2840 "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2841 "incl %%ebx \n\t"
2842 "shrb %%al \n\t" // divide by 2
2843 "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2844//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2845 "cmpl %%ecx, %%ebx \n\t"
2846 "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2847 "jb avg_rlp \n\t" // mov does not affect flags
2848
2849 // get # of bytes to alignment
2850 "movl %%edi, _dif \n\t" // take start of row
2851 "addl %%ebx, _dif \n\t" // add bpp
2852 "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2853 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2854 "subl %%edi, _dif \n\t" // subtract from start => value ebx at
2855 "jz avg_go \n\t" // alignment
2856
2857 // fix alignment
2858 // Compute the Raw value for the bytes up to the alignment boundary
2859 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2860 "xorl %%ecx, %%ecx \n\t"
2861
2862 "avg_lp1: \n\t"
2863 "xorl %%eax, %%eax \n\t"
2864 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2865 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2866 "addw %%cx, %%ax \n\t"
2867 "incl %%ebx \n\t"
2868 "shrw %%ax \n\t" // divide by 2
2869 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2870 "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2871 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2872 "jb avg_lp1 \n\t" // repeat until at alignment boundary
2873
2874 "avg_go: \n\t"
2875 "movl _FullLength, %%eax \n\t"
2876 "movl %%eax, %%ecx \n\t"
2877 "subl %%ebx, %%eax \n\t" // subtract alignment fix
2878 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2879 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
2880 "movl %%ecx, _MMXLength \n\t"
2881#ifdef __PIC__
2882 "popl %%ebx \n\t" // restore index to Global Offset Table
2883#endif
2884
2885 : "=c" (dummy_value_c), // output regs (dummy)
2886 "=S" (dummy_value_S),
2887 "=D" (dummy_value_D)
2888
2889 : "0" (bpp), // ecx // input regs
2890 "1" (prev_row), // esi
2891 "2" (row) // edi
2892
2893 : "%eax", "%edx" // clobber list
2894#ifndef __PIC__
2895 , "%ebx"
2896#endif
2897 // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2898 // (seems to work fine without...)
2899 );
2900
2901 // now do the math for the rest of the row
2902 switch (bpp)
2903 {
2904 case 3:
2905 {
2906 _ActiveMask.use = 0x0000000000ffffffLL;
2907 _ShiftBpp.use = 24; // == 3 * 8
2908 _ShiftRem.use = 40; // == 64 - 24
2909
2910 __asm__ __volatile__ (
2911 // re-init address pointers and offset
2912 "movq _ActiveMask, %%mm7 \n\t"
2913 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2914 "movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
2915// preload "movl row, %%edi \n\t" // edi: Avg(x)
2916 "movq _HBClearMask, %%mm4 \n\t"
2917// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2918
2919 // prime the pump: load the first Raw(x-bpp) data set
2920 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2921 // (correct pos. in loop below)
2922 "avg_3lp: \n\t"
2923 "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2924 "movq %%mm5, %%mm3 \n\t"
2925 "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
2926 // data
2927 "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2928 "movq %%mm7, %%mm6 \n\t"
2929 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2930 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2931 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
2932 // byte
2933 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
2934 // each byte
2935 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2936 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2937 // LBCarrys
2938 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2939 // where both
2940 // lsb's were == 1 (only valid for active group)
2941 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2942 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2943 // byte
2944 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2945 // for each byte
2946 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
2947 // bytes to add to Avg
2948 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2949 // Avg for each Active
2950 // byte
2951 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2952 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
2953 // bytes 3-5
2954 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2955 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2956 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2957 // LBCarrys
2958 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2959 // where both
2960 // lsb's were == 1 (only valid for active group)
2961 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2962 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2963 // byte
2964 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2965 // for each byte
2966 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2967 // bytes to add to Avg
2968 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2969 // Avg for each Active
2970 // byte
2971
2972 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2973 "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
2974 // two
2975 // bytes
2976 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2977 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2978 // Data only needs to be shifted once here to
2979 // get the correct x-bpp offset.
2980 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2981 // LBCarrys
2982 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2983 // where both
2984 // lsb's were == 1 (only valid for active group)
2985 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2986 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2987 // byte
2988 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2989 // for each byte
2990 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2991 // bytes to add to Avg
2992 "addl $8, %%ecx \n\t"
2993 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2994 // Avg for each Active
2995 // byte
2996 // now ready to write back to memory
2997 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2998 // move updated Raw(x) to use as Raw(x-bpp) for next loop
2999 "cmpl _MMXLength, %%ecx \n\t"
3000 "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
3001 "jb avg_3lp \n\t"
3002
3003 : "=S" (dummy_value_S), // output regs (dummy)
3004 "=D" (dummy_value_D)
3005
3006 : "0" (prev_row), // esi // input regs
3007 "1" (row) // edi
3008
3009 : "%ecx" // clobber list
3010#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3011 , "%mm0", "%mm1", "%mm2", "%mm3"
3012 , "%mm4", "%mm5", "%mm6", "%mm7"
3013#endif
3014 );
3015 }
3016 break; // end 3 bpp
3017
3018 case 6:
3019 case 4:
3020 //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
3021 //case 5: // GRR BOGUS
3022 {
3023 _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
3024 // appropriate inactive bytes
3025 _ShiftBpp.use = bpp << 3;
3026 _ShiftRem.use = 64 - _ShiftBpp.use;
3027
3028 __asm__ __volatile__ (
3029 "movq _HBClearMask, %%mm4 \n\t"
3030
3031 // re-init address pointers and offset
3032 "movl _dif, %%ecx \n\t" // ecx: x = offset to
3033 // alignment boundary
3034
3035 // load _ActiveMask and clear all bytes except for 1st active group
3036 "movq _ActiveMask, %%mm7 \n\t"
3037// preload "movl row, %%edi \n\t" // edi: Avg(x)
3038 "psrlq _ShiftRem, %%mm7 \n\t"
3039// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3040 "movq %%mm7, %%mm6 \n\t"
3041 "movq _LBCarryMask, %%mm5 \n\t"
3042 "psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
3043 // group
3044
3045 // prime the pump: load the first Raw(x-bpp) data set
3046 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3047 // (we correct pos. in loop below)
3048 "avg_4lp: \n\t"
3049 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3050 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3051 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3052 // add (Prev_row/2) to average
3053 "movq %%mm5, %%mm3 \n\t"
3054 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3055 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3056 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3057 // byte
3058 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3059 // each byte
3060 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3061 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3062 // LBCarrys
3063 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3064 // where both
3065 // lsb's were == 1 (only valid for active group)
3066 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3067 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3068 // byte
3069 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3070 // for each byte
3071 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
3072 // bytes to add to Avg
3073 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3074 // for each Active
3075 // byte
3076 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3077 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3078 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3079 "addl $8, %%ecx \n\t"
3080 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3081 // LBCarrys
3082 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3083 // where both
3084 // lsb's were == 1 (only valid for active group)
3085 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3086 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3087 // byte
3088 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3089 // for each byte
3090 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3091 // bytes to add to Avg
3092 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3093 // Avg for each Active
3094 // byte
3095 "cmpl _MMXLength, %%ecx \n\t"
3096 // now ready to write back to memory
3097 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3098 // prep Raw(x-bpp) for next loop
3099 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3100 "jb avg_4lp \n\t"
3101
3102 : "=S" (dummy_value_S), // output regs (dummy)
3103 "=D" (dummy_value_D)
3104
3105 : "0" (prev_row), // esi // input regs
3106 "1" (row) // edi
3107
3108 : "%ecx" // clobber list
3109#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3110 , "%mm0", "%mm1", "%mm2", "%mm3"
3111 , "%mm4", "%mm5", "%mm6", "%mm7"
3112#endif
3113 );
3114 }
3115 break; // end 4,6 bpp
3116
3117 case 2:
3118 {
3119 _ActiveMask.use = 0x000000000000ffffLL;
3120 _ShiftBpp.use = 16; // == 2 * 8
3121 _ShiftRem.use = 48; // == 64 - 16
3122
3123 __asm__ __volatile__ (
3124 // load _ActiveMask
3125 "movq _ActiveMask, %%mm7 \n\t"
3126 // re-init address pointers and offset
3127 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
3128 // boundary
3129 "movq _LBCarryMask, %%mm5 \n\t"
3130// preload "movl row, %%edi \n\t" // edi: Avg(x)
3131 "movq _HBClearMask, %%mm4 \n\t"
3132// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3133
3134 // prime the pump: load the first Raw(x-bpp) data set
3135 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3136 // (we correct pos. in loop below)
3137 "avg_2lp: \n\t"
3138 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3139 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3140 "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
3141 // add (Prev_row/2) to average
3142 "movq %%mm5, %%mm3 \n\t"
3143 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3144 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3145 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3146 // byte
3147 "movq %%mm7, %%mm6 \n\t"
3148 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3149 // each byte
3150
3151 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3152 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3153 // LBCarrys
3154 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3155 // where both
3156 // lsb's were == 1 (only valid
3157 // for active group)
3158 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3159 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3160 // byte
3161 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3162 // for each byte
3163 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
3164 // bytes to add to Avg
3165 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3166 // for each Active byte
3167
3168 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3169 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3170 // bytes 2 & 3
3171 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3172 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3173 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3174 // LBCarrys
3175 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3176 // where both
3177 // lsb's were == 1 (only valid
3178 // for active group)
3179 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3180 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3181 // byte
3182 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3183 // for each byte
3184 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3185 // bytes to add to Avg
3186 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3187 // Avg for each Active byte
3188
3189 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3190 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3191 // bytes 4 & 5
3192 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3193 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3194 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3195 // LBCarrys
3196 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3197 // where both lsb's were == 1
3198 // (only valid for active group)
3199 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3200 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3201 // byte
3202 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3203 // for each byte
3204 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
3205 // bytes to add to Avg
3206 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3207 // Avg for each Active byte
3208
3209 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3210 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3211 // bytes 6 & 7
3212 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3213 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3214 "addl $8, %%ecx \n\t"
3215 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3216 // LBCarrys
3217 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3218 // where both
3219 // lsb's were == 1 (only valid
3220 // for active group)
3221 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3222 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3223 // byte
3224 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3225 // for each byte
3226 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 4
3227 // bytes to add to Avg
3228 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3229 // Avg for each Active byte
3230
3231 "cmpl _MMXLength, %%ecx \n\t"
3232 // now ready to write back to memory
3233 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3234 // prep Raw(x-bpp) for next loop
3235 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3236 "jb avg_2lp \n\t"
3237
3238 : "=S" (dummy_value_S), // output regs (dummy)
3239 "=D" (dummy_value_D)
3240
3241 : "0" (prev_row), // esi // input regs
3242 "1" (row) // edi
3243
3244 : "%ecx" // clobber list
3245#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3246 , "%mm0", "%mm1", "%mm2", "%mm3"
3247 , "%mm4", "%mm5", "%mm6", "%mm7"
3248#endif
3249 );
3250 }
3251 break; // end 2 bpp
3252
3253 case 1:
3254 {
3255 __asm__ __volatile__ (
3256 // re-init address pointers and offset
3257#ifdef __PIC__
3258 "pushl %%ebx \n\t" // save Global Offset Table index
3259#endif
3260 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
3261 // boundary
3262// preload "movl row, %%edi \n\t" // edi: Avg(x)
3263 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3264 "jnb avg_1end \n\t"
3265 // do Avg decode for remaining bytes
3266// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3267 "movl %%edi, %%edx \n\t"
3268// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3269 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3270 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3271 // in loop below
3272 "avg_1lp: \n\t"
3273 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3274 "xorl %%eax, %%eax \n\t"
3275 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3276 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3277 "addw %%cx, %%ax \n\t"
3278 "incl %%ebx \n\t"
3279 "shrw %%ax \n\t" // divide by 2
3280 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3281 // inc ebx
3282 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3283 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3284 // mov does not affect flags; -1 to offset inc ebx
3285 "jb avg_1lp \n\t"
3286
3287 "avg_1end: \n\t"
3288#ifdef __PIC__
3289 "popl %%ebx \n\t" // Global Offset Table index
3290#endif
3291
3292 : "=c" (dummy_value_c), // output regs (dummy)
3293 "=S" (dummy_value_S),
3294 "=D" (dummy_value_D)
3295
3296 : "0" (bpp), // ecx // input regs
3297 "1" (prev_row), // esi
3298 "2" (row) // edi
3299
3300 : "%eax", "%edx" // clobber list
3301#ifndef __PIC__
3302 , "%ebx"
3303#endif
3304 );
3305 }
3306 return; // end 1 bpp
3307
3308 case 8:
3309 {
3310 __asm__ __volatile__ (
3311 // re-init address pointers and offset
3312 "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
3313 "movq _LBCarryMask, %%mm5 \n\t" // boundary
3314// preload "movl row, %%edi \n\t" // edi: Avg(x)
3315 "movq _HBClearMask, %%mm4 \n\t"
3316// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3317
3318 // prime the pump: load the first Raw(x-bpp) data set
3319 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3320 // (NO NEED to correct pos. in loop below)
3321
3322 "avg_8lp: \n\t"
3323 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3324 "movq %%mm5, %%mm3 \n\t"
3325 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3326 "addl $8, %%ecx \n\t"
3327 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3328 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3329 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3330 // where both lsb's were == 1
3331 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3332 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3333 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3334 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3335 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
3336 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3337 "cmpl _MMXLength, %%ecx \n\t"
3338 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3339 "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3340 "jb avg_8lp \n\t"
3341
3342 : "=S" (dummy_value_S), // output regs (dummy)
3343 "=D" (dummy_value_D)
3344
3345 : "0" (prev_row), // esi // input regs
3346 "1" (row) // edi
3347
3348 : "%ecx" // clobber list
3349#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3350 , "%mm0", "%mm1", "%mm2"
3351 , "%mm3", "%mm4", "%mm5"
3352#endif
3353 );
3354 }
3355 break; // end 8 bpp
3356
3357 default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3358 {
3359
3360#ifdef PNG_DEBUG
3361 // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3362 png_debug(1,
3363 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3364#endif
3365
3366#if 0
3367 __asm__ __volatile__ (
3368 "movq _LBCarryMask, %%mm5 \n\t"
3369 // re-init address pointers and offset
3370 "movl _dif, %%ebx \n\t" // ebx: x = offset to
3371 // alignment boundary
3372 "movl row, %%edi \n\t" // edi: Avg(x)
3373 "movq _HBClearMask, %%mm4 \n\t"
3374 "movl %%edi, %%edx \n\t"
3375 "movl prev_row, %%esi \n\t" // esi: Prior(x)
3376 "subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3377 "avg_Alp: \n\t"
3378 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3379 "movq %%mm5, %%mm3 \n\t"
3380 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3381 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3382 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3383 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3384 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3385 // where both lsb's were == 1
3386 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3387 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3388 // byte
3389 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
3390 // byte
3391 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3392 // byte
3393 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3394 // each byte
3395 "addl $8, %%ebx \n\t"
3396 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3397 // byte
3398 "cmpl _MMXLength, %%ebx \n\t"
3399 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3400 "jb avg_Alp \n\t"
3401
3402 : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3403
3404 : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3405
3406 : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3407 );
3408#endif /* 0 - NEVER REACHED */
3409 }
3410 break;
3411
3412 } // end switch (bpp)
3413
3414 __asm__ __volatile__ (
3415 // MMX acceleration complete; now do clean-up
3416 // check if any remaining bytes left to decode
3417#ifdef __PIC__
3418 "pushl %%ebx \n\t" // save index to Global Offset Table
3419#endif
3420 "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3421//pre "movl row, %%edi \n\t" // edi: Avg(x)
3422 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3423 "jnb avg_end \n\t"
3424
3425 // do Avg decode for remaining bytes
3426//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3427 "movl %%edi, %%edx \n\t"
3428//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3429 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3430 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3431
3432 "avg_lp2: \n\t"
3433 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3434 "xorl %%eax, %%eax \n\t"
3435 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3436 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3437 "addw %%cx, %%ax \n\t"
3438 "incl %%ebx \n\t"
3439 "shrw %%ax \n\t" // divide by 2
3440 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3441 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3442 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3443 "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3444
3445 "avg_end: \n\t"
3446 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
3447#ifdef __PIC__
3448 "popl %%ebx \n\t" // restore index to Global Offset Table
3449#endif
3450
3451 : "=c" (dummy_value_c), // output regs (dummy)
3452 "=S" (dummy_value_S),
3453 "=D" (dummy_value_D)
3454
3455 : "0" (bpp), // ecx // input regs
3456 "1" (prev_row), // esi
3457 "2" (row) // edi
3458
3459 : "%eax", "%edx" // clobber list
3460#ifndef __PIC__
3461 , "%ebx"
3462#endif
3463 );
3464
3465} /* end png_read_filter_row_mmx_avg() */
3466#endif
3467
3468
3469
3470#ifdef PNG_THREAD_UNSAFE_OK
3471//===========================================================================//
3472// //
3473// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3474// //
3475//===========================================================================//
3476
3477// Optimized code for PNG Paeth filter decoder
3478
3479static void /* PRIVATE */
3480png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3481 png_bytep prev_row)
3482{
3483 int bpp;
3484 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3485 int dummy_value_S;
3486 int dummy_value_D;
3487
3488 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3489 _FullLength = row_info->rowbytes; // # of bytes to filter
3490
3491 __asm__ __volatile__ (
3492#ifdef __PIC__
3493 "pushl %%ebx \n\t" // save index to Global Offset Table
3494#endif
3495 "xorl %%ebx, %%ebx \n\t" // ebx: x offset
3496//pre "movl row, %%edi \n\t"
3497 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3498//pre "movl prev_row, %%esi \n\t"
3499 "xorl %%eax, %%eax \n\t"
3500
3501 // Compute the Raw value for the first bpp bytes
3502 // Note: the formula works out to be always
3503 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
3504 "paeth_rlp: \n\t"
3505 "movb (%%edi,%%ebx,), %%al \n\t"
3506 "addb (%%esi,%%ebx,), %%al \n\t"
3507 "incl %%ebx \n\t"
3508//pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3509 "cmpl %%ecx, %%ebx \n\t"
3510 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3511 "jb paeth_rlp \n\t"
3512 // get # of bytes to alignment
3513 "movl %%edi, _dif \n\t" // take start of row
3514 "addl %%ebx, _dif \n\t" // add bpp
3515 "xorl %%ecx, %%ecx \n\t"
3516 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
3517 // boundary
3518 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3519 "subl %%edi, _dif \n\t" // subtract from start ==> value ebx
3520 // at alignment
3521 "jz paeth_go \n\t"
3522 // fix alignment
3523
3524 "paeth_lp1: \n\t"
3525 "xorl %%eax, %%eax \n\t"
3526 // pav = p - a = (a + b - c) - a = b - c
3527 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3528 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3529 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3530 "movl %%eax, _patemp \n\t" // Save pav for later use
3531 "xorl %%eax, %%eax \n\t"
3532 // pbv = p - b = (a + b - c) - b = a - c
3533 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3534 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3535 "movl %%eax, %%ecx \n\t"
3536 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3537 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3538 // pc = abs(pcv)
3539 "testl $0x80000000, %%eax \n\t"
3540 "jz paeth_pca \n\t"
3541 "negl %%eax \n\t" // reverse sign of neg values
3542
3543 "paeth_pca: \n\t"
3544 "movl %%eax, _pctemp \n\t" // save pc for later use
3545 // pb = abs(pbv)
3546 "testl $0x80000000, %%ecx \n\t"
3547 "jz paeth_pba \n\t"
3548 "negl %%ecx \n\t" // reverse sign of neg values
3549
3550 "paeth_pba: \n\t"
3551 "movl %%ecx, _pbtemp \n\t" // save pb for later use
3552 // pa = abs(pav)
3553 "movl _patemp, %%eax \n\t"
3554 "testl $0x80000000, %%eax \n\t"
3555 "jz paeth_paa \n\t"
3556 "negl %%eax \n\t" // reverse sign of neg values
3557
3558 "paeth_paa: \n\t"
3559 "movl %%eax, _patemp \n\t" // save pa for later use
3560 // test if pa <= pb
3561 "cmpl %%ecx, %%eax \n\t"
3562 "jna paeth_abb \n\t"
3563 // pa > pb; now test if pb <= pc
3564 "cmpl _pctemp, %%ecx \n\t"
3565 "jna paeth_bbc \n\t"
3566 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3567 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3568 "jmp paeth_paeth \n\t"
3569
3570 "paeth_bbc: \n\t"
3571 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3572 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3573 "jmp paeth_paeth \n\t"
3574
3575 "paeth_abb: \n\t"
3576 // pa <= pb; now test if pa <= pc
3577 "cmpl _pctemp, %%eax \n\t"
3578 "jna paeth_abc \n\t"
3579 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3580 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3581 "jmp paeth_paeth \n\t"
3582
3583 "paeth_abc: \n\t"
3584 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3585 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3586
3587 "paeth_paeth: \n\t"
3588 "incl %%ebx \n\t"
3589 "incl %%edx \n\t"
3590 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3591 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3592 "cmpl _dif, %%ebx \n\t"
3593 "jb paeth_lp1 \n\t"
3594
3595 "paeth_go: \n\t"
3596 "movl _FullLength, %%ecx \n\t"
3597 "movl %%ecx, %%eax \n\t"
3598 "subl %%ebx, %%eax \n\t" // subtract alignment fix
3599 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3600 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
3601 "movl %%ecx, _MMXLength \n\t"
3602#ifdef __PIC__
3603 "popl %%ebx \n\t" // restore index to Global Offset Table
3604#endif
3605
3606 : "=c" (dummy_value_c), // output regs (dummy)
3607 "=S" (dummy_value_S),
3608 "=D" (dummy_value_D)
3609
3610 : "0" (bpp), // ecx // input regs
3611 "1" (prev_row), // esi
3612 "2" (row) // edi
3613
3614 : "%eax", "%edx" // clobber list
3615#ifndef __PIC__
3616 , "%ebx"
3617#endif
3618 );
3619
3620 // now do the math for the rest of the row
3621 switch (bpp)
3622 {
3623 case 3:
3624 {
3625 _ActiveMask.use = 0x0000000000ffffffLL;
3626 _ActiveMaskEnd.use = 0xffff000000000000LL;
3627 _ShiftBpp.use = 24; // == bpp(3) * 8
3628 _ShiftRem.use = 40; // == 64 - 24
3629
3630 __asm__ __volatile__ (
3631 "movl _dif, %%ecx \n\t"
3632// preload "movl row, %%edi \n\t"
3633// preload "movl prev_row, %%esi \n\t"
3634 "pxor %%mm0, %%mm0 \n\t"
3635 // prime the pump: load the first Raw(x-bpp) data set
3636 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3637 "paeth_3lp: \n\t"
3638 "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
3639 // 3 bytes
3640 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3641 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3642 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3643 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3644 "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
3645 // 3 bytes
3646 // pav = p - a = (a + b - c) - a = b - c
3647 "movq %%mm2, %%mm4 \n\t"
3648 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3649 // pbv = p - b = (a + b - c) - b = a - c
3650 "movq %%mm1, %%mm5 \n\t"
3651 "psubw %%mm3, %%mm4 \n\t"
3652 "pxor %%mm7, %%mm7 \n\t"
3653 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3654 "movq %%mm4, %%mm6 \n\t"
3655 "psubw %%mm3, %%mm5 \n\t"
3656
3657 // pa = abs(p-a) = abs(pav)
3658 // pb = abs(p-b) = abs(pbv)
3659 // pc = abs(p-c) = abs(pcv)
3660 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3661 "paddw %%mm5, %%mm6 \n\t"
3662 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3663 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3664 "psubw %%mm0, %%mm4 \n\t"
3665 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3666 "psubw %%mm0, %%mm4 \n\t"
3667 "psubw %%mm7, %%mm5 \n\t"
3668 "pxor %%mm0, %%mm0 \n\t"
3669 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3670 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3671 "psubw %%mm7, %%mm5 \n\t"
3672 "psubw %%mm0, %%mm6 \n\t"
3673 // test pa <= pb
3674 "movq %%mm4, %%mm7 \n\t"
3675 "psubw %%mm0, %%mm6 \n\t"
3676 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3677 "movq %%mm7, %%mm0 \n\t"
3678 // use mm7 mask to merge pa & pb
3679 "pand %%mm7, %%mm5 \n\t"
3680 // use mm0 mask copy to merge a & b
3681 "pand %%mm0, %%mm2 \n\t"
3682 "pandn %%mm4, %%mm7 \n\t"
3683 "pandn %%mm1, %%mm0 \n\t"
3684 "paddw %%mm5, %%mm7 \n\t"
3685 "paddw %%mm2, %%mm0 \n\t"
3686 // test ((pa <= pb)? pa:pb) <= pc
3687 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3688 "pxor %%mm1, %%mm1 \n\t"
3689 "pand %%mm7, %%mm3 \n\t"
3690 "pandn %%mm0, %%mm7 \n\t"
3691 "paddw %%mm3, %%mm7 \n\t"
3692 "pxor %%mm0, %%mm0 \n\t"
3693 "packuswb %%mm1, %%mm7 \n\t"
3694 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3695 "pand _ActiveMask, %%mm7 \n\t"
3696 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3697 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3698 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3699 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3700 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
3701 // Raw(x-bpp)
3702 // now do Paeth for 2nd set of bytes (3-5)
3703 "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3704 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3705 "pxor %%mm7, %%mm7 \n\t"
3706 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3707 // pbv = p - b = (a + b - c) - b = a - c
3708 "movq %%mm1, %%mm5 \n\t"
3709 // pav = p - a = (a + b - c) - a = b - c
3710 "movq %%mm2, %%mm4 \n\t"
3711 "psubw %%mm3, %%mm5 \n\t"
3712 "psubw %%mm3, %%mm4 \n\t"
3713 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3714 // pav + pbv = pbv + pav
3715 "movq %%mm5, %%mm6 \n\t"
3716 "paddw %%mm4, %%mm6 \n\t"
3717
3718 // pa = abs(p-a) = abs(pav)
3719 // pb = abs(p-b) = abs(pbv)
3720 // pc = abs(p-c) = abs(pcv)
3721 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3722 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3723 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3724 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3725 "psubw %%mm0, %%mm5 \n\t"
3726 "psubw %%mm7, %%mm4 \n\t"
3727 "psubw %%mm0, %%mm5 \n\t"
3728 "psubw %%mm7, %%mm4 \n\t"
3729 "pxor %%mm0, %%mm0 \n\t"
3730 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3731 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3732 "psubw %%mm0, %%mm6 \n\t"
3733 // test pa <= pb
3734 "movq %%mm4, %%mm7 \n\t"
3735 "psubw %%mm0, %%mm6 \n\t"
3736 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3737 "movq %%mm7, %%mm0 \n\t"
3738 // use mm7 mask to merge pa & pb
3739 "pand %%mm7, %%mm5 \n\t"
3740 // use mm0 mask copy to merge a & b
3741 "pand %%mm0, %%mm2 \n\t"
3742 "pandn %%mm4, %%mm7 \n\t"
3743 "pandn %%mm1, %%mm0 \n\t"
3744 "paddw %%mm5, %%mm7 \n\t"
3745 "paddw %%mm2, %%mm0 \n\t"
3746 // test ((pa <= pb)? pa:pb) <= pc
3747 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3748 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3749 "pand %%mm7, %%mm3 \n\t"
3750 "pandn %%mm0, %%mm7 \n\t"
3751 "pxor %%mm1, %%mm1 \n\t"
3752 "paddw %%mm3, %%mm7 \n\t"
3753 "pxor %%mm0, %%mm0 \n\t"
3754 "packuswb %%mm1, %%mm7 \n\t"
3755 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3756 "pand _ActiveMask, %%mm7 \n\t"
3757 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3758 "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
3759 // 3 bytes
3760 // pav = p - a = (a + b - c) - a = b - c
3761 "movq %%mm2, %%mm4 \n\t"
3762 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3763 "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3764 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3765 "movq %%mm7, %%mm1 \n\t"
3766 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3767 "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3768 // now mm1 will be used as Raw(x-bpp)
3769 // now do Paeth for 3rd, and final, set of bytes (6-7)
3770 "pxor %%mm7, %%mm7 \n\t"
3771 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3772 "psubw %%mm3, %%mm4 \n\t"
3773 // pbv = p - b = (a + b - c) - b = a - c
3774 "movq %%mm1, %%mm5 \n\t"
3775 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3776 "movq %%mm4, %%mm6 \n\t"
3777 "psubw %%mm3, %%mm5 \n\t"
3778 "pxor %%mm0, %%mm0 \n\t"
3779 "paddw %%mm5, %%mm6 \n\t"
3780
3781 // pa = abs(p-a) = abs(pav)
3782 // pb = abs(p-b) = abs(pbv)
3783 // pc = abs(p-c) = abs(pcv)
3784 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3785 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3786 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3787 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3788 "psubw %%mm0, %%mm4 \n\t"
3789 "psubw %%mm7, %%mm5 \n\t"
3790 "psubw %%mm0, %%mm4 \n\t"
3791 "psubw %%mm7, %%mm5 \n\t"
3792 "pxor %%mm0, %%mm0 \n\t"
3793 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3794 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3795 "psubw %%mm0, %%mm6 \n\t"
3796 // test pa <= pb
3797 "movq %%mm4, %%mm7 \n\t"
3798 "psubw %%mm0, %%mm6 \n\t"
3799 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3800 "movq %%mm7, %%mm0 \n\t"
3801 // use mm0 mask copy to merge a & b
3802 "pand %%mm0, %%mm2 \n\t"
3803 // use mm7 mask to merge pa & pb
3804 "pand %%mm7, %%mm5 \n\t"
3805 "pandn %%mm1, %%mm0 \n\t"
3806 "pandn %%mm4, %%mm7 \n\t"
3807 "paddw %%mm2, %%mm0 \n\t"
3808 "paddw %%mm5, %%mm7 \n\t"
3809 // test ((pa <= pb)? pa:pb) <= pc
3810 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3811 "pand %%mm7, %%mm3 \n\t"
3812 "pandn %%mm0, %%mm7 \n\t"
3813 "paddw %%mm3, %%mm7 \n\t"
3814 "pxor %%mm1, %%mm1 \n\t"
3815 "packuswb %%mm7, %%mm1 \n\t"
3816 // step ecx to next set of 8 bytes and repeat loop til done
3817 "addl $8, %%ecx \n\t"
3818 "pand _ActiveMaskEnd, %%mm1 \n\t"
3819 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3820 // Raw(x)
3821
3822 "cmpl _MMXLength, %%ecx \n\t"
3823 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3824 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3825 // mm1 will be used as Raw(x-bpp) next loop
3826 // mm3 ready to be used as Prior(x-bpp) next loop
3827 "jb paeth_3lp \n\t"
3828
3829 : "=S" (dummy_value_S), // output regs (dummy)
3830 "=D" (dummy_value_D)
3831
3832 : "0" (prev_row), // esi // input regs
3833 "1" (row) // edi
3834
3835 : "%ecx" // clobber list
3836#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3837 , "%mm0", "%mm1", "%mm2", "%mm3"
3838 , "%mm4", "%mm5", "%mm6", "%mm7"
3839#endif
3840 );
3841 }
3842 break; // end 3 bpp
3843
3844 case 6:
3845 //case 7: // GRR BOGUS
3846 //case 5: // GRR BOGUS
3847 {
3848 _ActiveMask.use = 0x00000000ffffffffLL;
3849 _ActiveMask2.use = 0xffffffff00000000LL;
3850 _ShiftBpp.use = bpp << 3; // == bpp * 8
3851 _ShiftRem.use = 64 - _ShiftBpp.use;
3852
3853 __asm__ __volatile__ (
3854 "movl _dif, %%ecx \n\t"
3855// preload "movl row, %%edi \n\t"
3856// preload "movl prev_row, %%esi \n\t"
3857 // prime the pump: load the first Raw(x-bpp) data set
3858 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3859 "pxor %%mm0, %%mm0 \n\t"
3860
3861 "paeth_6lp: \n\t"
3862 // must shift to position Raw(x-bpp) data
3863 "psrlq _ShiftRem, %%mm1 \n\t"
3864 // do first set of 4 bytes
3865 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3866 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3867 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3868 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3869 // must shift to position Prior(x-bpp) data
3870 "psrlq _ShiftRem, %%mm3 \n\t"
3871 // pav = p - a = (a + b - c) - a = b - c
3872 "movq %%mm2, %%mm4 \n\t"
3873 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3874 // pbv = p - b = (a + b - c) - b = a - c
3875 "movq %%mm1, %%mm5 \n\t"
3876 "psubw %%mm3, %%mm4 \n\t"
3877 "pxor %%mm7, %%mm7 \n\t"
3878 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3879 "movq %%mm4, %%mm6 \n\t"
3880 "psubw %%mm3, %%mm5 \n\t"
3881 // pa = abs(p-a) = abs(pav)
3882 // pb = abs(p-b) = abs(pbv)
3883 // pc = abs(p-c) = abs(pcv)
3884 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3885 "paddw %%mm5, %%mm6 \n\t"
3886 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3887 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3888 "psubw %%mm0, %%mm4 \n\t"
3889 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3890 "psubw %%mm0, %%mm4 \n\t"
3891 "psubw %%mm7, %%mm5 \n\t"
3892 "pxor %%mm0, %%mm0 \n\t"
3893 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3894 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3895 "psubw %%mm7, %%mm5 \n\t"
3896 "psubw %%mm0, %%mm6 \n\t"
3897 // test pa <= pb
3898 "movq %%mm4, %%mm7 \n\t"
3899 "psubw %%mm0, %%mm6 \n\t"
3900 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3901 "movq %%mm7, %%mm0 \n\t"
3902 // use mm7 mask to merge pa & pb
3903 "pand %%mm7, %%mm5 \n\t"
3904 // use mm0 mask copy to merge a & b
3905 "pand %%mm0, %%mm2 \n\t"
3906 "pandn %%mm4, %%mm7 \n\t"
3907 "pandn %%mm1, %%mm0 \n\t"
3908 "paddw %%mm5, %%mm7 \n\t"
3909 "paddw %%mm2, %%mm0 \n\t"
3910 // test ((pa <= pb)? pa:pb) <= pc
3911 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3912 "pxor %%mm1, %%mm1 \n\t"
3913 "pand %%mm7, %%mm3 \n\t"
3914 "pandn %%mm0, %%mm7 \n\t"
3915 "paddw %%mm3, %%mm7 \n\t"
3916 "pxor %%mm0, %%mm0 \n\t"
3917 "packuswb %%mm1, %%mm7 \n\t"
3918 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3919 "pand _ActiveMask, %%mm7 \n\t"
3920 "psrlq _ShiftRem, %%mm3 \n\t"
3921 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3922 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3923 "movq %%mm2, %%mm6 \n\t"
3924 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3925 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3926 "psllq _ShiftBpp, %%mm6 \n\t"
3927 "movq %%mm7, %%mm5 \n\t"
3928 "psrlq _ShiftRem, %%mm1 \n\t"
3929 "por %%mm6, %%mm3 \n\t"
3930 "psllq _ShiftBpp, %%mm5 \n\t"
3931 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3932 "por %%mm5, %%mm1 \n\t"
3933 // do second set of 4 bytes
3934 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3935 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3936 // pav = p - a = (a + b - c) - a = b - c
3937 "movq %%mm2, %%mm4 \n\t"
3938 // pbv = p - b = (a + b - c) - b = a - c
3939 "movq %%mm1, %%mm5 \n\t"
3940 "psubw %%mm3, %%mm4 \n\t"
3941 "pxor %%mm7, %%mm7 \n\t"
3942 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3943 "movq %%mm4, %%mm6 \n\t"
3944 "psubw %%mm3, %%mm5 \n\t"
3945 // pa = abs(p-a) = abs(pav)
3946 // pb = abs(p-b) = abs(pbv)
3947 // pc = abs(p-c) = abs(pcv)
3948 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3949 "paddw %%mm5, %%mm6 \n\t"
3950 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3951 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3952 "psubw %%mm0, %%mm4 \n\t"
3953 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3954 "psubw %%mm0, %%mm4 \n\t"
3955 "psubw %%mm7, %%mm5 \n\t"
3956 "pxor %%mm0, %%mm0 \n\t"
3957 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3958 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3959 "psubw %%mm7, %%mm5 \n\t"
3960 "psubw %%mm0, %%mm6 \n\t"
3961 // test pa <= pb
3962 "movq %%mm4, %%mm7 \n\t"
3963 "psubw %%mm0, %%mm6 \n\t"
3964 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3965 "movq %%mm7, %%mm0 \n\t"
3966 // use mm7 mask to merge pa & pb
3967 "pand %%mm7, %%mm5 \n\t"
3968 // use mm0 mask copy to merge a & b
3969 "pand %%mm0, %%mm2 \n\t"
3970 "pandn %%mm4, %%mm7 \n\t"
3971 "pandn %%mm1, %%mm0 \n\t"
3972 "paddw %%mm5, %%mm7 \n\t"
3973 "paddw %%mm2, %%mm0 \n\t"
3974 // test ((pa <= pb)? pa:pb) <= pc
3975 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3976 "pxor %%mm1, %%mm1 \n\t"
3977 "pand %%mm7, %%mm3 \n\t"
3978 "pandn %%mm0, %%mm7 \n\t"
3979 "pxor %%mm1, %%mm1 \n\t"
3980 "paddw %%mm3, %%mm7 \n\t"
3981 "pxor %%mm0, %%mm0 \n\t"
3982 // step ecx to next set of 8 bytes and repeat loop til done
3983 "addl $8, %%ecx \n\t"
3984 "packuswb %%mm7, %%mm1 \n\t"
3985 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3986 "cmpl _MMXLength, %%ecx \n\t"
3987 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3988 // mm1 will be used as Raw(x-bpp) next loop
3989 "jb paeth_6lp \n\t"
3990
3991 : "=S" (dummy_value_S), // output regs (dummy)
3992 "=D" (dummy_value_D)
3993
3994 : "0" (prev_row), // esi // input regs
3995 "1" (row) // edi
3996
3997 : "%ecx" // clobber list
3998#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3999 , "%mm0", "%mm1", "%mm2", "%mm3"
4000 , "%mm4", "%mm5", "%mm6", "%mm7"
4001#endif
4002 );
4003 }
4004 break; // end 6 bpp
4005
4006 case 4:
4007 {
4008 _ActiveMask.use = 0x00000000ffffffffLL;
4009
4010 __asm__ __volatile__ (
4011 "movl _dif, %%ecx \n\t"
4012// preload "movl row, %%edi \n\t"
4013// preload "movl prev_row, %%esi \n\t"
4014 "pxor %%mm0, %%mm0 \n\t"
4015 // prime the pump: load the first Raw(x-bpp) data set
4016 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4017 // a=Raw(x-bpp) bytes
4018 "paeth_4lp: \n\t"
4019 // do first set of 4 bytes
4020 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4021 "punpckhbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4022 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4023 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4024 // pav = p - a = (a + b - c) - a = b - c
4025 "movq %%mm2, %%mm4 \n\t"
4026 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4027 // pbv = p - b = (a + b - c) - b = a - c
4028 "movq %%mm1, %%mm5 \n\t"
4029 "psubw %%mm3, %%mm4 \n\t"
4030 "pxor %%mm7, %%mm7 \n\t"
4031 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4032 "movq %%mm4, %%mm6 \n\t"
4033 "psubw %%mm3, %%mm5 \n\t"
4034 // pa = abs(p-a) = abs(pav)
4035 // pb = abs(p-b) = abs(pbv)
4036 // pc = abs(p-c) = abs(pcv)
4037 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4038 "paddw %%mm5, %%mm6 \n\t"
4039 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4040 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4041 "psubw %%mm0, %%mm4 \n\t"
4042 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4043 "psubw %%mm0, %%mm4 \n\t"
4044 "psubw %%mm7, %%mm5 \n\t"
4045 "pxor %%mm0, %%mm0 \n\t"
4046 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4047 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4048 "psubw %%mm7, %%mm5 \n\t"
4049 "psubw %%mm0, %%mm6 \n\t"
4050 // test pa <= pb
4051 "movq %%mm4, %%mm7 \n\t"
4052 "psubw %%mm0, %%mm6 \n\t"
4053 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4054 "movq %%mm7, %%mm0 \n\t"
4055 // use mm7 mask to merge pa & pb
4056 "pand %%mm7, %%mm5 \n\t"
4057 // use mm0 mask copy to merge a & b
4058 "pand %%mm0, %%mm2 \n\t"
4059 "pandn %%mm4, %%mm7 \n\t"
4060 "pandn %%mm1, %%mm0 \n\t"
4061 "paddw %%mm5, %%mm7 \n\t"
4062 "paddw %%mm2, %%mm0 \n\t"
4063 // test ((pa <= pb)? pa:pb) <= pc
4064 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4065 "pxor %%mm1, %%mm1 \n\t"
4066 "pand %%mm7, %%mm3 \n\t"
4067 "pandn %%mm0, %%mm7 \n\t"
4068 "paddw %%mm3, %%mm7 \n\t"
4069 "pxor %%mm0, %%mm0 \n\t"
4070 "packuswb %%mm1, %%mm7 \n\t"
4071 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
4072 "pand _ActiveMask, %%mm7 \n\t"
4073 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4074 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4075 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4076 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4077 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
4078 // do second set of 4 bytes
4079 "punpckhbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4080 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4081 // pav = p - a = (a + b - c) - a = b - c
4082 "movq %%mm2, %%mm4 \n\t"
4083 // pbv = p - b = (a + b - c) - b = a - c
4084 "movq %%mm1, %%mm5 \n\t"
4085 "psubw %%mm3, %%mm4 \n\t"
4086 "pxor %%mm7, %%mm7 \n\t"
4087 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4088 "movq %%mm4, %%mm6 \n\t"
4089 "psubw %%mm3, %%mm5 \n\t"
4090 // pa = abs(p-a) = abs(pav)
4091 // pb = abs(p-b) = abs(pbv)
4092 // pc = abs(p-c) = abs(pcv)
4093 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4094 "paddw %%mm5, %%mm6 \n\t"
4095 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4096 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4097 "psubw %%mm0, %%mm4 \n\t"
4098 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4099 "psubw %%mm0, %%mm4 \n\t"
4100 "psubw %%mm7, %%mm5 \n\t"
4101 "pxor %%mm0, %%mm0 \n\t"
4102 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4103 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4104 "psubw %%mm7, %%mm5 \n\t"
4105 "psubw %%mm0, %%mm6 \n\t"
4106 // test pa <= pb
4107 "movq %%mm4, %%mm7 \n\t"
4108 "psubw %%mm0, %%mm6 \n\t"
4109 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4110 "movq %%mm7, %%mm0 \n\t"
4111 // use mm7 mask to merge pa & pb
4112 "pand %%mm7, %%mm5 \n\t"
4113 // use mm0 mask copy to merge a & b
4114 "pand %%mm0, %%mm2 \n\t"
4115 "pandn %%mm4, %%mm7 \n\t"
4116 "pandn %%mm1, %%mm0 \n\t"
4117 "paddw %%mm5, %%mm7 \n\t"
4118 "paddw %%mm2, %%mm0 \n\t"
4119 // test ((pa <= pb)? pa:pb) <= pc
4120 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4121 "pxor %%mm1, %%mm1 \n\t"
4122 "pand %%mm7, %%mm3 \n\t"
4123 "pandn %%mm0, %%mm7 \n\t"
4124 "pxor %%mm1, %%mm1 \n\t"
4125 "paddw %%mm3, %%mm7 \n\t"
4126 "pxor %%mm0, %%mm0 \n\t"
4127 // step ecx to next set of 8 bytes and repeat loop til done
4128 "addl $8, %%ecx \n\t"
4129 "packuswb %%mm7, %%mm1 \n\t"
4130 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4131 "cmpl _MMXLength, %%ecx \n\t"
4132 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4133 // mm1 will be used as Raw(x-bpp) next loop
4134 "jb paeth_4lp \n\t"
4135
4136 : "=S" (dummy_value_S), // output regs (dummy)
4137 "=D" (dummy_value_D)
4138
4139 : "0" (prev_row), // esi // input regs
4140 "1" (row) // edi
4141
4142 : "%ecx" // clobber list
4143#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4144 , "%mm0", "%mm1", "%mm2", "%mm3"
4145 , "%mm4", "%mm5", "%mm6", "%mm7"
4146#endif
4147 );
4148 }
4149 break; // end 4 bpp
4150
4151 case 8: // bpp == 8
4152 {
4153 _ActiveMask.use = 0x00000000ffffffffLL;
4154
4155 __asm__ __volatile__ (
4156 "movl _dif, %%ecx \n\t"
4157// preload "movl row, %%edi \n\t"
4158// preload "movl prev_row, %%esi \n\t"
4159 "pxor %%mm0, %%mm0 \n\t"
4160 // prime the pump: load the first Raw(x-bpp) data set
4161 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4162 // a=Raw(x-bpp) bytes
4163 "paeth_8lp: \n\t"
4164 // do first set of 4 bytes
4165 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4166 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4167 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4168 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4169 // pav = p - a = (a + b - c) - a = b - c
4170 "movq %%mm2, %%mm4 \n\t"
4171 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4172 // pbv = p - b = (a + b - c) - b = a - c
4173 "movq %%mm1, %%mm5 \n\t"
4174 "psubw %%mm3, %%mm4 \n\t"
4175 "pxor %%mm7, %%mm7 \n\t"
4176 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4177 "movq %%mm4, %%mm6 \n\t"
4178 "psubw %%mm3, %%mm5 \n\t"
4179 // pa = abs(p-a) = abs(pav)
4180 // pb = abs(p-b) = abs(pbv)
4181 // pc = abs(p-c) = abs(pcv)
4182 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4183 "paddw %%mm5, %%mm6 \n\t"
4184 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4185 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4186 "psubw %%mm0, %%mm4 \n\t"
4187 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4188 "psubw %%mm0, %%mm4 \n\t"
4189 "psubw %%mm7, %%mm5 \n\t"
4190 "pxor %%mm0, %%mm0 \n\t"
4191 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4192 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4193 "psubw %%mm7, %%mm5 \n\t"
4194 "psubw %%mm0, %%mm6 \n\t"
4195 // test pa <= pb
4196 "movq %%mm4, %%mm7 \n\t"
4197 "psubw %%mm0, %%mm6 \n\t"
4198 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4199 "movq %%mm7, %%mm0 \n\t"
4200 // use mm7 mask to merge pa & pb
4201 "pand %%mm7, %%mm5 \n\t"
4202 // use mm0 mask copy to merge a & b
4203 "pand %%mm0, %%mm2 \n\t"
4204 "pandn %%mm4, %%mm7 \n\t"
4205 "pandn %%mm1, %%mm0 \n\t"
4206 "paddw %%mm5, %%mm7 \n\t"
4207 "paddw %%mm2, %%mm0 \n\t"
4208 // test ((pa <= pb)? pa:pb) <= pc
4209 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4210 "pxor %%mm1, %%mm1 \n\t"
4211 "pand %%mm7, %%mm3 \n\t"
4212 "pandn %%mm0, %%mm7 \n\t"
4213 "paddw %%mm3, %%mm7 \n\t"
4214 "pxor %%mm0, %%mm0 \n\t"
4215 "packuswb %%mm1, %%mm7 \n\t"
4216 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4217 "pand _ActiveMask, %%mm7 \n\t"
4218 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4219 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4220 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4221 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4222 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4223
4224 // do second set of 4 bytes
4225 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4226 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4227 // pav = p - a = (a + b - c) - a = b - c
4228 "movq %%mm2, %%mm4 \n\t"
4229 // pbv = p - b = (a + b - c) - b = a - c
4230 "movq %%mm1, %%mm5 \n\t"
4231 "psubw %%mm3, %%mm4 \n\t"
4232 "pxor %%mm7, %%mm7 \n\t"
4233 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4234 "movq %%mm4, %%mm6 \n\t"
4235 "psubw %%mm3, %%mm5 \n\t"
4236 // pa = abs(p-a) = abs(pav)
4237 // pb = abs(p-b) = abs(pbv)
4238 // pc = abs(p-c) = abs(pcv)
4239 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4240 "paddw %%mm5, %%mm6 \n\t"
4241 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4242 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4243 "psubw %%mm0, %%mm4 \n\t"
4244 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4245 "psubw %%mm0, %%mm4 \n\t"
4246 "psubw %%mm7, %%mm5 \n\t"
4247 "pxor %%mm0, %%mm0 \n\t"
4248 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4249 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4250 "psubw %%mm7, %%mm5 \n\t"
4251 "psubw %%mm0, %%mm6 \n\t"
4252 // test pa <= pb
4253 "movq %%mm4, %%mm7 \n\t"
4254 "psubw %%mm0, %%mm6 \n\t"
4255 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4256 "movq %%mm7, %%mm0 \n\t"
4257 // use mm7 mask to merge pa & pb
4258 "pand %%mm7, %%mm5 \n\t"
4259 // use mm0 mask copy to merge a & b
4260 "pand %%mm0, %%mm2 \n\t"
4261 "pandn %%mm4, %%mm7 \n\t"
4262 "pandn %%mm1, %%mm0 \n\t"
4263 "paddw %%mm5, %%mm7 \n\t"
4264 "paddw %%mm2, %%mm0 \n\t"
4265 // test ((pa <= pb)? pa:pb) <= pc
4266 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4267 "pxor %%mm1, %%mm1 \n\t"
4268 "pand %%mm7, %%mm3 \n\t"
4269 "pandn %%mm0, %%mm7 \n\t"
4270 "pxor %%mm1, %%mm1 \n\t"
4271 "paddw %%mm3, %%mm7 \n\t"
4272 "pxor %%mm0, %%mm0 \n\t"
4273 // step ecx to next set of 8 bytes and repeat loop til done
4274 "addl $8, %%ecx \n\t"
4275 "packuswb %%mm7, %%mm1 \n\t"
4276 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4277 "cmpl _MMXLength, %%ecx \n\t"
4278 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4279 // mm1 will be used as Raw(x-bpp) next loop
4280 "jb paeth_8lp \n\t"
4281
4282 : "=S" (dummy_value_S), // output regs (dummy)
4283 "=D" (dummy_value_D)
4284
4285 : "0" (prev_row), // esi // input regs
4286 "1" (row) // edi
4287
4288 : "%ecx" // clobber list
4289#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4290 , "%mm0", "%mm1", "%mm2", "%mm3"
4291 , "%mm4", "%mm5", "%mm6", "%mm7"
4292#endif
4293 );
4294 }
4295 break; // end 8 bpp
4296
4297 case 1: // bpp = 1
4298 case 2: // bpp = 2
4299 default: // bpp > 8
4300 {
4301 __asm__ __volatile__ (
4302#ifdef __PIC__
4303 "pushl %%ebx \n\t" // save Global Offset Table index
4304#endif
4305 "movl _dif, %%ebx \n\t"
4306 "cmpl _FullLength, %%ebx \n\t"
4307 "jnb paeth_dend \n\t"
4308
4309// preload "movl row, %%edi \n\t"
4310// preload "movl prev_row, %%esi \n\t"
4311 // do Paeth decode for remaining bytes
4312 "movl %%ebx, %%edx \n\t"
4313// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4314 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4315 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
4316
4317 "paeth_dlp: \n\t"
4318 "xorl %%eax, %%eax \n\t"
4319 // pav = p - a = (a + b - c) - a = b - c
4320 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4321 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4322 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4323 "movl %%eax, _patemp \n\t" // Save pav for later use
4324 "xorl %%eax, %%eax \n\t"
4325 // pbv = p - b = (a + b - c) - b = a - c
4326 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4327 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4328 "movl %%eax, %%ecx \n\t"
4329 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4330 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4331 // pc = abs(pcv)
4332 "testl $0x80000000, %%eax \n\t"
4333 "jz paeth_dpca \n\t"
4334 "negl %%eax \n\t" // reverse sign of neg values
4335
4336 "paeth_dpca: \n\t"
4337 "movl %%eax, _pctemp \n\t" // save pc for later use
4338 // pb = abs(pbv)
4339 "testl $0x80000000, %%ecx \n\t"
4340 "jz paeth_dpba \n\t"
4341 "negl %%ecx \n\t" // reverse sign of neg values
4342
4343 "paeth_dpba: \n\t"
4344 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4345 // pa = abs(pav)
4346 "movl _patemp, %%eax \n\t"
4347 "testl $0x80000000, %%eax \n\t"
4348 "jz paeth_dpaa \n\t"
4349 "negl %%eax \n\t" // reverse sign of neg values
4350
4351 "paeth_dpaa: \n\t"
4352 "movl %%eax, _patemp \n\t" // save pa for later use
4353 // test if pa <= pb
4354 "cmpl %%ecx, %%eax \n\t"
4355 "jna paeth_dabb \n\t"
4356 // pa > pb; now test if pb <= pc
4357 "cmpl _pctemp, %%ecx \n\t"
4358 "jna paeth_dbbc \n\t"
4359 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4360 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4361 "jmp paeth_dpaeth \n\t"
4362
4363 "paeth_dbbc: \n\t"
4364 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4365 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4366 "jmp paeth_dpaeth \n\t"
4367
4368 "paeth_dabb: \n\t"
4369 // pa <= pb; now test if pa <= pc
4370 "cmpl _pctemp, %%eax \n\t"
4371 "jna paeth_dabc \n\t"
4372 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4373 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4374 "jmp paeth_dpaeth \n\t"
4375
4376 "paeth_dabc: \n\t"
4377 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4378 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4379
4380 "paeth_dpaeth: \n\t"
4381 "incl %%ebx \n\t"
4382 "incl %%edx \n\t"
4383 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4384 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4385 "cmpl _FullLength, %%ebx \n\t"
4386 "jb paeth_dlp \n\t"
4387
4388 "paeth_dend: \n\t"
4389#ifdef __PIC__
4390 "popl %%ebx \n\t" // index to Global Offset Table
4391#endif
4392
4393 : "=c" (dummy_value_c), // output regs (dummy)
4394 "=S" (dummy_value_S),
4395 "=D" (dummy_value_D)
4396
4397 : "0" (bpp), // ecx // input regs
4398 "1" (prev_row), // esi
4399 "2" (row) // edi
4400
4401 : "%eax", "%edx" // clobber list
4402#ifndef __PIC__
4403 , "%ebx"
4404#endif
4405 );
4406 }
4407 return; // No need to go further with this one
4408
4409 } // end switch (bpp)
4410
4411 __asm__ __volatile__ (
4412 // MMX acceleration complete; now do clean-up
4413 // check if any remaining bytes left to decode
4414#ifdef __PIC__
4415 "pushl %%ebx \n\t" // save index to Global Offset Table
4416#endif
4417 "movl _MMXLength, %%ebx \n\t"
4418 "cmpl _FullLength, %%ebx \n\t"
4419 "jnb paeth_end \n\t"
4420//pre "movl row, %%edi \n\t"
4421//pre "movl prev_row, %%esi \n\t"
4422 // do Paeth decode for remaining bytes
4423 "movl %%ebx, %%edx \n\t"
4424//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4425 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4426 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4427
4428 "paeth_lp2: \n\t"
4429 "xorl %%eax, %%eax \n\t"
4430 // pav = p - a = (a + b - c) - a = b - c
4431 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4432 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4433 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4434 "movl %%eax, _patemp \n\t" // Save pav for later use
4435 "xorl %%eax, %%eax \n\t"
4436 // pbv = p - b = (a + b - c) - b = a - c
4437 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4438 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4439 "movl %%eax, %%ecx \n\t"
4440 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4441 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4442 // pc = abs(pcv)
4443 "testl $0x80000000, %%eax \n\t"
4444 "jz paeth_pca2 \n\t"
4445 "negl %%eax \n\t" // reverse sign of neg values
4446
4447 "paeth_pca2: \n\t"
4448 "movl %%eax, _pctemp \n\t" // save pc for later use
4449 // pb = abs(pbv)
4450 "testl $0x80000000, %%ecx \n\t"
4451 "jz paeth_pba2 \n\t"
4452 "negl %%ecx \n\t" // reverse sign of neg values
4453
4454 "paeth_pba2: \n\t"
4455 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4456 // pa = abs(pav)
4457 "movl _patemp, %%eax \n\t"
4458 "testl $0x80000000, %%eax \n\t"
4459 "jz paeth_paa2 \n\t"
4460 "negl %%eax \n\t" // reverse sign of neg values
4461
4462 "paeth_paa2: \n\t"
4463 "movl %%eax, _patemp \n\t" // save pa for later use
4464 // test if pa <= pb
4465 "cmpl %%ecx, %%eax \n\t"
4466 "jna paeth_abb2 \n\t"
4467 // pa > pb; now test if pb <= pc
4468 "cmpl _pctemp, %%ecx \n\t"
4469 "jna paeth_bbc2 \n\t"
4470 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4471 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4472 "jmp paeth_paeth2 \n\t"
4473
4474 "paeth_bbc2: \n\t"
4475 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4476 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4477 "jmp paeth_paeth2 \n\t"
4478
4479 "paeth_abb2: \n\t"
4480 // pa <= pb; now test if pa <= pc
4481 "cmpl _pctemp, %%eax \n\t"
4482 "jna paeth_abc2 \n\t"
4483 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4484 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4485 "jmp paeth_paeth2 \n\t"
4486
4487 "paeth_abc2: \n\t"
4488 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4489 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4490
4491 "paeth_paeth2: \n\t"
4492 "incl %%ebx \n\t"
4493 "incl %%edx \n\t"
4494 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4495 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4496 "cmpl _FullLength, %%ebx \n\t"
4497 "jb paeth_lp2 \n\t"
4498
4499 "paeth_end: \n\t"
4500 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
4501#ifdef __PIC__
4502 "popl %%ebx \n\t" // restore index to Global Offset Table
4503#endif
4504
4505 : "=c" (dummy_value_c), // output regs (dummy)
4506 "=S" (dummy_value_S),
4507 "=D" (dummy_value_D)
4508
4509 : "0" (bpp), // ecx // input regs
4510 "1" (prev_row), // esi
4511 "2" (row) // edi
4512
4513 : "%eax", "%edx" // clobber list (no input regs!)
4514#ifndef __PIC__
4515 , "%ebx"
4516#endif
4517 );
4518
4519} /* end png_read_filter_row_mmx_paeth() */
4520#endif
4521
4522
4523
4524
4525#ifdef PNG_THREAD_UNSAFE_OK
4526//===========================================================================//
4527// //
4528// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4529// //
4530//===========================================================================//
4531
4532// Optimized code for PNG Sub filter decoder
4533
4534static void /* PRIVATE */
4535png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4536{
4537 int bpp;
4538 int dummy_value_a;
4539 int dummy_value_D;
4540
4541 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4542 _FullLength = row_info->rowbytes - bpp; // number of bytes to filter
4543
4544 __asm__ __volatile__ (
4545//pre "movl row, %%edi \n\t"
4546 "movl %%edi, %%esi \n\t" // lp = row
4547//pre "movl bpp, %%eax \n\t"
4548 "addl %%eax, %%edi \n\t" // rp = row + bpp
4549//irr "xorl %%eax, %%eax \n\t"
4550 // get # of bytes to alignment
4551 "movl %%edi, _dif \n\t" // take start of row
4552 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4553 // alignment boundary
4554 "xorl %%ecx, %%ecx \n\t"
4555 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4556 "subl %%edi, _dif \n\t" // subtract from start ==> value
4557 "jz sub_go \n\t" // ecx at alignment
4558
4559 "sub_lp1: \n\t" // fix alignment
4560 "movb (%%esi,%%ecx,), %%al \n\t"
4561 "addb %%al, (%%edi,%%ecx,) \n\t"
4562 "incl %%ecx \n\t"
4563 "cmpl _dif, %%ecx \n\t"
4564 "jb sub_lp1 \n\t"
4565
4566 "sub_go: \n\t"
4567 "movl _FullLength, %%eax \n\t"
4568 "movl %%eax, %%edx \n\t"
4569 "subl %%ecx, %%edx \n\t" // subtract alignment fix
4570 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4571 "subl %%edx, %%eax \n\t" // drop over bytes from length
4572 "movl %%eax, _MMXLength \n\t"
4573
4574 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4575 "=D" (dummy_value_D) // 1
4576
4577 : "0" (bpp), // eax // input regs
4578 "1" (row) // edi
4579
4580 : "%esi", "%ecx", "%edx" // clobber list
4581
4582#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4583 , "%mm0", "%mm1", "%mm2", "%mm3"
4584 , "%mm4", "%mm5", "%mm6", "%mm7"
4585#endif
4586 );
4587
4588 // now do the math for the rest of the row
4589 switch (bpp)
4590 {
4591 case 3:
4592 {
4593 _ActiveMask.use = 0x0000ffffff000000LL;
4594 _ShiftBpp.use = 24; // == 3 * 8
4595 _ShiftRem.use = 40; // == 64 - 24
4596
4597 __asm__ __volatile__ (
4598// preload "movl row, %%edi \n\t"
4599 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4600 // active byte group
4601 "movl %%edi, %%esi \n\t" // lp = row
4602// preload "movl bpp, %%eax \n\t"
4603 "addl %%eax, %%edi \n\t" // rp = row + bpp
4604 "movq %%mm7, %%mm6 \n\t"
4605 "movl _dif, %%edx \n\t"
4606 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4607 // 3rd active byte group
4608 // prime the pump: load the first Raw(x-bpp) data set
4609 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4610
4611 "sub_3lp: \n\t" // shift data for adding first
4612 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4613 // shift clears inactive bytes)
4614 // add 1st active group
4615 "movq (%%edi,%%edx,), %%mm0 \n\t"
4616 "paddb %%mm1, %%mm0 \n\t"
4617
4618 // add 2nd active group
4619 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4620 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4621 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4622 "paddb %%mm1, %%mm0 \n\t"
4623
4624 // add 3rd active group
4625 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4626 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4627 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4628 "addl $8, %%edx \n\t"
4629 "paddb %%mm1, %%mm0 \n\t"
4630
4631 "cmpl _MMXLength, %%edx \n\t"
4632 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4633 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4634 "jb sub_3lp \n\t"
4635
4636 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4637 "=D" (dummy_value_D) // 1
4638
4639 : "0" (bpp), // eax // input regs
4640 "1" (row) // edi
4641
4642 : "%edx", "%esi" // clobber list
4643#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4644 , "%mm0", "%mm1", "%mm6", "%mm7"
4645#endif
4646 );
4647 }
4648 break;
4649
4650 case 1:
4651 {
4652 __asm__ __volatile__ (
4653 "movl _dif, %%edx \n\t"
4654// preload "movl row, %%edi \n\t"
4655 "cmpl _FullLength, %%edx \n\t"
4656 "jnb sub_1end \n\t"
4657 "movl %%edi, %%esi \n\t" // lp = row
4658 "xorl %%eax, %%eax \n\t"
4659// preload "movl bpp, %%eax \n\t"
4660 "addl %%eax, %%edi \n\t" // rp = row + bpp
4661
4662 "sub_1lp: \n\t"
4663 "movb (%%esi,%%edx,), %%al \n\t"
4664 "addb %%al, (%%edi,%%edx,) \n\t"
4665 "incl %%edx \n\t"
4666 "cmpl _FullLength, %%edx \n\t"
4667 "jb sub_1lp \n\t"
4668
4669 "sub_1end: \n\t"
4670
4671 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4672 "=D" (dummy_value_D) // 1
4673
4674 : "0" (bpp), // eax // input regs
4675 "1" (row) // edi
4676
4677 : "%edx", "%esi" // clobber list
4678 );
4679 }
4680 return;
4681
4682 case 6:
4683 case 4:
4684 //case 7: // GRR BOGUS
4685 //case 5: // GRR BOGUS
4686 {
4687 _ShiftBpp.use = bpp << 3;
4688 _ShiftRem.use = 64 - _ShiftBpp.use;
4689
4690 __asm__ __volatile__ (
4691// preload "movl row, %%edi \n\t"
4692 "movl _dif, %%edx \n\t"
4693 "movl %%edi, %%esi \n\t" // lp = row
4694// preload "movl bpp, %%eax \n\t"
4695 "addl %%eax, %%edi \n\t" // rp = row + bpp
4696
4697 // prime the pump: load the first Raw(x-bpp) data set
4698 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4699
4700 "sub_4lp: \n\t" // shift data for adding first
4701 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4702 // shift clears inactive bytes)
4703 "movq (%%edi,%%edx,), %%mm0 \n\t"
4704 "paddb %%mm1, %%mm0 \n\t"
4705
4706 // add 2nd active group
4707 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4708 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4709 "addl $8, %%edx \n\t"
4710 "paddb %%mm1, %%mm0 \n\t"
4711
4712 "cmpl _MMXLength, %%edx \n\t"
4713 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4714 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4715 "jb sub_4lp \n\t"
4716
4717 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4718 "=D" (dummy_value_D) // 1
4719
4720 : "0" (bpp), // eax // input regs
4721 "1" (row) // edi
4722
4723 : "%edx", "%esi" // clobber list
4724#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4725 , "%mm0", "%mm1"
4726#endif
4727 );
4728 }
4729 break;
4730
4731 case 2:
4732 {
4733 _ActiveMask.use = 0x00000000ffff0000LL;
4734 _ShiftBpp.use = 16; // == 2 * 8
4735 _ShiftRem.use = 48; // == 64 - 16
4736
4737 __asm__ __volatile__ (
4738 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4739 // active byte group
4740 "movl _dif, %%edx \n\t"
4741 "movq %%mm7, %%mm6 \n\t"
4742// preload "movl row, %%edi \n\t"
4743 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4744 // 3rd active byte group
4745 "movl %%edi, %%esi \n\t" // lp = row
4746 "movq %%mm6, %%mm5 \n\t"
4747// preload "movl bpp, %%eax \n\t"
4748 "addl %%eax, %%edi \n\t" // rp = row + bpp
4749 "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4750 // 4th active byte group
4751 // prime the pump: load the first Raw(x-bpp) data set
4752 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4753
4754 "sub_2lp: \n\t" // shift data for adding first
4755 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4756 // shift clears inactive bytes)
4757 // add 1st active group
4758 "movq (%%edi,%%edx,), %%mm0 \n\t"
4759 "paddb %%mm1, %%mm0 \n\t"
4760
4761 // add 2nd active group
4762 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4763 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4764 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4765 "paddb %%mm1, %%mm0 \n\t"
4766
4767 // add 3rd active group
4768 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4769 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4770 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4771 "paddb %%mm1, %%mm0 \n\t"
4772
4773 // add 4th active group
4774 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4775 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4776 "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4777 "addl $8, %%edx \n\t"
4778 "paddb %%mm1, %%mm0 \n\t"
4779 "cmpl _MMXLength, %%edx \n\t"
4780 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4781 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4782 "jb sub_2lp \n\t"
4783
4784 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4785 "=D" (dummy_value_D) // 1
4786
4787 : "0" (bpp), // eax // input regs
4788 "1" (row) // edi
4789
4790 : "%edx", "%esi" // clobber list
4791#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4792 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4793#endif
4794 );
4795 }
4796 break;
4797
4798 case 8:
4799 {
4800 __asm__ __volatile__ (
4801// preload "movl row, %%edi \n\t"
4802 "movl _dif, %%edx \n\t"
4803 "movl %%edi, %%esi \n\t" // lp = row
4804// preload "movl bpp, %%eax \n\t"
4805 "addl %%eax, %%edi \n\t" // rp = row + bpp
4806 "movl _MMXLength, %%ecx \n\t"
4807
4808 // prime the pump: load the first Raw(x-bpp) data set
4809 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4810 "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4811
4812 "sub_8lp: \n\t"
4813 "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4814 "paddb %%mm7, %%mm0 \n\t"
4815 "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4816 "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4817
4818 // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4819 // This will be repeated for each group of 8 bytes with the 8th
4820 // group being used as the Raw(x-bpp) for the 1st group of the
4821 // next loop.
4822
4823 "paddb %%mm0, %%mm1 \n\t"
4824 "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4825 "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4826 "paddb %%mm1, %%mm2 \n\t"
4827 "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4828 "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4829 "paddb %%mm2, %%mm3 \n\t"
4830 "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4831 "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4832 "paddb %%mm3, %%mm4 \n\t"
4833 "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4834 "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4835 "paddb %%mm4, %%mm5 \n\t"
4836 "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4837 "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4838 "paddb %%mm5, %%mm6 \n\t"
4839 "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4840 "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4841 "addl $64, %%edx \n\t"
4842 "paddb %%mm6, %%mm7 \n\t"
4843 "cmpl %%ecx, %%edx \n\t"
4844 "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4845 "jb sub_8lp \n\t"
4846
4847 "cmpl _MMXLength, %%edx \n\t"
4848 "jnb sub_8lt8 \n\t"
4849
4850 "sub_8lpA: \n\t"
4851 "movq (%%edi,%%edx,), %%mm0 \n\t"
4852 "addl $8, %%edx \n\t"
4853 "paddb %%mm7, %%mm0 \n\t"
4854 "cmpl _MMXLength, %%edx \n\t"
4855 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4856 "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4857 // to mm1 to be new Raw(x-bpp)
4858 // for next loop
4859 "jb sub_8lpA \n\t"
4860
4861 "sub_8lt8: \n\t"
4862
4863 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4864 "=D" (dummy_value_D) // 1
4865
4866 : "0" (bpp), // eax // input regs
4867 "1" (row) // edi
4868
4869 : "%ecx", "%edx", "%esi" // clobber list
4870#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4871 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4872#endif
4873 );
4874 }
4875 break;
4876
4877 default: // bpp greater than 8 bytes GRR BOGUS
4878 {
4879 __asm__ __volatile__ (
4880 "movl _dif, %%edx \n\t"
4881// preload "movl row, %%edi \n\t"
4882 "movl %%edi, %%esi \n\t" // lp = row
4883// preload "movl bpp, %%eax \n\t"
4884 "addl %%eax, %%edi \n\t" // rp = row + bpp
4885
4886 "sub_Alp: \n\t"
4887 "movq (%%edi,%%edx,), %%mm0 \n\t"
4888 "movq (%%esi,%%edx,), %%mm1 \n\t"
4889 "addl $8, %%edx \n\t"
4890 "paddb %%mm1, %%mm0 \n\t"
4891 "cmpl _MMXLength, %%edx \n\t"
4892 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4893 // -8 to offset addl edx
4894 "jb sub_Alp \n\t"
4895
4896 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4897 "=D" (dummy_value_D) // 1
4898
4899 : "0" (bpp), // eax // input regs
4900 "1" (row) // edi
4901
4902 : "%edx", "%esi" // clobber list
4903#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4904 , "%mm0", "%mm1"
4905#endif
4906 );
4907 }
4908 break;
4909
4910 } // end switch (bpp)
4911
4912 __asm__ __volatile__ (
4913 "movl _MMXLength, %%edx \n\t"
4914//pre "movl row, %%edi \n\t"
4915 "cmpl _FullLength, %%edx \n\t"
4916 "jnb sub_end \n\t"
4917
4918 "movl %%edi, %%esi \n\t" // lp = row
4919//pre "movl bpp, %%eax \n\t"
4920 "addl %%eax, %%edi \n\t" // rp = row + bpp
4921 "xorl %%eax, %%eax \n\t"
4922
4923 "sub_lp2: \n\t"
4924 "movb (%%esi,%%edx,), %%al \n\t"
4925 "addb %%al, (%%edi,%%edx,) \n\t"
4926 "incl %%edx \n\t"
4927 "cmpl _FullLength, %%edx \n\t"
4928 "jb sub_lp2 \n\t"
4929
4930 "sub_end: \n\t"
4931 "EMMS \n\t" // end MMX instructions
4932
4933 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4934 "=D" (dummy_value_D) // 1
4935
4936 : "0" (bpp), // eax // input regs
4937 "1" (row) // edi
4938
4939 : "%edx", "%esi" // clobber list
4940 );
4941
4942} // end of png_read_filter_row_mmx_sub()
4943#endif
4944
4945
4946
4947
4948//===========================================================================//
4949// //
4950// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4951// //
4952//===========================================================================//
4953
4954// Optimized code for PNG Up filter decoder
4955
// png_read_filter_row_mmx_up:
//    Undo the PNG "Up" filter for one row, in place, using MMX:
//       Raw(x) = Up(x) + Prior(x)   (byte arithmetic, implicitly mod 256),
//    i.e. row[i] += prev_row[i] for every byte of the row.
//
//    Structure of the asm:
//      1. byte loop (up_lp1) until %edi (row) reaches 8-byte alignment;
//      2. unrolled MMX loop (up_loop) handling 64 bytes per iteration;
//      3. 8-byte MMX cleanup loop (up_lpA) for remaining multiples of 8;
//      4. byte loop (up_lp2) for the final 0-7 bytes;
//      5. EMMS to hand the FP register stack back to the FPU.
//    NOTE(review): alignment is computed from `row` only, so the movq
//    loads from `prev_row` may be unaligned (legal for movq, just slower).
//    Under __PIC__, %ebx holds the GOT pointer, so it is saved/restored
//    with push/pop instead of being listed in the clobber list.
static void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
                           png_bytep prev_row)
{
   png_uint_32 len;
   int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
   int dummy_value_S;
   int dummy_value_D;

   len = row_info->rowbytes;              // number of bytes to filter

   __asm__ __volatile__ (
//pre "movl row, %%edi              \n\t"
      // get # of bytes to alignment
#ifdef __PIC__
      "pushl %%ebx                 \n\t"
#endif
      "movl %%edi, %%ecx           \n\t"
      "xorl %%ebx, %%ebx           \n\t"   // ebx is the running byte offset
      "addl $0x7, %%ecx            \n\t"
      "xorl %%eax, %%eax           \n\t"
      "andl $0xfffffff8, %%ecx     \n\t"   // ecx = row rounded up to 8-byte bound
//pre "movl prev_row, %%esi         \n\t"
      "subl %%edi, %%ecx           \n\t"   // ecx = bytes needed for alignment
      "jz up_go                    \n\t"

   "up_lp1:                        \n\t"   // fix alignment
      "movb (%%edi,%%ebx,), %%al   \n\t"
      "addb (%%esi,%%ebx,), %%al   \n\t"
      "incl %%ebx                  \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t"   // mov does not affect flags; -1 to
      "jb up_lp1                   \n\t"   // offset incl ebx

   "up_go:                         \n\t"
//pre "movl len, %%edx              \n\t"
      "movl %%edx, %%ecx           \n\t"
      "subl %%ebx, %%edx           \n\t"   // subtract alignment fix
      "andl $0x0000003f, %%edx     \n\t"   // calc bytes over mult of 64
      "subl %%edx, %%ecx           \n\t"   // drop over bytes from length

      // unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
   "up_loop:                       \n\t"
      "movq (%%esi,%%ebx,), %%mm1  \n\t"
      "movq (%%edi,%%ebx,), %%mm0  \n\t"
      "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0          \n\t"
      "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, (%%edi,%%ebx,)  \n\t"
      "paddb %%mm3, %%mm2          \n\t"
      "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
      "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4          \n\t"
      "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
      "paddb %%mm7, %%mm6          \n\t"
      "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
      "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
      "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
      "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0          \n\t"
      "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
      "paddb %%mm3, %%mm2          \n\t"
      "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
      "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4          \n\t"
      "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
      "addl $64, %%ebx             \n\t"
      "paddb %%mm7, %%mm6          \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movq %%mm6, -8(%%edi,%%ebx,) \n\t"  // (+56)movq does not affect flags;
      "jb up_loop                  \n\t"   // -8 to offset addl ebx

      "cmpl $0, %%edx              \n\t"   // test for bytes over mult of 64
      "jz up_end                   \n\t"

      "cmpl $8, %%edx              \n\t"   // test for less than 8 bytes
      "jb up_lt8                   \n\t"   //  [added by lcreeve at netins.net]

      "addl %%edx, %%ecx           \n\t"
      "andl $0x00000007, %%edx     \n\t"   // calc bytes over mult of 8
      "subl %%edx, %%ecx           \n\t"   // drop over bytes from length
      "jz up_lt8                   \n\t"

   "up_lpA:                        \n\t"   // use MMX regs to update 8 bytes sim.
      "movq (%%esi,%%ebx,), %%mm1  \n\t"
      "movq (%%edi,%%ebx,), %%mm0  \n\t"
      "addl $8, %%ebx              \n\t"
      "paddb %%mm1, %%mm0          \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movq %%mm0, -8(%%edi,%%ebx,) \n\t"  // movq does not affect flags; -8 to
      "jb up_lpA                   \n\t"   // offset add ebx
      "cmpl $0, %%edx              \n\t"   // test for bytes over mult of 8
      "jz up_end                   \n\t"

   "up_lt8:                        \n\t"
      "xorl %%eax, %%eax           \n\t"
      "addl %%edx, %%ecx           \n\t"   // move over byte count into counter

   "up_lp2:                        \n\t"   // use x86 regs for remaining bytes
      "movb (%%edi,%%ebx,), %%al   \n\t"
      "addb (%%esi,%%ebx,), %%al   \n\t"
      "incl %%ebx                  \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t"   // mov does not affect flags; -1 to
      "jb up_lp2                   \n\t"   // offset inc ebx

   "up_end:                        \n\t"
      "EMMS                        \n\t"   // conversion of filtered row complete
#ifdef __PIC__
      "popl %%ebx                  \n\t"
#endif

      : "=d" (dummy_value_d),   // 0      // output regs (dummy)
        "=S" (dummy_value_S),   // 1
        "=D" (dummy_value_D)    // 2

      : "0" (len),              // edx    // input regs
        "1" (prev_row),         // esi
        "2" (row)               // edi

      : "%eax", "%ecx"            // clobber list (no input regs!)
#ifndef __PIC__
      , "%ebx"
#endif

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
      , "%mm0", "%mm1", "%mm2", "%mm3"
      , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
   );

} // end of png_read_filter_row_mmx_up()
5096
5097#endif /* PNG_MMX_CODE_SUPPORTED */
5098
5099
5100
5101
5102/*===========================================================================*/
5103/* */
5104/* P N G _ R E A D _ F I L T E R _ R O W */
5105/* */
5106/*===========================================================================*/
5107
5108
5109/* Optimized png_read_filter_row routines */
5110
/* png_read_filter_row:
 *    Undo the row filter (None/Sub/Up/Avg/Paeth, per the PNG spec) on one
 *    decoded row, in place.  `row` points at the filtered row data,
 *    `prev_row` at the already-reconstructed previous row, and `filter` is
 *    the filter-type byte that preceded the row.  When
 *    PNG_MMX_CODE_SUPPORTED is defined (and, for non-1.0.x builds, the
 *    per-filter bit in png_ptr->asm_flags is set and the pixel-depth /
 *    rowbytes thresholds are met), the work is dispatched to the MMX
 *    implementations in this file; otherwise the portable C loops below
 *    are used.  An unrecognized filter value produces a warning and
 *    zeroes the first row byte.
 */
void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];   /* longest value written is "Paeth-MMX" (9 chars + NUL) */
#endif

#if defined(PNG_MMX_CODE_SUPPORTED)
/* GRR:  these are superseded by png_ptr->asm_flags: */
#define UseMMX_sub    1   // GRR:  converted 20000730
#define UseMMX_up     1   // GRR:  converted 20000729
#define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
#define UseMMX_paeth  1   // GRR:  converted 20000828

   if (_mmx_supported == 2) {
       /* this should have happened in png_init_mmx_flags() already */
#if !defined(PNG_1_0_X)
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }
#endif /* PNG_MMX_CODE_SUPPORTED */

#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
   /* build a short "filter-impl" tag purely for the debug trace below */
   switch (filter)
   {
      case 0: sprintf(filnm, "none");
         break;
      case 1: sprintf(filnm, "sub-%s",
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
#endif
#endif
"x86");
         break;
      case 2: sprintf(filnm, "up-%s",
#ifdef PNG_MMX_CODE_SUPPORTED
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
#endif
#endif
 "x86");
         break;
      case 3: sprintf(filnm, "avg-%s",
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
#endif
#endif
 "x86");
         break;
      case 4: sprintf(filnm, "Paeth-%s",
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
#endif
#endif
"x86");
         break;
      default: sprintf(filnm, "unknw");
         break;
   }
   png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
#endif /* PNG_DEBUG */

   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            /* C fallback:  Raw(x) = Sub(x) + Raw(x-bpp), mod 256.
             * The first bpp bytes are already raw (no byte to their left). */
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_sub */
         break;

      case PNG_FILTER_VALUE_UP:
#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            /* C fallback:  Raw(x) = Up(x) + Prior(x), mod 256. */
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_up */
         break;

      case PNG_FILTER_VALUE_AVG:
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            /* C fallback:  Raw(x) = Avg(x) + floor((Raw(x-bpp)+Prior(x))/2),
             * mod 256.  For the first bpp bytes Raw(x-bpp) is taken as 0,
             * so only Prior(x)/2 is added. */
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_avg */
         break;

      case PNG_FILTER_VALUE_PAETH:
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            /* C fallback:  Raw(x) = Paeth(x) + PaethPredictor(a, b, c),
             * mod 256, where a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp).
             * For the first bpp bytes a and c are 0, so the predictor
             * reduces to b (= Prior(x)). */
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;
               pc = a - c;

#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif

               /*
                  if (pa <= pb && pa <= pc)
                     p = a;
                  else if (pb <= pc)
                     p = b;
                  else
                     p = c;
                */

               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         } /* end !UseMMX_paeth */
         break;

      default:
         png_warning(png_ptr, "Ignoring bad row-filter type");
         *row=0;
         break;
   }
}
5356
5357#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
5358
5359
5360/*===========================================================================*/
5361/* */
5362/* P N G _ M M X _ S U P P O R T */
5363/* */
5364/*===========================================================================*/
5365
5366/* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5367 * (2) all instructions compile with gcc 2.7.2.3 and later
5368 * (3) the function is moved down here to prevent gcc from
5369 * inlining it in multiple places and then barfing be-
5370 * cause the ".NOT_SUPPORTED" label is multiply defined
5371 * [is there a way to signal that a *single* function should
5372 * not be inlined? is there a way to modify the label for
5373 * each inlined instance, e.g., by appending _1, _2, etc.?
5374 * maybe if don't use leading "." in label name? (nope...sigh)]
5375 */
5376
/* png_mmx_support:
 *    Runtime probe for MMX capability on x86.  First toggles the ID bit
 *    (bit 21) of EFLAGS to determine whether the CPUID instruction exists
 *    at all; if so, executes CPUID function 1 and tests the MMX feature
 *    bit (EDX bit 23).  Stores 1 or 0 into the file-global _mmx_supported
 *    and returns that value.  When PNG_MMX_CODE_SUPPORTED is not defined,
 *    simply records and returns 0.  %ebx/%ecx/%edx are saved and restored
 *    manually around CPUID rather than listed as clobbers (see trailing
 *    commented-out clobber list).
 */
int PNGAPI
png_mmx_support(void)
{
#if defined(PNG_MMX_CODE_SUPPORTED)
    int result;
    __asm__ __volatile__ (
        "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
        "pushl %%ecx          \n\t"  // so does ecx...
        "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
//      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
//      "pushf                \n\t"  // 16-bit pushf
        "pushfl               \n\t"  // save Eflag to stack
        "popl %%eax           \n\t"  // get Eflag from stack into eax
        "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
        "pushl %%eax          \n\t"  // save modified Eflag back to stack
//      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
//      "popf                 \n\t"  // 16-bit popf
        "popfl                \n\t"  // restore modified value to Eflag reg
        "pushfl               \n\t"  // save Eflag to stack
        "popl %%eax           \n\t"  // get Eflag from stack
        "pushl %%ecx          \n\t"  // save original Eflag to stack
        "popfl                \n\t"  // restore original Eflag
        "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
        "jz 0f                \n\t"  // if same, CPUID instr. is not supported

        "xorl %%eax, %%eax    \n\t"  // set eax to zero
//      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
        "cpuid                \n\t"  // get the CPU identification info
        "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
        "jl 0f                \n\t"  // if eax is zero, MMX is not supported

        "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
        "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
                                     // faster than the instruction "mov eax, 1"
        "cpuid                \n\t"  // get the CPU identification info again
        "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
        "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
        "jz 0f                \n\t"  // if MMX bit clear, MMX is not supported

        "movl $1, %%eax       \n\t"  // set return value to 1
        "jmp  1f              \n\t"  // DONE:  have MMX support

    "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
        "movl $0, %%eax       \n\t"  // set return value to 0
    "1:                       \n\t"  // .RETURN: target label for jump instructions
        "popl %%edx           \n\t"  // restore edx
        "popl %%ecx           \n\t"  // restore ecx
        "popl %%ebx           \n\t"  // restore ebx

//      "ret                  \n\t"  // DONE:  no MMX support
                                     // (fall through to standard C "ret")

        : "=a" (result)              // output list

        :                            // any variables used on input (none)

                                     // no clobber list
//      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
//      , "memory"                   // if write to a variable gcc thought was in a reg
//      , "cc"                       // "condition codes" (flag bits)
    );
    _mmx_supported = result;
#else
    _mmx_supported = 0;
#endif /* PNG_MMX_CODE_SUPPORTED */

    return _mmx_supported;
}
5446
5447
5448#endif /* PNG_USE_PNGGCCRD */