blob: 7dbbf532a92a8dd0804c93e0dd4bceb8031ccb08 [file] [log] [blame]
/*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3 *
4 * This code is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License version 2 only, as
6 * published by the Free Software Foundation. Sun designates this
7 * particular file as subject to the "Classpath" exception as provided
8 * by Sun in the LICENSE file that accompanied this code.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 */
24
25/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
26 *
27 * This file is available under and governed by the GNU General Public
28 * License version 2 only, as published by the Free Software Foundation.
29 * However, the following notice accompanied the original version of this
30 * file and, per its terms, should not be removed:
31 *
32 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
33 *
34 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
35 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
36 * for Intel's performance analysis of the MMX vs. non-MMX code.
37 *
38 * Last changed in libpng 1.2.15 January 5, 2007
39 * For conditions of distribution and use, see copyright notice in png.h
40 * Copyright (c) 1998-2007 Glenn Randers-Pehrson
41 * Copyright (c) 1998, Intel Corporation
42 *
43 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
44 * Interface to libpng contributed by Gilles Vollant, 1999.
45 * GNU C port by Greg Roelofs, 1999-2001.
46 *
47 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
48 *
49 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
50 *
51 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
52 *
53 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
54 * is required to assemble the newer MMX instructions such as movq.
55 * For djgpp, see
56 *
57 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
58 *
59 * (or a later version in the same directory). For Linux, check your
60 * distribution's web site(s) or try these links:
61 *
62 * http://rufus.w3.org/linux/RPM/binutils.html
63 * http://www.debian.org/Packages/stable/devel/binutils.html
64 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
65 * binutils.tgz
66 *
67 * For other platforms, see the main GNU site:
68 *
69 * ftp://ftp.gnu.org/pub/gnu/binutils/
70 *
71 * Version 2.5.2l.15 is definitely too old...
72 */
73
74/*
75 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
76 * =====================================
77 *
78 * 19991006:
79 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
80 *
81 * 19991007:
82 * - additional optimizations (possible or definite):
83 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
84 * - write MMX code for 48-bit case (pixel_bytes == 6)
85 * - figure out what's up with 24-bit case (pixel_bytes == 3):
86 * why subtract 8 from width_mmx in the pass 4/5 case?
87 * (only width_mmx case) (near line 1606)
88 * x [DONE] replace pixel_bytes within each block with the true
89 * constant value (or are compilers smart enough to do that?)
90 * - rewrite all MMX interlacing code so it's aligned with
91 * the *beginning* of the row buffer, not the end. This
92 * would not only allow one to eliminate half of the memory
93 * writes for odd passes (that is, pass == odd), it may also
94 * eliminate some unaligned-data-access exceptions (assuming
95 * there's a penalty for not aligning 64-bit accesses on
96 * 64-bit boundaries). The only catch is that the "leftover"
97 * pixel(s) at the end of the row would have to be saved,
98 * but there are enough unused MMX registers in every case,
99 * so this is not a problem. A further benefit is that the
100 * post-MMX cleanup code (C code) in at least some of the
101 * cases could be done within the assembler block.
102 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
103 * inconsistent, and don't match the MMX Programmer's Reference
104 * Manual conventions anyway. They should be changed to
105 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
106 * was lowest in memory (e.g., corresponding to a left pixel)
107 * and b7 is the byte that was highest (e.g., a right pixel).
108 *
109 * 19991016:
110 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
111 * want globals prefixed by underscores when referencing them--
112 * i.e., if the variable is const4, then refer to it as const4,
113 * not _const4. This seems to be a djgpp-specific requirement.
114 * Also, such variables apparently *must* be declared outside
115 * of functions; neither static nor automatic variables work if
116 * defined within the scope of a single function, but both
117 * static and truly global (multi-module) variables work fine.
118 *
119 * 19991023:
120 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
121 * - switched from string-concatenation-with-macros to cleaner method of
122 * renaming global variables for djgpp--i.e., always use prefixes in
123 * inlined assembler code (== strings) and conditionally rename the
124 * variables, not the other way around. Hence _const4, _mask8_0, etc.
125 *
126 * 19991024:
127 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
128 * This one was severely weird: even though mmxsupport() doesn't touch
129 * ebx (where "row" pointer was stored), it nevertheless managed to zero
130 * the register (even in static/non-fPIC code--see below), which in turn
131 * caused png_do_read_interlace() to return prematurely on the first row of
132 * interlaced images (i.e., without expanding the interlaced pixels).
133 * Inspection of the generated assembly code didn't turn up any clues,
134 * although it did point at a minor optimization (i.e., get rid of
135 * mmx_supported_local variable and just use eax). Possibly the CPUID
136 * instruction is more destructive than it looks? (Not yet checked.)
137 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
138 * listings... Apparently register spillage has to do with ebx, since
139 * it's used to index the global offset table. Commenting it out of the
140 * input-reg lists in png_combine_row() eliminated compiler barfage, so
141 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
142 *
143 * 19991107:
144 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
145 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
146 *
147 * 19991120:
148 * - made "diff" variable (now "_dif") global to simplify conversion of
149 * filtering routines (running out of regs, sigh). "diff" is still used
150 * in interlacing routines, however.
151 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
152 * macro determines which is used); original not yet tested.
153 *
154 * 20000213:
155 * - when compiling with gcc, be sure to use -fomit-frame-pointer
156 *
157 * 20000319:
158 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
159 * pass == 4 or 5, that caused visible corruption of interlaced images
160 *
161 * 20000623:
162 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
163 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
164 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
165 * Chuck Wilson supplied a patch involving dummy output registers. See
166 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
167 * for the original (anonymous) SourceForge bug report.
168 *
169 * 20000706:
170 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
171 * pnggccrd.c: In function `png_combine_row':
172 * pnggccrd.c:525: more than 10 operands in `asm'
173 * pnggccrd.c:669: more than 10 operands in `asm'
174 * pnggccrd.c:828: more than 10 operands in `asm'
175 * pnggccrd.c:994: more than 10 operands in `asm'
176 * pnggccrd.c:1177: more than 10 operands in `asm'
177 * They are all the same problem and can be worked around by using the
178 * global _unmask variable unconditionally, not just in the -fPIC case.
179 * Reportedly earlier versions of gcc also have the problem with more than
180 * 10 operands; they just don't report it. Much strangeness ensues, etc.
181 *
182 * 20000729:
183 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
184 * MMX routine); began converting png_read_filter_row_mmx_sub()
185 * - to finish remaining sections:
186 * - clean up indentation and comments
187 * - preload local variables
188 * - add output and input regs (order of former determines numerical
189 * mapping of latter)
190 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
191 * - remove "$" from addressing of Shift and Mask variables [20000823]
192 *
193 * 20000731:
194 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
195 *
196 * 20000822:
197 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
198 * shared-library (-fPIC) version! Code works just fine as part of static
199 * library. Damn damn damn damn damn, should have tested that sooner.
200 * ebx is getting clobbered again (explicitly this time); need to save it
201 * on stack or rewrite asm code to avoid using it altogether. Blargh!
202 *
203 * 20000823:
204 * - first section was trickiest; all remaining sections have ebx -> edx now.
205 * (-fPIC works again.) Also added missing underscores to various Shift*
206 * and *Mask* globals and got rid of leading "$" signs.
207 *
208 * 20000826:
209 * - added visual separators to help navigate microscopic printed copies
210 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
211 * on png_read_filter_row_mmx_avg()
212 *
213 * 20000828:
214 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
215 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
216 * cleaned up/shortened in either routine, but functionality is complete
217 * and seems to be working fine.
218 *
219 * 20000829:
220 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
221 * as an input reg (with dummy output variables, etc.), then it *cannot*
222 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
223 * is simple enough...
224 *
225 * 20000914:
226 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
227 * correctly (but 48-bit RGB just fine)
228 *
229 * 20000916:
230 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
231 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
232 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
233 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
234 *
235 * 20010101:
236 * - added new png_init_mmx_flags() function (here only because it needs to
237 * call mmxsupport(), which should probably become global png_mmxsupport());
238 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
239 *
240 * 20010103:
241 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
242 * and made it public; moved png_init_mmx_flags() to png.c as internal func
243 *
244 * 20010104:
245 * - removed dependency on png_read_filter_row_c() (C code already duplicated
246 * within MMX version of png_read_filter_row()) so no longer necessary to
247 * compile it into pngrutil.o
248 *
249 * 20010310:
250 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
251 *
252 * 20020304:
253 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
254 *
255 * 20040724:
256 * - more tinkering with clobber list at lines 4529 and 5033, to get
257 * it to compile on gcc-3.4.
258 *
259 * STILL TO DO:
260 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
261 * - write MMX code for 48-bit case (pixel_bytes == 6)
262 * - figure out what's up with 24-bit case (pixel_bytes == 3):
263 * why subtract 8 from width_mmx in the pass 4/5 case?
264 * (only width_mmx case) (near line 1606)
265 * - rewrite all MMX interlacing code so it's aligned with beginning
266 * of the row buffer, not the end (see 19991007 for details)
267 * x pick one version of mmxsupport() and get rid of the other
268 * - add error messages to any remaining bogus default cases
269 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
270 * x add support for runtime enable/disable/query of various MMX routines
271 */
272
273#define PNG_INTERNAL
274#include "png.h"
275
276#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
277
278int PNGAPI png_mmx_support(void);
279
280#ifdef PNG_USE_LOCAL_ARRAYS
281const static int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
282const static int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
283const static int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
284#endif
285
286#if defined(PNG_MMX_CODE_SUPPORTED)
287/* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
288 * so define them without: */
289#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
290 defined(__OS2__)
291# define _mmx_supported mmx_supported
292# define _const4 const4
293# define _const6 const6
294# define _mask8_0 mask8_0
295# define _mask16_1 mask16_1
296# define _mask16_0 mask16_0
297# define _mask24_2 mask24_2
298# define _mask24_1 mask24_1
299# define _mask24_0 mask24_0
300# define _mask32_3 mask32_3
301# define _mask32_2 mask32_2
302# define _mask32_1 mask32_1
303# define _mask32_0 mask32_0
304# define _mask48_5 mask48_5
305# define _mask48_4 mask48_4
306# define _mask48_3 mask48_3
307# define _mask48_2 mask48_2
308# define _mask48_1 mask48_1
309# define _mask48_0 mask48_0
310# define _LBCarryMask LBCarryMask
311# define _HBClearMask HBClearMask
312# define _ActiveMask ActiveMask
313# define _ActiveMask2 ActiveMask2
314# define _ActiveMaskEnd ActiveMaskEnd
315# define _ShiftBpp ShiftBpp
316# define _ShiftRem ShiftRem
317#ifdef PNG_THREAD_UNSAFE_OK
318# define _unmask unmask
319# define _FullLength FullLength
320# define _MMXLength MMXLength
321# define _dif dif
322# define _patemp patemp
323# define _pbtemp pbtemp
324# define _pctemp pctemp
325#endif
326#endif
327
328
329/* These constants are used in the inlined MMX assembly code.
330 Ignore gcc's "At top level: defined but not used" warnings. */
331
332/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
333 * since that case uses the %ebx register for indexing the Global Offset Table
334 * and there were no other registers available. But gcc 2.95 and later emit
335 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
336 * in the non-PIC case, so we'll just use the global unconditionally now.
337 */
#ifdef PNG_THREAD_UNSAFE_OK
/* Holds ~mask for the current png_combine_row() call.  Kept as a global
 * (rather than an asm input operand) so the inline assembly can address it
 * by name; see the GRR 20000706 note above for why this is unconditional.
 * Being a shared static, it makes the MMX combine path thread-unsafe. */
static int _unmask;
#endif
341
/* Bit-selection masks referenced by name from the inline MMX assembly in
 * png_combine_row().  Each 64-bit constant spreads the 8 bits of the row
 * mask across the bytes of one 8-byte group, so pand/pcmpeqb can turn the
 * per-pixel mask bits into per-byte keep/skip masks for 8-, 16-, 24-,
 * 32-, and 48-bit pixels.
 * NOTE: `static const` replaces the obsolescent `const static` ordering
 * (C99 6.11.5 deprecates storage-class specifiers after other specifiers). */
static const unsigned long long _mask8_0 = 0x0102040810204080LL;

static const unsigned long long _mask16_1 = 0x0101020204040808LL;
static const unsigned long long _mask16_0 = 0x1010202040408080LL;

static const unsigned long long _mask24_2 = 0x0101010202020404LL;
static const unsigned long long _mask24_1 = 0x0408080810101020LL;
static const unsigned long long _mask24_0 = 0x2020404040808080LL;

static const unsigned long long _mask32_3 = 0x0101010102020202LL;
static const unsigned long long _mask32_2 = 0x0404040408080808LL;
static const unsigned long long _mask32_1 = 0x1010101020202020LL;
static const unsigned long long _mask32_0 = 0x4040404080808080LL;

static const unsigned long long _mask48_5 = 0x0101010101010202LL;
static const unsigned long long _mask48_4 = 0x0202020204040404LL;
static const unsigned long long _mask48_3 = 0x0404080808080808LL;
static const unsigned long long _mask48_2 = 0x1010101010102020LL;
static const unsigned long long _mask48_1 = 0x2020202040404040LL;
static const unsigned long long _mask48_0 = 0x4040808080808080LL;

static const unsigned long long _const4 = 0x0000000000FFFFFFLL;
//static const unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
static const unsigned long long _const6 = 0x00000000000000FFLL;
366
367// These are used in the row-filter routines and should/would be local
368// variables if not for gcc addressing limitations.
369// WARNING: Their presence probably defeats the thread safety of libpng.
370
#ifdef PNG_THREAD_UNSAFE_OK
/* Scratch storage shared with the inline-asm row-filter routines.  Per the
 * note above, these would be locals if not for gcc asm-operand/addressing
 * limits; as shared statics they defeat thread safety of the MMX paths. */
static png_uint_32 _FullLength;   // total bytes in the row being filtered
static png_uint_32 _MMXLength;    // bytes handled by the MMX main loop
static int _dif;                  // leftover-byte count (see 19991120 note)
static int _patemp; // temp variables for Paeth routine
static int _pbtemp;
static int _pctemp;
#endif
379
380void /* PRIVATE */
381png_squelch_warnings(void)
382{
383#ifdef PNG_THREAD_UNSAFE_OK
384 _dif = _dif;
385 _patemp = _patemp;
386 _pbtemp = _pbtemp;
387 _pctemp = _pctemp;
388 _MMXLength = _MMXLength;
389#endif
390 _const4 = _const4;
391 _const6 = _const6;
392 _mask8_0 = _mask8_0;
393 _mask16_1 = _mask16_1;
394 _mask16_0 = _mask16_0;
395 _mask24_2 = _mask24_2;
396 _mask24_1 = _mask24_1;
397 _mask24_0 = _mask24_0;
398 _mask32_3 = _mask32_3;
399 _mask32_2 = _mask32_2;
400 _mask32_1 = _mask32_1;
401 _mask32_0 = _mask32_0;
402 _mask48_5 = _mask48_5;
403 _mask48_4 = _mask48_4;
404 _mask48_3 = _mask48_3;
405 _mask48_2 = _mask48_2;
406 _mask48_1 = _mask48_1;
407 _mask48_0 = _mask48_0;
408}
409#endif /* PNG_MMX_CODE_SUPPORTED */
410
411
/* MMX-availability flag: 2 = "not yet determined" sentinel; callers such as
 * png_combine_row() test for 2 and invoke png_mmx_support() to resolve it. */
static int _mmx_supported = 2;
413
414/*===========================================================================*/
415/* */
416/* P N G _ C O M B I N E _ R O W */
417/* */
418/*===========================================================================*/
419
420#if defined(PNG_HAVE_MMX_COMBINE_ROW)
421
422#define BPP2 2
423#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
424#define BPP4 4
425#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
426#define BPP8 8
427
428/* Combines the row recently read in with the previous row.
429 This routine takes care of alpha and transparency if requested.
430 This routine also handles the two methods of progressive display
431 of interlaced images, depending on the mask value.
432 The mask value describes which pixels are to be combined with
433 the row. The pattern always repeats every 8 pixels, so just 8
434 bits are needed. A one indicates the pixel is to be combined; a
435 zero indicates the pixel is to be skipped. This is in addition
436 to any alpha or transparency value associated with the pixel.
437 If you want all pixels to be combined, pass 0xff (255) in mask. */
438
439/* Use this routine for the x86 platform - it uses a faster MMX routine
440 if the machine supports MMX. */
441
442void /* PRIVATE */
443png_combine_row(png_structp png_ptr, png_bytep row, int mask)
444{
445 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
446
447#if defined(PNG_MMX_CODE_SUPPORTED)
448 if (_mmx_supported == 2) {
449#if !defined(PNG_1_0_X)
450 /* this should have happened in png_init_mmx_flags() already */
451 png_warning(png_ptr, "asm_flags may not have been initialized");
452#endif
453 png_mmx_support();
454 }
455#endif
456
457 if (mask == 0xff)
458 {
459 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
460 png_memcpy(row, png_ptr->row_buf + 1,
461 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
462 }
463 else /* (png_combine_row() is never called with mask == 0) */
464 {
465 switch (png_ptr->row_info.pixel_depth)
466 {
467 case 1: /* png_ptr->row_info.pixel_depth */
468 {
469 png_bytep sp;
470 png_bytep dp;
471 int s_inc, s_start, s_end;
472 int m;
473 int shift;
474 png_uint_32 i;
475
476 sp = png_ptr->row_buf + 1;
477 dp = row;
478 m = 0x80;
479#if defined(PNG_READ_PACKSWAP_SUPPORTED)
480 if (png_ptr->transformations & PNG_PACKSWAP)
481 {
482 s_start = 0;
483 s_end = 7;
484 s_inc = 1;
485 }
486 else
487#endif
488 {
489 s_start = 7;
490 s_end = 0;
491 s_inc = -1;
492 }
493
494 shift = s_start;
495
496 for (i = 0; i < png_ptr->width; i++)
497 {
498 if (m & mask)
499 {
500 int value;
501
502 value = (*sp >> shift) & 0x1;
503 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
504 *dp |= (png_byte)(value << shift);
505 }
506
507 if (shift == s_end)
508 {
509 shift = s_start;
510 sp++;
511 dp++;
512 }
513 else
514 shift += s_inc;
515
516 if (m == 1)
517 m = 0x80;
518 else
519 m >>= 1;
520 }
521 break;
522 }
523
524 case 2: /* png_ptr->row_info.pixel_depth */
525 {
526 png_bytep sp;
527 png_bytep dp;
528 int s_start, s_end, s_inc;
529 int m;
530 int shift;
531 png_uint_32 i;
532 int value;
533
534 sp = png_ptr->row_buf + 1;
535 dp = row;
536 m = 0x80;
537#if defined(PNG_READ_PACKSWAP_SUPPORTED)
538 if (png_ptr->transformations & PNG_PACKSWAP)
539 {
540 s_start = 0;
541 s_end = 6;
542 s_inc = 2;
543 }
544 else
545#endif
546 {
547 s_start = 6;
548 s_end = 0;
549 s_inc = -2;
550 }
551
552 shift = s_start;
553
554 for (i = 0; i < png_ptr->width; i++)
555 {
556 if (m & mask)
557 {
558 value = (*sp >> shift) & 0x3;
559 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
560 *dp |= (png_byte)(value << shift);
561 }
562
563 if (shift == s_end)
564 {
565 shift = s_start;
566 sp++;
567 dp++;
568 }
569 else
570 shift += s_inc;
571 if (m == 1)
572 m = 0x80;
573 else
574 m >>= 1;
575 }
576 break;
577 }
578
579 case 4: /* png_ptr->row_info.pixel_depth */
580 {
581 png_bytep sp;
582 png_bytep dp;
583 int s_start, s_end, s_inc;
584 int m;
585 int shift;
586 png_uint_32 i;
587 int value;
588
589 sp = png_ptr->row_buf + 1;
590 dp = row;
591 m = 0x80;
592#if defined(PNG_READ_PACKSWAP_SUPPORTED)
593 if (png_ptr->transformations & PNG_PACKSWAP)
594 {
595 s_start = 0;
596 s_end = 4;
597 s_inc = 4;
598 }
599 else
600#endif
601 {
602 s_start = 4;
603 s_end = 0;
604 s_inc = -4;
605 }
606 shift = s_start;
607
608 for (i = 0; i < png_ptr->width; i++)
609 {
610 if (m & mask)
611 {
612 value = (*sp >> shift) & 0xf;
613 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
614 *dp |= (png_byte)(value << shift);
615 }
616
617 if (shift == s_end)
618 {
619 shift = s_start;
620 sp++;
621 dp++;
622 }
623 else
624 shift += s_inc;
625 if (m == 1)
626 m = 0x80;
627 else
628 m >>= 1;
629 }
630 break;
631 }
632
633 case 8: /* png_ptr->row_info.pixel_depth */
634 {
635 png_bytep srcptr;
636 png_bytep dstptr;
637
638#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
639#if !defined(PNG_1_0_X)
640 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
641 /* && _mmx_supported */ )
642#else
643 if (_mmx_supported)
644#endif
645 {
646 png_uint_32 len;
647 int diff;
648 int dummy_value_a; // fix 'forbidden register spilled' error
649 int dummy_value_d;
650 int dummy_value_c;
651 int dummy_value_S;
652 int dummy_value_D;
653 _unmask = ~mask; // global variable for -fPIC version
654 srcptr = png_ptr->row_buf + 1;
655 dstptr = row;
656 len = png_ptr->width &~7; // reduce to multiple of 8
657 diff = (int) (png_ptr->width & 7); // amount lost
658
659 __asm__ __volatile__ (
660 "movd _unmask, %%mm7 \n\t" // load bit pattern
661 "psubb %%mm6, %%mm6 \n\t" // zero mm6
662 "punpcklbw %%mm7, %%mm7 \n\t"
663 "punpcklwd %%mm7, %%mm7 \n\t"
664 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
665
666 "movq _mask8_0, %%mm0 \n\t"
667 "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
668 "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
669
670// preload "movl len, %%ecx \n\t" // load length of line
671// preload "movl srcptr, %%esi \n\t" // load source
672// preload "movl dstptr, %%edi \n\t" // load dest
673
674 "cmpl $0, %%ecx \n\t" // len == 0 ?
675 "je mainloop8end \n\t"
676
677 "mainloop8: \n\t"
678 "movq (%%esi), %%mm4 \n\t" // *srcptr
679 "pand %%mm0, %%mm4 \n\t"
680 "movq %%mm0, %%mm6 \n\t"
681 "pandn (%%edi), %%mm6 \n\t" // *dstptr
682 "por %%mm6, %%mm4 \n\t"
683 "movq %%mm4, (%%edi) \n\t"
684 "addl $8, %%esi \n\t" // inc by 8 bytes processed
685 "addl $8, %%edi \n\t"
686 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
687 "ja mainloop8 \n\t"
688
689 "mainloop8end: \n\t"
690// preload "movl diff, %%ecx \n\t" // (diff is in eax)
691 "movl %%eax, %%ecx \n\t"
692 "cmpl $0, %%ecx \n\t"
693 "jz end8 \n\t"
694// preload "movl mask, %%edx \n\t"
695 "sall $24, %%edx \n\t" // make low byte, high byte
696
697 "secondloop8: \n\t"
698 "sall %%edx \n\t" // move high bit to CF
699 "jnc skip8 \n\t" // if CF = 0
700 "movb (%%esi), %%al \n\t"
701 "movb %%al, (%%edi) \n\t"
702
703 "skip8: \n\t"
704 "incl %%esi \n\t"
705 "incl %%edi \n\t"
706 "decl %%ecx \n\t"
707 "jnz secondloop8 \n\t"
708
709 "end8: \n\t"
710 "EMMS \n\t" // DONE
711
712 : "=a" (dummy_value_a), // output regs (dummy)
713 "=d" (dummy_value_d),
714 "=c" (dummy_value_c),
715 "=S" (dummy_value_S),
716 "=D" (dummy_value_D)
717
718 : "3" (srcptr), // esi // input regs
719 "4" (dstptr), // edi
720 "0" (diff), // eax
721// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
722 "2" (len), // ecx
723 "1" (mask) // edx
724
725#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
726 : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
727#endif
728 );
729 }
730 else /* mmx _not supported - Use modified C routine */
731#endif /* PNG_MMX_CODE_SUPPORTED */
732 {
733 register png_uint_32 i;
734 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
735 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
736 register int stride = png_pass_inc[png_ptr->pass];
737 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
738 register int rep_bytes = png_pass_width[png_ptr->pass];
739 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
740 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
741 int diff = (int) (png_ptr->width & 7); /* amount lost */
742 register png_uint_32 final_val = len; /* GRR bugfix */
743
744 srcptr = png_ptr->row_buf + 1 + initial_val;
745 dstptr = row + initial_val;
746
747 for (i = initial_val; i < final_val; i += stride)
748 {
749 png_memcpy(dstptr, srcptr, rep_bytes);
750 srcptr += stride;
751 dstptr += stride;
752 }
753 if (diff) /* number of leftover pixels: 3 for pngtest */
754 {
755 final_val+=diff /* *BPP1 */ ;
756 for (; i < final_val; i += stride)
757 {
758 if (rep_bytes > (int)(final_val-i))
759 rep_bytes = (int)(final_val-i);
760 png_memcpy(dstptr, srcptr, rep_bytes);
761 srcptr += stride;
762 dstptr += stride;
763 }
764 }
765
766 } /* end of else (_mmx_supported) */
767
768 break;
769 } /* end 8 bpp */
770
771 case 16: /* png_ptr->row_info.pixel_depth */
772 {
773 png_bytep srcptr;
774 png_bytep dstptr;
775
776#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
777#if !defined(PNG_1_0_X)
778 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
779 /* && _mmx_supported */ )
780#else
781 if (_mmx_supported)
782#endif
783 {
784 png_uint_32 len;
785 int diff;
786 int dummy_value_a; // fix 'forbidden register spilled' error
787 int dummy_value_d;
788 int dummy_value_c;
789 int dummy_value_S;
790 int dummy_value_D;
791 _unmask = ~mask; // global variable for -fPIC version
792 srcptr = png_ptr->row_buf + 1;
793 dstptr = row;
794 len = png_ptr->width &~7; // reduce to multiple of 8
795 diff = (int) (png_ptr->width & 7); // amount lost //
796
797 __asm__ __volatile__ (
798 "movd _unmask, %%mm7 \n\t" // load bit pattern
799 "psubb %%mm6, %%mm6 \n\t" // zero mm6
800 "punpcklbw %%mm7, %%mm7 \n\t"
801 "punpcklwd %%mm7, %%mm7 \n\t"
802 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
803
804 "movq _mask16_0, %%mm0 \n\t"
805 "movq _mask16_1, %%mm1 \n\t"
806
807 "pand %%mm7, %%mm0 \n\t"
808 "pand %%mm7, %%mm1 \n\t"
809
810 "pcmpeqb %%mm6, %%mm0 \n\t"
811 "pcmpeqb %%mm6, %%mm1 \n\t"
812
813// preload "movl len, %%ecx \n\t" // load length of line
814// preload "movl srcptr, %%esi \n\t" // load source
815// preload "movl dstptr, %%edi \n\t" // load dest
816
817 "cmpl $0, %%ecx \n\t"
818 "jz mainloop16end \n\t"
819
820 "mainloop16: \n\t"
821 "movq (%%esi), %%mm4 \n\t"
822 "pand %%mm0, %%mm4 \n\t"
823 "movq %%mm0, %%mm6 \n\t"
824 "movq (%%edi), %%mm7 \n\t"
825 "pandn %%mm7, %%mm6 \n\t"
826 "por %%mm6, %%mm4 \n\t"
827 "movq %%mm4, (%%edi) \n\t"
828
829 "movq 8(%%esi), %%mm5 \n\t"
830 "pand %%mm1, %%mm5 \n\t"
831 "movq %%mm1, %%mm7 \n\t"
832 "movq 8(%%edi), %%mm6 \n\t"
833 "pandn %%mm6, %%mm7 \n\t"
834 "por %%mm7, %%mm5 \n\t"
835 "movq %%mm5, 8(%%edi) \n\t"
836
837 "addl $16, %%esi \n\t" // inc by 16 bytes processed
838 "addl $16, %%edi \n\t"
839 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
840 "ja mainloop16 \n\t"
841
842 "mainloop16end: \n\t"
843// preload "movl diff, %%ecx \n\t" // (diff is in eax)
844 "movl %%eax, %%ecx \n\t"
845 "cmpl $0, %%ecx \n\t"
846 "jz end16 \n\t"
847// preload "movl mask, %%edx \n\t"
848 "sall $24, %%edx \n\t" // make low byte, high byte
849
850 "secondloop16: \n\t"
851 "sall %%edx \n\t" // move high bit to CF
852 "jnc skip16 \n\t" // if CF = 0
853 "movw (%%esi), %%ax \n\t"
854 "movw %%ax, (%%edi) \n\t"
855
856 "skip16: \n\t"
857 "addl $2, %%esi \n\t"
858 "addl $2, %%edi \n\t"
859 "decl %%ecx \n\t"
860 "jnz secondloop16 \n\t"
861
862 "end16: \n\t"
863 "EMMS \n\t" // DONE
864
865 : "=a" (dummy_value_a), // output regs (dummy)
866 "=c" (dummy_value_c),
867 "=d" (dummy_value_d),
868 "=S" (dummy_value_S),
869 "=D" (dummy_value_D)
870
871 : "0" (diff), // eax // input regs
872// was (unmask) " " RESERVED // ebx // Global Offset Table idx
873 "1" (len), // ecx
874 "2" (mask), // edx
875 "3" (srcptr), // esi
876 "4" (dstptr) // edi
877
878#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
879 : "%mm0", "%mm1", "%mm4" // clobber list
880 , "%mm5", "%mm6", "%mm7"
881#endif
882 );
883 }
884 else /* mmx _not supported - Use modified C routine */
885#endif /* PNG_MMX_CODE_SUPPORTED */
886 {
887 register png_uint_32 i;
888 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
889 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
890 register int stride = BPP2 * png_pass_inc[png_ptr->pass];
891 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
892 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
893 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
894 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
895 int diff = (int) (png_ptr->width & 7); /* amount lost */
896 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
897
898 srcptr = png_ptr->row_buf + 1 + initial_val;
899 dstptr = row + initial_val;
900
901 for (i = initial_val; i < final_val; i += stride)
902 {
903 png_memcpy(dstptr, srcptr, rep_bytes);
904 srcptr += stride;
905 dstptr += stride;
906 }
907 if (diff) /* number of leftover pixels: 3 for pngtest */
908 {
909 final_val+=diff*BPP2;
910 for (; i < final_val; i += stride)
911 {
912 if (rep_bytes > (int)(final_val-i))
913 rep_bytes = (int)(final_val-i);
914 png_memcpy(dstptr, srcptr, rep_bytes);
915 srcptr += stride;
916 dstptr += stride;
917 }
918 }
919 } /* end of else (_mmx_supported) */
920
921 break;
922 } /* end 16 bpp */
923
      /* 24 bpp (e.g. 8-bit RGB): merge the pixels selected by the current
       * interlace-pass mask from png_ptr->row_buf into the destination row.
       * The MMX path processes 8 pixels (24 bytes) per iteration using three
       * precomputed 8-byte masks (_mask24_0.._mask24_2); the C fallback walks
       * the row with the png_pass_* tables.
       */
      case 24:       /* png_ptr->row_info.pixel_depth */
      {
         png_bytep srcptr;
         png_bytep dstptr;

#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         /* runtime flag set up by png_init_mmx_flags(); implies _mmx_supported */
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
             /* && _mmx_supported */ )
#else
         if (_mmx_supported)
#endif
         {
            png_uint_32 len;
            int diff;
            /* dummies bind eax/edx/ecx/esi/edi as asm outputs so gcc does not
             * try to spill them ("forbidden register spilled" workaround) */
            int dummy_value_a;   // fix 'forbidden register spilled' error
            int dummy_value_d;
            int dummy_value_c;
            int dummy_value_S;
            int dummy_value_D;
            _unmask = ~mask;            // global variable for -fPIC version
            srcptr = png_ptr->row_buf + 1;
            dstptr = row;
            len  = png_ptr->width &~7;  // reduce to multiple of 8
            diff = (int) (png_ptr->width & 7); // amount lost //

            __asm__ __volatile__ (
               /* broadcast the 8-bit pass mask into all 8 bytes of mm7 */
               "movd      _unmask, %%mm7       \n\t" // load bit pattern
               "psubb     %%mm6, %%mm6         \n\t" // zero mm6
               "punpcklbw %%mm7, %%mm7         \n\t"
               "punpcklwd %%mm7, %%mm7         \n\t"
               "punpckldq %%mm7, %%mm7         \n\t" // fill reg with 8 masks

               /* derive three byte-granular select masks, one per 8-byte
                * third of the 24-byte / 8-pixel group */
               "movq      _mask24_0, %%mm0     \n\t"
               "movq      _mask24_1, %%mm1     \n\t"
               "movq      _mask24_2, %%mm2     \n\t"

               "pand      %%mm7, %%mm0         \n\t"
               "pand      %%mm7, %%mm1         \n\t"
               "pand      %%mm7, %%mm2         \n\t"

               /* after pcmpeqb: 0x00 where the source pixel is taken,
                * 0xff where the destination byte is preserved */
               "pcmpeqb   %%mm6, %%mm0         \n\t"
               "pcmpeqb   %%mm6, %%mm1         \n\t"
               "pcmpeqb   %%mm6, %%mm2         \n\t"

// preload     "movl      len, %%ecx           \n\t" // load length of line
// preload     "movl      srcptr, %%esi        \n\t" // load source
// preload     "movl      dstptr, %%edi        \n\t" // load dest

               "cmpl      $0, %%ecx            \n\t"
               "jz        mainloop24end        \n\t"

            /* 8 pixels (24 bytes) per iteration: src AND mask, dst AND NOT
             * mask, OR together, store */
            "mainloop24:                        \n\t"
               "movq      (%%esi), %%mm4       \n\t"
               "pand      %%mm0, %%mm4         \n\t"
               "movq      %%mm0, %%mm6         \n\t"
               "movq      (%%edi), %%mm7       \n\t"
               "pandn     %%mm7, %%mm6         \n\t"
               "por       %%mm6, %%mm4         \n\t"
               "movq      %%mm4, (%%edi)       \n\t"

               "movq      8(%%esi), %%mm5      \n\t"
               "pand      %%mm1, %%mm5         \n\t"
               "movq      %%mm1, %%mm7         \n\t"
               "movq      8(%%edi), %%mm6      \n\t"
               "pandn     %%mm6, %%mm7         \n\t"
               "por       %%mm7, %%mm5         \n\t"
               "movq      %%mm5, 8(%%edi)      \n\t"

               "movq      16(%%esi), %%mm6     \n\t"
               "pand      %%mm2, %%mm6         \n\t"
               "movq      %%mm2, %%mm4         \n\t"
               "movq      16(%%edi), %%mm7     \n\t"
               "pandn     %%mm7, %%mm4         \n\t"
               "por       %%mm4, %%mm6         \n\t"
               "movq      %%mm6, 16(%%edi)     \n\t"

               "addl      $24, %%esi           \n\t" // inc by 24 bytes processed
               "addl      $24, %%edi           \n\t"
               "subl      $8, %%ecx            \n\t" // dec by 8 pixels processed

               "ja        mainloop24           \n\t"

            /* leftover (width % 8) pixels: walk the mask bit-by-bit through
             * CF and copy 3 bytes per selected pixel */
            "mainloop24end:                     \n\t"
// preload     "movl      diff, %%ecx          \n\t" // (diff is in eax)
               "movl      %%eax, %%ecx         \n\t"
               "cmpl      $0, %%ecx            \n\t"
               "jz        end24                \n\t"
// preload     "movl      mask, %%edx          \n\t"
               "sall      $24, %%edx           \n\t" // make low byte, high byte

            "secondloop24:                      \n\t"
               "sall      %%edx                \n\t" // move high bit to CF
               "jnc       skip24               \n\t" // if CF = 0
               "movw      (%%esi), %%ax        \n\t"
               "movw      %%ax, (%%edi)        \n\t"
               "xorl      %%eax, %%eax         \n\t"
               "movb      2(%%esi), %%al       \n\t"
               "movb      %%al, 2(%%edi)       \n\t"

            "skip24:                            \n\t"
               "addl      $3, %%esi            \n\t"
               "addl      $3, %%edi            \n\t"
               "decl      %%ecx                \n\t"
               "jnz       secondloop24         \n\t"

            "end24:                             \n\t"
               "EMMS                           \n\t" // DONE

               : "=a" (dummy_value_a),           // output regs (dummy)
                 "=d" (dummy_value_d),
                 "=c" (dummy_value_c),
                 "=S" (dummy_value_S),
                 "=D" (dummy_value_D)

               : "3" (srcptr),      // esi       // input regs
                 "4" (dstptr),      // edi
                 "0" (diff),        // eax
// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
                 "2" (len),         // ecx
                 "1" (mask)         // edx

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
               : "%mm0", "%mm1", "%mm2"          // clobber list
               , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
            );
         }
         else /* mmx _not supported - Use modified C routine */
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            register png_uint_32 i;
            /* byte offset of the first pixel belonging to this pass */
            png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
            /* byte distance between consecutive pixels of this pass */
            register int stride = BPP3 * png_pass_inc[png_ptr->pass];
              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
            /* bytes actually copied at each step */
            register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
            int diff = (int) (png_ptr->width & 7); /* amount lost */
            register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */

            srcptr = png_ptr->row_buf + 1 + initial_val;
            dstptr = row + initial_val;

            for (i = initial_val; i < final_val; i += stride)
            {
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
            if (diff)  /* number of leftover pixels:  3 for pngtest */
            {
               final_val+=diff*BPP3;
               for (; i < final_val; i += stride)
               {
                  /* clamp the last copy so it cannot run past row end */
                  if (rep_bytes > (int)(final_val-i))
                     rep_bytes = (int)(final_val-i);
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
            }
         } /* end of else (_mmx_supported) */

         break;
      }       /* end 24 bpp */
1091
      /* 32 bpp (e.g. 8-bit RGBA): merge masked pixels of the current
       * interlace pass from png_ptr->row_buf into the destination row.
       * MMX path: 8 pixels (32 bytes) per iteration with four precomputed
       * masks (_mask32_0.._mask32_3); C fallback uses the png_pass_* tables.
       */
      case 32:       /* png_ptr->row_info.pixel_depth */
      {
         png_bytep srcptr;
         png_bytep dstptr;

#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         /* runtime flag set up by png_init_mmx_flags(); implies _mmx_supported */
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
             /* && _mmx_supported */ )
#else
         if (_mmx_supported)
#endif
         {
            png_uint_32 len;
            int diff;
            /* dummies bind eax/edx/ecx/esi/edi as asm outputs so gcc does not
             * try to spill them ("forbidden register spilled" workaround) */
            int dummy_value_a;   // fix 'forbidden register spilled' error
            int dummy_value_d;
            int dummy_value_c;
            int dummy_value_S;
            int dummy_value_D;
            _unmask = ~mask;            // global variable for -fPIC version
            srcptr = png_ptr->row_buf + 1;
            dstptr = row;
            len  = png_ptr->width &~7;  // reduce to multiple of 8
            diff = (int) (png_ptr->width & 7); // amount lost //

            __asm__ __volatile__ (
               /* broadcast the 8-bit pass mask into all 8 bytes of mm7 */
               "movd      _unmask, %%mm7       \n\t" // load bit pattern
               "psubb     %%mm6, %%mm6         \n\t" // zero mm6
               "punpcklbw %%mm7, %%mm7         \n\t"
               "punpcklwd %%mm7, %%mm7         \n\t"
               "punpckldq %%mm7, %%mm7         \n\t" // fill reg with 8 masks

               /* four byte-granular select masks, one per 8-byte quarter of
                * the 32-byte / 8-pixel group */
               "movq      _mask32_0, %%mm0     \n\t"
               "movq      _mask32_1, %%mm1     \n\t"
               "movq      _mask32_2, %%mm2     \n\t"
               "movq      _mask32_3, %%mm3     \n\t"

               "pand      %%mm7, %%mm0         \n\t"
               "pand      %%mm7, %%mm1         \n\t"
               "pand      %%mm7, %%mm2         \n\t"
               "pand      %%mm7, %%mm3         \n\t"

               /* 0x00 = take source byte, 0xff = keep destination byte */
               "pcmpeqb   %%mm6, %%mm0         \n\t"
               "pcmpeqb   %%mm6, %%mm1         \n\t"
               "pcmpeqb   %%mm6, %%mm2         \n\t"
               "pcmpeqb   %%mm6, %%mm3         \n\t"

// preload     "movl      len, %%ecx           \n\t" // load length of line
// preload     "movl      srcptr, %%esi        \n\t" // load source
// preload     "movl      dstptr, %%edi        \n\t" // load dest

               "cmpl      $0, %%ecx            \n\t" // lcr
               "jz        mainloop32end        \n\t"

            /* 8 pixels (32 bytes) per iteration: (src AND mask) OR
             * (dst AND NOT mask) for each 8-byte quarter */
            "mainloop32:                        \n\t"
               "movq      (%%esi), %%mm4       \n\t"
               "pand      %%mm0, %%mm4         \n\t"
               "movq      %%mm0, %%mm6         \n\t"
               "movq      (%%edi), %%mm7       \n\t"
               "pandn     %%mm7, %%mm6         \n\t"
               "por       %%mm6, %%mm4         \n\t"
               "movq      %%mm4, (%%edi)       \n\t"

               "movq      8(%%esi), %%mm5      \n\t"
               "pand      %%mm1, %%mm5         \n\t"
               "movq      %%mm1, %%mm7         \n\t"
               "movq      8(%%edi), %%mm6      \n\t"
               "pandn     %%mm6, %%mm7         \n\t"
               "por       %%mm7, %%mm5         \n\t"
               "movq      %%mm5, 8(%%edi)      \n\t"

               "movq      16(%%esi), %%mm6     \n\t"
               "pand      %%mm2, %%mm6         \n\t"
               "movq      %%mm2, %%mm4         \n\t"
               "movq      16(%%edi), %%mm7     \n\t"
               "pandn     %%mm7, %%mm4         \n\t"
               "por       %%mm4, %%mm6         \n\t"
               "movq      %%mm6, 16(%%edi)     \n\t"

               "movq      24(%%esi), %%mm7     \n\t"
               "pand      %%mm3, %%mm7         \n\t"
               "movq      %%mm3, %%mm5         \n\t"
               "movq      24(%%edi), %%mm4     \n\t"
               "pandn     %%mm4, %%mm5         \n\t"
               "por       %%mm5, %%mm7         \n\t"
               "movq      %%mm7, 24(%%edi)     \n\t"

               "addl      $32, %%esi           \n\t" // inc by 32 bytes processed
               "addl      $32, %%edi           \n\t"
               "subl      $8, %%ecx            \n\t" // dec by 8 pixels processed
               "ja        mainloop32           \n\t"

            /* leftover (width % 8) pixels: shift mask bit into CF, copy one
             * 4-byte pixel per set bit */
            "mainloop32end:                     \n\t"
// preload     "movl      diff, %%ecx          \n\t" // (diff is in eax)
               "movl      %%eax, %%ecx         \n\t"
               "cmpl      $0, %%ecx            \n\t"
               "jz        end32                \n\t"
// preload     "movl      mask, %%edx          \n\t"
               "sall      $24, %%edx           \n\t" // low byte => high byte

            "secondloop32:                      \n\t"
               "sall      %%edx                \n\t" // move high bit to CF
               "jnc       skip32               \n\t" // if CF = 0
               "movl      (%%esi), %%eax       \n\t"
               "movl      %%eax, (%%edi)       \n\t"

            "skip32:                            \n\t"
               "addl      $4, %%esi            \n\t"
               "addl      $4, %%edi            \n\t"
               "decl      %%ecx                \n\t"
               "jnz       secondloop32         \n\t"

            "end32:                             \n\t"
               "EMMS                           \n\t" // DONE

               : "=a" (dummy_value_a),           // output regs (dummy)
                 "=d" (dummy_value_d),
                 "=c" (dummy_value_c),
                 "=S" (dummy_value_S),
                 "=D" (dummy_value_D)

               : "3" (srcptr),      // esi       // input regs
                 "4" (dstptr),      // edi
                 "0" (diff),        // eax
// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
                 "2" (len),         // ecx
                 "1" (mask)         // edx

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
               : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
               , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
            );
         }
         else /* mmx _not supported - Use modified C routine */
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            register png_uint_32 i;
            /* byte offset of the first pixel belonging to this pass */
            png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
            /* byte distance between consecutive pixels of this pass */
            register int stride = BPP4 * png_pass_inc[png_ptr->pass];
              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
            /* bytes actually copied at each step */
            register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
            int diff = (int) (png_ptr->width & 7); /* amount lost */
            register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */

            srcptr = png_ptr->row_buf + 1 + initial_val;
            dstptr = row + initial_val;

            for (i = initial_val; i < final_val; i += stride)
            {
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
            if (diff)  /* number of leftover pixels:  3 for pngtest */
            {
               final_val+=diff*BPP4;
               for (; i < final_val; i += stride)
               {
                  /* clamp the last copy so it cannot run past row end */
                  if (rep_bytes > (int)(final_val-i))
                     rep_bytes = (int)(final_val-i);
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
            }
         } /* end of else (_mmx_supported) */

         break;
      }       /* end 32 bpp */
1266
      /* 48 bpp (16-bit RGB): merge masked pixels of the current interlace
       * pass from png_ptr->row_buf into the destination row.  MMX path:
       * 8 pixels (48 bytes) per iteration with six precomputed masks
       * (_mask48_0.._mask48_5); C fallback uses the png_pass_* tables.
       */
      case 48:       /* png_ptr->row_info.pixel_depth */
      {
         png_bytep srcptr;
         png_bytep dstptr;

#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         /* runtime flag set up by png_init_mmx_flags(); implies _mmx_supported */
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
             /* && _mmx_supported */ )
#else
         if (_mmx_supported)
#endif
         {
            png_uint_32 len;
            int diff;
            /* dummies bind eax/edx/ecx/esi/edi as asm outputs so gcc does not
             * try to spill them ("forbidden register spilled" workaround) */
            int dummy_value_a;   // fix 'forbidden register spilled' error
            int dummy_value_d;
            int dummy_value_c;
            int dummy_value_S;
            int dummy_value_D;
            _unmask = ~mask;            // global variable for -fPIC version
            srcptr = png_ptr->row_buf + 1;
            dstptr = row;
            len  = png_ptr->width &~7;  // reduce to multiple of 8
            diff = (int) (png_ptr->width & 7); // amount lost //

            __asm__ __volatile__ (
               /* broadcast the 8-bit pass mask into all 8 bytes of mm7 */
               "movd      _unmask, %%mm7       \n\t" // load bit pattern
               "psubb     %%mm6, %%mm6         \n\t" // zero mm6
               "punpcklbw %%mm7, %%mm7         \n\t"
               "punpcklwd %%mm7, %%mm7         \n\t"
               "punpckldq %%mm7, %%mm7         \n\t" // fill reg with 8 masks

               /* six byte-granular select masks, one per 8-byte sixth of the
                * 48-byte / 8-pixel group */
               "movq      _mask48_0, %%mm0     \n\t"
               "movq      _mask48_1, %%mm1     \n\t"
               "movq      _mask48_2, %%mm2     \n\t"
               "movq      _mask48_3, %%mm3     \n\t"
               "movq      _mask48_4, %%mm4     \n\t"
               "movq      _mask48_5, %%mm5     \n\t"

               "pand      %%mm7, %%mm0         \n\t"
               "pand      %%mm7, %%mm1         \n\t"
               "pand      %%mm7, %%mm2         \n\t"
               "pand      %%mm7, %%mm3         \n\t"
               "pand      %%mm7, %%mm4         \n\t"
               "pand      %%mm7, %%mm5         \n\t"

               /* 0x00 = take source byte, 0xff = keep destination byte */
               "pcmpeqb   %%mm6, %%mm0         \n\t"
               "pcmpeqb   %%mm6, %%mm1         \n\t"
               "pcmpeqb   %%mm6, %%mm2         \n\t"
               "pcmpeqb   %%mm6, %%mm3         \n\t"
               "pcmpeqb   %%mm6, %%mm4         \n\t"
               "pcmpeqb   %%mm6, %%mm5         \n\t"

// preload     "movl      len, %%ecx           \n\t" // load length of line
// preload     "movl      srcptr, %%esi        \n\t" // load source
// preload     "movl      dstptr, %%edi        \n\t" // load dest

               "cmpl      $0, %%ecx            \n\t"
               "jz        mainloop48end        \n\t"

            /* 8 pixels (48 bytes) per iteration; note pandn here takes its
             * second operand directly from memory */
            "mainloop48:                        \n\t"
               "movq      (%%esi), %%mm7       \n\t"
               "pand      %%mm0, %%mm7         \n\t"
               "movq      %%mm0, %%mm6         \n\t"
               "pandn     (%%edi), %%mm6       \n\t"
               "por       %%mm6, %%mm7         \n\t"
               "movq      %%mm7, (%%edi)       \n\t"

               "movq      8(%%esi), %%mm6      \n\t"
               "pand      %%mm1, %%mm6         \n\t"
               "movq      %%mm1, %%mm7         \n\t"
               "pandn     8(%%edi), %%mm7      \n\t"
               "por       %%mm7, %%mm6         \n\t"
               "movq      %%mm6, 8(%%edi)      \n\t"

               "movq      16(%%esi), %%mm6     \n\t"
               "pand      %%mm2, %%mm6         \n\t"
               "movq      %%mm2, %%mm7         \n\t"
               "pandn     16(%%edi), %%mm7     \n\t"
               "por       %%mm7, %%mm6         \n\t"
               "movq      %%mm6, 16(%%edi)     \n\t"

               "movq      24(%%esi), %%mm7     \n\t"
               "pand      %%mm3, %%mm7         \n\t"
               "movq      %%mm3, %%mm6         \n\t"
               "pandn     24(%%edi), %%mm6     \n\t"
               "por       %%mm6, %%mm7         \n\t"
               "movq      %%mm7, 24(%%edi)     \n\t"

               "movq      32(%%esi), %%mm6     \n\t"
               "pand      %%mm4, %%mm6         \n\t"
               "movq      %%mm4, %%mm7         \n\t"
               "pandn     32(%%edi), %%mm7     \n\t"
               "por       %%mm7, %%mm6         \n\t"
               "movq      %%mm6, 32(%%edi)     \n\t"

               "movq      40(%%esi), %%mm7     \n\t"
               "pand      %%mm5, %%mm7         \n\t"
               "movq      %%mm5, %%mm6         \n\t"
               "pandn     40(%%edi), %%mm6     \n\t"
               "por       %%mm6, %%mm7         \n\t"
               "movq      %%mm7, 40(%%edi)     \n\t"

               "addl      $48, %%esi           \n\t" // inc by 48 bytes processed
               "addl      $48, %%edi           \n\t"
               "subl      $8, %%ecx            \n\t" // dec by 8 pixels processed

               "ja        mainloop48           \n\t"

            /* NOTE(review): this leftover-pixel loop copies 4 bytes and
             * advances esi/edi by 4 per mask bit, but a 48-bpp pixel is
             * 6 bytes -- looks wrong for widths not divisible by 8; compare
             * the 3-byte handling in secondloop24.  Verify against upstream
             * pnggccrd.c before changing. */
            "mainloop48end:                     \n\t"
// preload     "movl      diff, %%ecx          \n\t" // (diff is in eax)
               "movl      %%eax, %%ecx         \n\t"
               "cmpl      $0, %%ecx            \n\t"
               "jz        end48                \n\t"
// preload     "movl      mask, %%edx          \n\t"
               "sall      $24, %%edx           \n\t" // make low byte, high byte

            "secondloop48:                      \n\t"
               "sall      %%edx                \n\t" // move high bit to CF
               "jnc       skip48               \n\t" // if CF = 0
               "movl      (%%esi), %%eax       \n\t"
               "movl      %%eax, (%%edi)       \n\t"

            "skip48:                            \n\t"
               "addl      $4, %%esi            \n\t"
               "addl      $4, %%edi            \n\t"
               "decl      %%ecx                \n\t"
               "jnz       secondloop48         \n\t"

            "end48:                             \n\t"
               "EMMS                           \n\t" // DONE

               : "=a" (dummy_value_a),           // output regs (dummy)
                 "=d" (dummy_value_d),
                 "=c" (dummy_value_c),
                 "=S" (dummy_value_S),
                 "=D" (dummy_value_D)

               : "3" (srcptr),      // esi       // input regs
                 "4" (dstptr),      // edi
                 "0" (diff),        // eax
// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
                 "2" (len),         // ecx
                 "1" (mask)         // edx

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
               : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
               , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
            );
         }
         else /* mmx _not supported - Use modified C routine */
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            register png_uint_32 i;
            /* byte offset of the first pixel belonging to this pass */
            png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
            /* byte distance between consecutive pixels of this pass */
            register int stride = BPP6 * png_pass_inc[png_ptr->pass];
              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
            /* bytes actually copied at each step */
            register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
            int diff = (int) (png_ptr->width & 7); /* amount lost */
            register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */

            srcptr = png_ptr->row_buf + 1 + initial_val;
            dstptr = row + initial_val;

            for (i = initial_val; i < final_val; i += stride)
            {
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
            if (diff)  /* number of leftover pixels:  3 for pngtest */
            {
               final_val+=diff*BPP6;
               for (; i < final_val; i += stride)
               {
                  /* clamp the last copy so it cannot run past row end */
                  if (rep_bytes > (int)(final_val-i))
                     rep_bytes = (int)(final_val-i);
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
            }
         } /* end of else (_mmx_supported) */

         break;
      }       /* end 48 bpp */
1458
      /* 64 bpp (16-bit RGBA): C-only path (no MMX variant was written for
       * this depth).  Copies the pixels belonging to the current interlace
       * pass from png_ptr->row_buf into the destination row, using the
       * Adam7 tables from png.c. */
      case 64:       /* png_ptr->row_info.pixel_depth */
      {
         png_bytep srcptr;
         png_bytep dstptr;
         register png_uint_32 i;
         /* byte offset of the first pixel belonging to this pass */
         png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
           /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
         /* byte distance between consecutive pixels of this pass */
         register int stride = BPP8 * png_pass_inc[png_ptr->pass];
           /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
         /* bytes actually copied at each step */
         register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
           /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
         png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
         int diff = (int) (png_ptr->width & 7); /* amount lost */
         register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */

         srcptr = png_ptr->row_buf + 1 + initial_val;
         dstptr = row + initial_val;

         for (i = initial_val; i < final_val; i += stride)
         {
            png_memcpy(dstptr, srcptr, rep_bytes);
            srcptr += stride;
            dstptr += stride;
         }
         if (diff)  /* number of leftover pixels:  3 for pngtest */
         {
            final_val+=diff*BPP8;
            for (; i < final_val; i += stride)
            {
               /* clamp the last copy so it cannot run past row end */
               if (rep_bytes > (int)(final_val-i))
                  rep_bytes = (int)(final_val-i);
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
         }

         break;
      }       /* end 64 bpp */
1498
      /* unreachable for well-formed PNG data: pixel_depth is validated
       * upstream, so warn rather than crash */
      default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
      {
         /* this should never happen */
         png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
         break;
      }
1505 } /* end switch (png_ptr->row_info.pixel_depth) */
1506
1507 } /* end if (non-trivial mask) */
1508
1509} /* end png_combine_row() */
1510
1511#endif /* PNG_HAVE_MMX_COMBINE_ROW */
1512
1513
1514
1515
1516/*===========================================================================*/
1517/* */
1518/* P N G _ D O _ R E A D _ I N T E R L A C E */
1519/* */
1520/*===========================================================================*/
1521
1522#if defined(PNG_READ_INTERLACING_SUPPORTED)
1523#if defined(PNG_HAVE_MMX_READ_INTERLACE)
1524
1525/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1526 * has taken place. [GRR: what other steps come before and/or after?]
1527 */
1528
1529void /* PRIVATE */
1530png_do_read_interlace(png_structp png_ptr)
1531{
1532 png_row_infop row_info = &(png_ptr->row_info);
1533 png_bytep row = png_ptr->row_buf + 1;
1534 int pass = png_ptr->pass;
1535#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1536 png_uint_32 transformations = png_ptr->transformations;
1537#endif
1538
1539 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1540
1541#if defined(PNG_MMX_CODE_SUPPORTED)
1542 if (_mmx_supported == 2) {
1543#if !defined(PNG_1_0_X)
1544 /* this should have happened in png_init_mmx_flags() already */
1545 png_warning(png_ptr, "asm_flags may not have been initialized");
1546#endif
1547 png_mmx_support();
1548 }
1549#endif
1550
1551 if (row != NULL && row_info != NULL)
1552 {
1553 png_uint_32 final_width;
1554
1555 final_width = row_info->width * png_pass_inc[pass];
1556
1557 switch (row_info->pixel_depth)
1558 {
         /* 1 bpp: expand the packed row in place, right to left, replicating
          * each source bit png_pass_inc[pass] times so the row reaches
          * final_width.  sp/dp walk backwards so source bits are never
          * overwritten before they are read. */
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            /* start at the byte holding the last source / dest pixel */
            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               /* packswap: bit order within each byte is reversed */
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  /* clear the target bit, then OR in the replicated value */
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
1614
         /* 2 bpp: same right-to-left in-place expansion as the 1-bpp case,
          * but moving 2-bit samples (shift steps of 2). */
         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            /* start at the byte holding the last source / dest pixel */
            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               /* packswap: sample order within each byte is reversed */
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  /* clear the 2-bit target field, then OR in the value */
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
1671
         /* 4 bpp: same right-to-left in-place expansion, moving 4-bit
          * samples (two per byte, shift steps of 4). */
         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            /* start at the byte holding the last source / dest pixel */
            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               /* packswap: nibble order within each byte is reversed */
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  /* clear the target nibble, then OR in the value */
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
1728
1729 /*====================================================================*/
1730
1731 default: /* 8-bit or larger (this is where the routine is modified) */
1732 {
1733#if 0
1734// static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1735// static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1736// unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1737// unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1738#endif
1739 png_bytep sptr, dp;
1740 png_uint_32 i;
1741 png_size_t pixel_bytes;
1742 int width = (int)row_info->width;
1743
1744 pixel_bytes = (row_info->pixel_depth >> 3);
1745
1746 /* point sptr at the last pixel in the pre-expanded row: */
1747 sptr = row + (width - 1) * pixel_bytes;
1748
1749 /* point dp at the last pixel position in the expanded row: */
1750 dp = row + (final_width - 1) * pixel_bytes;
1751
1752 /* New code by Nirav Chhatrapati - Intel Corporation */
1753
1754#if defined(PNG_MMX_CODE_SUPPORTED)
1755#if !defined(PNG_1_0_X)
1756 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1757 /* && _mmx_supported */ )
1758#else
1759 if (_mmx_supported)
1760#endif
1761 {
1762 //--------------------------------------------------------------
1763 if (pixel_bytes == 3)
1764 {
1765 if (((pass == 0) || (pass == 1)) && width)
1766 {
1767 int dummy_value_c; // fix 'forbidden register spilled'
1768 int dummy_value_S;
1769 int dummy_value_D;
1770 int dummy_value_a;
1771
1772 __asm__ __volatile__ (
1773 "subl $21, %%edi \n\t"
1774 // (png_pass_inc[pass] - 1)*pixel_bytes
1775
1776 ".loop3_pass0: \n\t"
1777 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1778 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1779 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1780 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1781 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1782 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1783 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1784 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1785 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1786 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1787 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1788 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1789 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1790 "movq %%mm4, 16(%%edi) \n\t"
1791 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1792 "movq %%mm3, 8(%%edi) \n\t"
1793 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1794 "subl $3, %%esi \n\t"
1795 "movq %%mm0, (%%edi) \n\t"
1796 "subl $24, %%edi \n\t"
1797 "decl %%ecx \n\t"
1798 "jnz .loop3_pass0 \n\t"
1799 "EMMS \n\t" // DONE
1800
1801 : "=c" (dummy_value_c), // output regs (dummy)
1802 "=S" (dummy_value_S),
1803 "=D" (dummy_value_D),
1804 "=a" (dummy_value_a)
1805
1806
1807 : "1" (sptr), // esi // input regs
1808 "2" (dp), // edi
1809 "0" (width), // ecx
1810 "3" (&_const4) // %1(?) (0x0000000000FFFFFFLL)
1811
1812#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1813 : "%mm0", "%mm1", "%mm2" // clobber list
1814 , "%mm3", "%mm4"
1815#endif
1816 );
1817 }
1818 else if (((pass == 2) || (pass == 3)) && width)
1819 {
1820 int dummy_value_c; // fix 'forbidden register spilled'
1821 int dummy_value_S;
1822 int dummy_value_D;
1823 int dummy_value_a;
1824
1825 __asm__ __volatile__ (
1826 "subl $9, %%edi \n\t"
1827 // (png_pass_inc[pass] - 1)*pixel_bytes
1828
1829 ".loop3_pass2: \n\t"
1830 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1831 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1832 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1833 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1834 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1835 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1836 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1837 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1838 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1839 "movq %%mm0, 4(%%edi) \n\t"
1840 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1841 "subl $3, %%esi \n\t"
1842 "movd %%mm0, (%%edi) \n\t"
1843 "subl $12, %%edi \n\t"
1844 "decl %%ecx \n\t"
1845 "jnz .loop3_pass2 \n\t"
1846 "EMMS \n\t" // DONE
1847
1848 : "=c" (dummy_value_c), // output regs (dummy)
1849 "=S" (dummy_value_S),
1850 "=D" (dummy_value_D),
1851 "=a" (dummy_value_a)
1852
1853 : "1" (sptr), // esi // input regs
1854 "2" (dp), // edi
1855 "0" (width), // ecx
1856 "3" (&_const4) // (0x0000000000FFFFFFLL)
1857
1858#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1859 : "%mm0", "%mm1", "%mm2" // clobber list
1860#endif
1861 );
1862 }
1863 else if (width) /* && ((pass == 4) || (pass == 5)) */
1864 {
1865 int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1866 if (width_mmx < 0)
1867 width_mmx = 0;
1868 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1869 if (width_mmx)
1870 {
1871 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1872 // sptr points at last pixel in pre-expanded row
1873 // dp points at last pixel position in expanded row
1874 int dummy_value_c; // fix 'forbidden register spilled'
1875 int dummy_value_S;
1876 int dummy_value_D;
1877 int dummy_value_a;
1878 int dummy_value_d;
1879
1880 __asm__ __volatile__ (
1881 "subl $3, %%esi \n\t"
1882 "subl $9, %%edi \n\t"
1883 // (png_pass_inc[pass] + 1)*pixel_bytes
1884
1885 ".loop3_pass4: \n\t"
1886 "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1887 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1888 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1889 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1890 "pand (%3), %%mm1 \n\t" // z z z z z 2 1 0
1891 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1892 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1893 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1894 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1895 "movq %%mm0, (%%edi) \n\t"
1896 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1897 "pand (%4), %%mm3 \n\t" // z z z z z z z 5
1898 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1899 "subl $6, %%esi \n\t"
1900 "movd %%mm2, 8(%%edi) \n\t"
1901 "subl $12, %%edi \n\t"
1902 "subl $2, %%ecx \n\t"
1903 "jnz .loop3_pass4 \n\t"
1904 "EMMS \n\t" // DONE
1905
1906 : "=c" (dummy_value_c), // output regs (dummy)
1907 "=S" (dummy_value_S),
1908 "=D" (dummy_value_D),
1909 "=a" (dummy_value_a),
1910 "=d" (dummy_value_d)
1911
1912 : "1" (sptr), // esi // input regs
1913 "2" (dp), // edi
1914 "0" (width_mmx), // ecx
1915 "3" (&_const4), // 0x0000000000FFFFFFLL
1916 "4" (&_const6) // 0x00000000000000FFLL
1917
1918#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1919 : "%mm0", "%mm1" // clobber list
1920 , "%mm2", "%mm3"
1921#endif
1922 );
1923 }
1924
1925 sptr -= width_mmx*3;
1926 dp -= width_mmx*6;
1927 for (i = width; i; i--)
1928 {
1929 png_byte v[8];
1930 int j;
1931
1932 png_memcpy(v, sptr, 3);
1933 for (j = 0; j < png_pass_inc[pass]; j++)
1934 {
1935 png_memcpy(dp, v, 3);
1936 dp -= 3;
1937 }
1938 sptr -= 3;
1939 }
1940 }
1941 } /* end of pixel_bytes == 3 */
1942
1943 //--------------------------------------------------------------
1944 else if (pixel_bytes == 1)
1945 {
1946 if (((pass == 0) || (pass == 1)) && width)
1947 {
1948 int width_mmx = ((width >> 2) << 2);
1949 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1950 if (width_mmx)
1951 {
1952 int dummy_value_c; // fix 'forbidden register spilled'
1953 int dummy_value_S;
1954 int dummy_value_D;
1955
1956 __asm__ __volatile__ (
1957 "subl $3, %%esi \n\t"
1958 "subl $31, %%edi \n\t"
1959
1960 ".loop1_pass0: \n\t"
1961 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1962 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1963 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1964 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1965 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1966 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1967 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1968 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1969 "movq %%mm0, (%%edi) \n\t"
1970 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1971 "movq %%mm3, 8(%%edi) \n\t"
1972 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1973 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1974 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1975 "movq %%mm2, 16(%%edi) \n\t"
1976 "subl $4, %%esi \n\t"
1977 "movq %%mm4, 24(%%edi) \n\t"
1978 "subl $32, %%edi \n\t"
1979 "subl $4, %%ecx \n\t"
1980 "jnz .loop1_pass0 \n\t"
1981 "EMMS \n\t" // DONE
1982
1983 : "=c" (dummy_value_c), // output regs (dummy)
1984 "=S" (dummy_value_S),
1985 "=D" (dummy_value_D)
1986
1987 : "1" (sptr), // esi // input regs
1988 "2" (dp), // edi
1989 "0" (width_mmx) // ecx
1990
1991#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1992 : "%mm0", "%mm1", "%mm2" // clobber list
1993 , "%mm3", "%mm4"
1994#endif
1995 );
1996 }
1997
1998 sptr -= width_mmx;
1999 dp -= width_mmx*8;
2000 for (i = width; i; i--)
2001 {
2002 int j;
2003
2004 /* I simplified this part in version 1.0.4e
2005 * here and in several other instances where
2006 * pixel_bytes == 1 -- GR-P
2007 *
2008 * Original code:
2009 *
2010 * png_byte v[8];
2011 * png_memcpy(v, sptr, pixel_bytes);
2012 * for (j = 0; j < png_pass_inc[pass]; j++)
2013 * {
2014 * png_memcpy(dp, v, pixel_bytes);
2015 * dp -= pixel_bytes;
2016 * }
2017 * sptr -= pixel_bytes;
2018 *
2019 * Replacement code is in the next three lines:
2020 */
2021
2022 for (j = 0; j < png_pass_inc[pass]; j++)
2023 {
2024 *dp-- = *sptr;
2025 }
2026 --sptr;
2027 }
2028 }
2029 else if (((pass == 2) || (pass == 3)) && width)
2030 {
2031 int width_mmx = ((width >> 2) << 2);
2032 width -= width_mmx; // 0-3 pixels => 0-3 bytes
2033 if (width_mmx)
2034 {
2035 int dummy_value_c; // fix 'forbidden register spilled'
2036 int dummy_value_S;
2037 int dummy_value_D;
2038
2039 __asm__ __volatile__ (
2040 "subl $3, %%esi \n\t"
2041 "subl $15, %%edi \n\t"
2042
2043 ".loop1_pass2: \n\t"
2044 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2045 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2046 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
2047 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
2048 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
2049 "movq %%mm0, (%%edi) \n\t"
2050 "subl $4, %%esi \n\t"
2051 "movq %%mm1, 8(%%edi) \n\t"
2052 "subl $16, %%edi \n\t"
2053 "subl $4, %%ecx \n\t"
2054 "jnz .loop1_pass2 \n\t"
2055 "EMMS \n\t" // DONE
2056
2057 : "=c" (dummy_value_c), // output regs (dummy)
2058 "=S" (dummy_value_S),
2059 "=D" (dummy_value_D)
2060
2061 : "1" (sptr), // esi // input regs
2062 "2" (dp), // edi
2063 "0" (width_mmx) // ecx
2064
2065#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2066 : "%mm0", "%mm1" // clobber list
2067#endif
2068 );
2069 }
2070
2071 sptr -= width_mmx;
2072 dp -= width_mmx*4;
2073 for (i = width; i; i--)
2074 {
2075 int j;
2076
2077 for (j = 0; j < png_pass_inc[pass]; j++)
2078 {
2079 *dp-- = *sptr;
2080 }
2081 --sptr;
2082 }
2083 }
2084 else if (width) /* && ((pass == 4) || (pass == 5)) */
2085 {
2086 int width_mmx = ((width >> 3) << 3);
            width -= width_mmx;        // 0-7 pixels => 0-7 bytes
2088 if (width_mmx)
2089 {
2090 int dummy_value_c; // fix 'forbidden register spilled'
2091 int dummy_value_S;
2092 int dummy_value_D;
2093
2094 __asm__ __volatile__ (
2095 "subl $7, %%esi \n\t"
2096 "subl $15, %%edi \n\t"
2097
2098 ".loop1_pass4: \n\t"
2099 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2100 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2101 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2102 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
2103 "movq %%mm1, 8(%%edi) \n\t"
2104 "subl $8, %%esi \n\t"
2105 "movq %%mm0, (%%edi) \n\t"
2106 "subl $16, %%edi \n\t"
2107 "subl $8, %%ecx \n\t"
2108 "jnz .loop1_pass4 \n\t"
2109 "EMMS \n\t" // DONE
2110
2111 : "=c" (dummy_value_c), // output regs (none)
2112 "=S" (dummy_value_S),
2113 "=D" (dummy_value_D)
2114
2115 : "1" (sptr), // esi // input regs
2116 "2" (dp), // edi
2117 "0" (width_mmx) // ecx
2118
2119#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2120 : "%mm0", "%mm1" // clobber list
2121#endif
2122 );
2123 }
2124
2125 sptr -= width_mmx;
2126 dp -= width_mmx*2;
2127 for (i = width; i; i--)
2128 {
2129 int j;
2130
2131 for (j = 0; j < png_pass_inc[pass]; j++)
2132 {
2133 *dp-- = *sptr;
2134 }
2135 --sptr;
2136 }
2137 }
2138 } /* end of pixel_bytes == 1 */
2139
2140 //--------------------------------------------------------------
2141 else if (pixel_bytes == 2)
2142 {
2143 if (((pass == 0) || (pass == 1)) && width)
2144 {
2145 int width_mmx = ((width >> 1) << 1);
2146 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2147 if (width_mmx)
2148 {
2149 int dummy_value_c; // fix 'forbidden register spilled'
2150 int dummy_value_S;
2151 int dummy_value_D;
2152
2153 __asm__ __volatile__ (
2154 "subl $2, %%esi \n\t"
2155 "subl $30, %%edi \n\t"
2156
2157 ".loop2_pass0: \n\t"
2158 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2159 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2160 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2161 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2162 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2163 "movq %%mm0, (%%edi) \n\t"
2164 "movq %%mm0, 8(%%edi) \n\t"
2165 "movq %%mm1, 16(%%edi) \n\t"
2166 "subl $4, %%esi \n\t"
2167 "movq %%mm1, 24(%%edi) \n\t"
2168 "subl $32, %%edi \n\t"
2169 "subl $2, %%ecx \n\t"
2170 "jnz .loop2_pass0 \n\t"
2171 "EMMS \n\t" // DONE
2172
2173 : "=c" (dummy_value_c), // output regs (dummy)
2174 "=S" (dummy_value_S),
2175 "=D" (dummy_value_D)
2176
2177 : "1" (sptr), // esi // input regs
2178 "2" (dp), // edi
2179 "0" (width_mmx) // ecx
2180
2181#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2182 : "%mm0", "%mm1" // clobber list
2183#endif
2184 );
2185 }
2186
2187 sptr -= (width_mmx*2 - 2); // sign fixed
2188 dp -= (width_mmx*16 - 2); // sign fixed
2189 for (i = width; i; i--)
2190 {
2191 png_byte v[8];
2192 int j;
2193 sptr -= 2;
2194 png_memcpy(v, sptr, 2);
2195 for (j = 0; j < png_pass_inc[pass]; j++)
2196 {
2197 dp -= 2;
2198 png_memcpy(dp, v, 2);
2199 }
2200 }
2201 }
2202 else if (((pass == 2) || (pass == 3)) && width)
2203 {
2204 int width_mmx = ((width >> 1) << 1) ;
2205 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2206 if (width_mmx)
2207 {
2208 int dummy_value_c; // fix 'forbidden register spilled'
2209 int dummy_value_S;
2210 int dummy_value_D;
2211
2212 __asm__ __volatile__ (
2213 "subl $2, %%esi \n\t"
2214 "subl $14, %%edi \n\t"
2215
2216 ".loop2_pass2: \n\t"
2217 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2218 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2219 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2220 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2221 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2222 "movq %%mm0, (%%edi) \n\t"
2223 "subl $4, %%esi \n\t"
2224 "movq %%mm1, 8(%%edi) \n\t"
2225 "subl $16, %%edi \n\t"
2226 "subl $2, %%ecx \n\t"
2227 "jnz .loop2_pass2 \n\t"
2228 "EMMS \n\t" // DONE
2229
2230 : "=c" (dummy_value_c), // output regs (dummy)
2231 "=S" (dummy_value_S),
2232 "=D" (dummy_value_D)
2233
2234 : "1" (sptr), // esi // input regs
2235 "2" (dp), // edi
2236 "0" (width_mmx) // ecx
2237
2238#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2239 : "%mm0", "%mm1" // clobber list
2240#endif
2241 );
2242 }
2243
2244 sptr -= (width_mmx*2 - 2); // sign fixed
2245 dp -= (width_mmx*8 - 2); // sign fixed
2246 for (i = width; i; i--)
2247 {
2248 png_byte v[8];
2249 int j;
2250 sptr -= 2;
2251 png_memcpy(v, sptr, 2);
2252 for (j = 0; j < png_pass_inc[pass]; j++)
2253 {
2254 dp -= 2;
2255 png_memcpy(dp, v, 2);
2256 }
2257 }
2258 }
2259 else if (width) // pass == 4 or 5
2260 {
2261 int width_mmx = ((width >> 1) << 1) ;
2262 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2263 if (width_mmx)
2264 {
2265 int dummy_value_c; // fix 'forbidden register spilled'
2266 int dummy_value_S;
2267 int dummy_value_D;
2268
2269 __asm__ __volatile__ (
2270 "subl $2, %%esi \n\t"
2271 "subl $6, %%edi \n\t"
2272
2273 ".loop2_pass4: \n\t"
2274 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2275 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2276 "subl $4, %%esi \n\t"
2277 "movq %%mm0, (%%edi) \n\t"
2278 "subl $8, %%edi \n\t"
2279 "subl $2, %%ecx \n\t"
2280 "jnz .loop2_pass4 \n\t"
2281 "EMMS \n\t" // DONE
2282
2283 : "=c" (dummy_value_c), // output regs (dummy)
2284 "=S" (dummy_value_S),
2285 "=D" (dummy_value_D)
2286
2287 : "1" (sptr), // esi // input regs
2288 "2" (dp), // edi
2289 "0" (width_mmx) // ecx
2290
2291#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2292 : "%mm0" // clobber list
2293#endif
2294 );
2295 }
2296
2297 sptr -= (width_mmx*2 - 2); // sign fixed
2298 dp -= (width_mmx*4 - 2); // sign fixed
2299 for (i = width; i; i--)
2300 {
2301 png_byte v[8];
2302 int j;
2303 sptr -= 2;
2304 png_memcpy(v, sptr, 2);
2305 for (j = 0; j < png_pass_inc[pass]; j++)
2306 {
2307 dp -= 2;
2308 png_memcpy(dp, v, 2);
2309 }
2310 }
2311 }
2312 } /* end of pixel_bytes == 2 */
2313
2314 //--------------------------------------------------------------
2315 else if (pixel_bytes == 4)
2316 {
2317 if (((pass == 0) || (pass == 1)) && width)
2318 {
2319 int width_mmx = ((width >> 1) << 1);
2320 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2321 if (width_mmx)
2322 {
2323 int dummy_value_c; // fix 'forbidden register spilled'
2324 int dummy_value_S;
2325 int dummy_value_D;
2326
2327 __asm__ __volatile__ (
2328 "subl $4, %%esi \n\t"
2329 "subl $60, %%edi \n\t"
2330
2331 ".loop4_pass0: \n\t"
2332 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2333 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2334 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2335 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2336 "movq %%mm0, (%%edi) \n\t"
2337 "movq %%mm0, 8(%%edi) \n\t"
2338 "movq %%mm0, 16(%%edi) \n\t"
2339 "movq %%mm0, 24(%%edi) \n\t"
2340 "movq %%mm1, 32(%%edi) \n\t"
2341 "movq %%mm1, 40(%%edi) \n\t"
2342 "movq %%mm1, 48(%%edi) \n\t"
2343 "subl $8, %%esi \n\t"
2344 "movq %%mm1, 56(%%edi) \n\t"
2345 "subl $64, %%edi \n\t"
2346 "subl $2, %%ecx \n\t"
2347 "jnz .loop4_pass0 \n\t"
2348 "EMMS \n\t" // DONE
2349
2350 : "=c" (dummy_value_c), // output regs (dummy)
2351 "=S" (dummy_value_S),
2352 "=D" (dummy_value_D)
2353
2354 : "1" (sptr), // esi // input regs
2355 "2" (dp), // edi
2356 "0" (width_mmx) // ecx
2357
2358#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2359 : "%mm0", "%mm1" // clobber list
2360#endif
2361 );
2362 }
2363
2364 sptr -= (width_mmx*4 - 4); // sign fixed
2365 dp -= (width_mmx*32 - 4); // sign fixed
2366 for (i = width; i; i--)
2367 {
2368 png_byte v[8];
2369 int j;
2370 sptr -= 4;
2371 png_memcpy(v, sptr, 4);
2372 for (j = 0; j < png_pass_inc[pass]; j++)
2373 {
2374 dp -= 4;
2375 png_memcpy(dp, v, 4);
2376 }
2377 }
2378 }
2379 else if (((pass == 2) || (pass == 3)) && width)
2380 {
2381 int width_mmx = ((width >> 1) << 1);
2382 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2383 if (width_mmx)
2384 {
2385 int dummy_value_c; // fix 'forbidden register spilled'
2386 int dummy_value_S;
2387 int dummy_value_D;
2388
2389 __asm__ __volatile__ (
2390 "subl $4, %%esi \n\t"
2391 "subl $28, %%edi \n\t"
2392
2393 ".loop4_pass2: \n\t"
2394 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2395 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2396 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2397 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2398 "movq %%mm0, (%%edi) \n\t"
2399 "movq %%mm0, 8(%%edi) \n\t"
2400 "movq %%mm1, 16(%%edi) \n\t"
2401 "movq %%mm1, 24(%%edi) \n\t"
2402 "subl $8, %%esi \n\t"
2403 "subl $32, %%edi \n\t"
2404 "subl $2, %%ecx \n\t"
2405 "jnz .loop4_pass2 \n\t"
2406 "EMMS \n\t" // DONE
2407
2408 : "=c" (dummy_value_c), // output regs (dummy)
2409 "=S" (dummy_value_S),
2410 "=D" (dummy_value_D)
2411
2412 : "1" (sptr), // esi // input regs
2413 "2" (dp), // edi
2414 "0" (width_mmx) // ecx
2415
2416#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2417 : "%mm0", "%mm1" // clobber list
2418#endif
2419 );
2420 }
2421
2422 sptr -= (width_mmx*4 - 4); // sign fixed
2423 dp -= (width_mmx*16 - 4); // sign fixed
2424 for (i = width; i; i--)
2425 {
2426 png_byte v[8];
2427 int j;
2428 sptr -= 4;
2429 png_memcpy(v, sptr, 4);
2430 for (j = 0; j < png_pass_inc[pass]; j++)
2431 {
2432 dp -= 4;
2433 png_memcpy(dp, v, 4);
2434 }
2435 }
2436 }
2437 else if (width) // pass == 4 or 5
2438 {
2439 int width_mmx = ((width >> 1) << 1) ;
2440 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2441 if (width_mmx)
2442 {
2443 int dummy_value_c; // fix 'forbidden register spilled'
2444 int dummy_value_S;
2445 int dummy_value_D;
2446
2447 __asm__ __volatile__ (
2448 "subl $4, %%esi \n\t"
2449 "subl $12, %%edi \n\t"
2450
2451 ".loop4_pass4: \n\t"
2452 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2453 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2454 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2455 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2456 "movq %%mm0, (%%edi) \n\t"
2457 "subl $8, %%esi \n\t"
2458 "movq %%mm1, 8(%%edi) \n\t"
2459 "subl $16, %%edi \n\t"
2460 "subl $2, %%ecx \n\t"
2461 "jnz .loop4_pass4 \n\t"
2462 "EMMS \n\t" // DONE
2463
2464 : "=c" (dummy_value_c), // output regs (dummy)
2465 "=S" (dummy_value_S),
2466 "=D" (dummy_value_D)
2467
2468 : "1" (sptr), // esi // input regs
2469 "2" (dp), // edi
2470 "0" (width_mmx) // ecx
2471
2472#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2473 : "%mm0", "%mm1" // clobber list
2474#endif
2475 );
2476 }
2477
2478 sptr -= (width_mmx*4 - 4); // sign fixed
2479 dp -= (width_mmx*8 - 4); // sign fixed
2480 for (i = width; i; i--)
2481 {
2482 png_byte v[8];
2483 int j;
2484 sptr -= 4;
2485 png_memcpy(v, sptr, 4);
2486 for (j = 0; j < png_pass_inc[pass]; j++)
2487 {
2488 dp -= 4;
2489 png_memcpy(dp, v, 4);
2490 }
2491 }
2492 }
2493 } /* end of pixel_bytes == 4 */
2494
2495 //--------------------------------------------------------------
2496 else if (pixel_bytes == 8)
2497 {
2498// GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2499 // GRR NOTE: no need to combine passes here!
2500 if (((pass == 0) || (pass == 1)) && width)
2501 {
2502 int dummy_value_c; // fix 'forbidden register spilled'
2503 int dummy_value_S;
2504 int dummy_value_D;
2505
2506 // source is 8-byte RRGGBBAA
2507 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2508 __asm__ __volatile__ (
2509 "subl $56, %%edi \n\t" // start of last block
2510
2511 ".loop8_pass0: \n\t"
2512 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2513 "movq %%mm0, (%%edi) \n\t"
2514 "movq %%mm0, 8(%%edi) \n\t"
2515 "movq %%mm0, 16(%%edi) \n\t"
2516 "movq %%mm0, 24(%%edi) \n\t"
2517 "movq %%mm0, 32(%%edi) \n\t"
2518 "movq %%mm0, 40(%%edi) \n\t"
2519 "movq %%mm0, 48(%%edi) \n\t"
2520 "subl $8, %%esi \n\t"
2521 "movq %%mm0, 56(%%edi) \n\t"
2522 "subl $64, %%edi \n\t"
2523 "decl %%ecx \n\t"
2524 "jnz .loop8_pass0 \n\t"
2525 "EMMS \n\t" // DONE
2526
2527 : "=c" (dummy_value_c), // output regs (dummy)
2528 "=S" (dummy_value_S),
2529 "=D" (dummy_value_D)
2530
2531 : "1" (sptr), // esi // input regs
2532 "2" (dp), // edi
2533 "0" (width) // ecx
2534
2535#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2536 : "%mm0" // clobber list
2537#endif
2538 );
2539 }
2540 else if (((pass == 2) || (pass == 3)) && width)
2541 {
2542 // source is 8-byte RRGGBBAA
2543 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2544 // (recall that expansion is _in place_: sptr and dp
2545 // both point at locations within same row buffer)
2546 {
2547 int dummy_value_c; // fix 'forbidden register spilled'
2548 int dummy_value_S;
2549 int dummy_value_D;
2550
2551 __asm__ __volatile__ (
2552 "subl $24, %%edi \n\t" // start of last block
2553
2554 ".loop8_pass2: \n\t"
2555 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2556 "movq %%mm0, (%%edi) \n\t"
2557 "movq %%mm0, 8(%%edi) \n\t"
2558 "movq %%mm0, 16(%%edi) \n\t"
2559 "subl $8, %%esi \n\t"
2560 "movq %%mm0, 24(%%edi) \n\t"
2561 "subl $32, %%edi \n\t"
2562 "decl %%ecx \n\t"
2563 "jnz .loop8_pass2 \n\t"
2564 "EMMS \n\t" // DONE
2565
2566 : "=c" (dummy_value_c), // output regs (dummy)
2567 "=S" (dummy_value_S),
2568 "=D" (dummy_value_D)
2569
2570 : "1" (sptr), // esi // input regs
2571 "2" (dp), // edi
2572 "0" (width) // ecx
2573
2574#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2575 : "%mm0" // clobber list
2576#endif
2577 );
2578 }
2579 }
2580 else if (width) // pass == 4 or 5
2581 {
2582 // source is 8-byte RRGGBBAA
2583 // dest is 16-byte RRGGBBAA RRGGBBAA
2584 {
2585 int dummy_value_c; // fix 'forbidden register spilled'
2586 int dummy_value_S;
2587 int dummy_value_D;
2588
2589 __asm__ __volatile__ (
2590 "subl $8, %%edi \n\t" // start of last block
2591
2592 ".loop8_pass4: \n\t"
2593 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2594 "movq %%mm0, (%%edi) \n\t"
2595 "subl $8, %%esi \n\t"
2596 "movq %%mm0, 8(%%edi) \n\t"
2597 "subl $16, %%edi \n\t"
2598 "decl %%ecx \n\t"
2599 "jnz .loop8_pass4 \n\t"
2600 "EMMS \n\t" // DONE
2601
2602 : "=c" (dummy_value_c), // output regs (dummy)
2603 "=S" (dummy_value_S),
2604 "=D" (dummy_value_D)
2605
2606 : "1" (sptr), // esi // input regs
2607 "2" (dp), // edi
2608 "0" (width) // ecx
2609
2610#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2611 : "%mm0" // clobber list
2612#endif
2613 );
2614 }
2615 }
2616
2617 } /* end of pixel_bytes == 8 */
2618
2619 //--------------------------------------------------------------
2620 else if (pixel_bytes == 6)
2621 {
2622 for (i = width; i; i--)
2623 {
2624 png_byte v[8];
2625 int j;
2626 png_memcpy(v, sptr, 6);
2627 for (j = 0; j < png_pass_inc[pass]; j++)
2628 {
2629 png_memcpy(dp, v, 6);
2630 dp -= 6;
2631 }
2632 sptr -= 6;
2633 }
2634 } /* end of pixel_bytes == 6 */
2635
2636 //--------------------------------------------------------------
2637 else
2638 {
2639 for (i = width; i; i--)
2640 {
2641 png_byte v[8];
2642 int j;
2643 png_memcpy(v, sptr, pixel_bytes);
2644 for (j = 0; j < png_pass_inc[pass]; j++)
2645 {
2646 png_memcpy(dp, v, pixel_bytes);
2647 dp -= pixel_bytes;
2648 }
2649 sptr-= pixel_bytes;
2650 }
2651 }
2652 } // end of _mmx_supported ========================================
2653
2654 else /* MMX not supported: use modified C code - takes advantage
2655 * of inlining of png_memcpy for a constant */
2656 /* GRR 19991007: does it? or should pixel_bytes in each
2657 * block be replaced with immediate value (e.g., 1)? */
2658 /* GRR 19991017: replaced with constants in each case */
2659#endif /* PNG_MMX_CODE_SUPPORTED */
2660 {
2661 if (pixel_bytes == 1)
2662 {
2663 for (i = width; i; i--)
2664 {
2665 int j;
2666 for (j = 0; j < png_pass_inc[pass]; j++)
2667 {
2668 *dp-- = *sptr;
2669 }
2670 --sptr;
2671 }
2672 }
2673 else if (pixel_bytes == 3)
2674 {
2675 for (i = width; i; i--)
2676 {
2677 png_byte v[8];
2678 int j;
2679 png_memcpy(v, sptr, 3);
2680 for (j = 0; j < png_pass_inc[pass]; j++)
2681 {
2682 png_memcpy(dp, v, 3);
2683 dp -= 3;
2684 }
2685 sptr -= 3;
2686 }
2687 }
2688 else if (pixel_bytes == 2)
2689 {
2690 for (i = width; i; i--)
2691 {
2692 png_byte v[8];
2693 int j;
2694 png_memcpy(v, sptr, 2);
2695 for (j = 0; j < png_pass_inc[pass]; j++)
2696 {
2697 png_memcpy(dp, v, 2);
2698 dp -= 2;
2699 }
2700 sptr -= 2;
2701 }
2702 }
2703 else if (pixel_bytes == 4)
2704 {
2705 for (i = width; i; i--)
2706 {
2707 png_byte v[8];
2708 int j;
2709 png_memcpy(v, sptr, 4);
2710 for (j = 0; j < png_pass_inc[pass]; j++)
2711 {
2712#ifdef PNG_DEBUG
2713 if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2714 {
2715 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2716 row, dp, row+png_ptr->row_buf_size);
2717 printf("row_buf=%d\n",png_ptr->row_buf_size);
2718 }
2719#endif
2720 png_memcpy(dp, v, 4);
2721 dp -= 4;
2722 }
2723 sptr -= 4;
2724 }
2725 }
2726 else if (pixel_bytes == 6)
2727 {
2728 for (i = width; i; i--)
2729 {
2730 png_byte v[8];
2731 int j;
2732 png_memcpy(v, sptr, 6);
2733 for (j = 0; j < png_pass_inc[pass]; j++)
2734 {
2735 png_memcpy(dp, v, 6);
2736 dp -= 6;
2737 }
2738 sptr -= 6;
2739 }
2740 }
2741 else if (pixel_bytes == 8)
2742 {
2743 for (i = width; i; i--)
2744 {
2745 png_byte v[8];
2746 int j;
2747 png_memcpy(v, sptr, 8);
2748 for (j = 0; j < png_pass_inc[pass]; j++)
2749 {
2750 png_memcpy(dp, v, 8);
2751 dp -= 8;
2752 }
2753 sptr -= 8;
2754 }
2755 }
2756 else /* GRR: should never be reached */
2757 {
2758 for (i = width; i; i--)
2759 {
2760 png_byte v[8];
2761 int j;
2762 png_memcpy(v, sptr, pixel_bytes);
2763 for (j = 0; j < png_pass_inc[pass]; j++)
2764 {
2765 png_memcpy(dp, v, pixel_bytes);
2766 dp -= pixel_bytes;
2767 }
2768 sptr -= pixel_bytes;
2769 }
2770 }
2771
2772 } /* end if (MMX not supported) */
2773 break;
2774 }
2775 } /* end switch (row_info->pixel_depth) */
2776
2777 row_info->width = final_width;
2778
2779 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
2780 }
2781
2782} /* end png_do_read_interlace() */
2783
2784#endif /* PNG_HAVE_MMX_READ_INTERLACE */
2785#endif /* PNG_READ_INTERLACING_SUPPORTED */
2786
2787
2788
2789#if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
2790#if defined(PNG_MMX_CODE_SUPPORTED)
2791
2792// These variables are utilized in the functions below. They are declared
2793// globally here to ensure alignment on 8-byte boundaries.
2794
2795union uAll {
2796 long long use;
2797 double align;
2798} _LBCarryMask = {0x0101010101010101LL},
2799 _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2800 _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2801
2802#ifdef PNG_THREAD_UNSAFE_OK
2803//===========================================================================//
2804// //
2805// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2806// //
2807//===========================================================================//
2808
2809// Optimized code for PNG Average filter decoder
2810
2811static void /* PRIVATE */
2812png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2813 png_bytep prev_row)
2814{
2815 int bpp;
2816 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2817 int dummy_value_S;
2818 int dummy_value_D;
2819
2820 bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
2821 _FullLength = row_info->rowbytes; // # of bytes to filter
2822
2823 __asm__ __volatile__ (
2824 // initialize address pointers and offset
2825#ifdef __PIC__
2826 "pushl %%ebx \n\t" // save index to Global Offset Table
2827#endif
2828//pre "movl row, %%edi \n\t" // edi: Avg(x)
2829 "xorl %%ebx, %%ebx \n\t" // ebx: x
2830 "movl %%edi, %%edx \n\t"
2831//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2832//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2833 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2834
2835 "xorl %%eax,%%eax \n\t"
2836
2837 // Compute the Raw value for the first bpp bytes
2838 // Raw(x) = Avg(x) + (Prior(x)/2)
2839 "avg_rlp: \n\t"
2840 "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2841 "incl %%ebx \n\t"
2842 "shrb %%al \n\t" // divide by 2
2843 "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2844//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2845 "cmpl %%ecx, %%ebx \n\t"
2846 "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2847 "jb avg_rlp \n\t" // mov does not affect flags
2848
2849 // get # of bytes to alignment
2850 "movl %%edi, _dif \n\t" // take start of row
2851 "addl %%ebx, _dif \n\t" // add bpp
2852 "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2853 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2854 "subl %%edi, _dif \n\t" // subtract from start => value ebx at
2855 "jz avg_go \n\t" // alignment
2856
2857 // fix alignment
2858 // Compute the Raw value for the bytes up to the alignment boundary
2859 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2860 "xorl %%ecx, %%ecx \n\t"
2861
2862 "avg_lp1: \n\t"
2863 "xorl %%eax, %%eax \n\t"
2864 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2865 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2866 "addw %%cx, %%ax \n\t"
2867 "incl %%ebx \n\t"
2868 "shrw %%ax \n\t" // divide by 2
2869 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2870 "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2871 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2872 "jb avg_lp1 \n\t" // repeat until at alignment boundary
2873
2874 "avg_go: \n\t"
2875 "movl _FullLength, %%eax \n\t"
2876 "movl %%eax, %%ecx \n\t"
2877 "subl %%ebx, %%eax \n\t" // subtract alignment fix
2878 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2879 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
2880 "movl %%ecx, _MMXLength \n\t"
2881#ifdef __PIC__
2882 "popl %%ebx \n\t" // restore index to Global Offset Table
2883#endif
2884
2885 : "=c" (dummy_value_c), // output regs (dummy)
2886 "=S" (dummy_value_S),
2887 "=D" (dummy_value_D)
2888
2889 : "0" (bpp), // ecx // input regs
2890 "1" (prev_row), // esi
2891 "2" (row) // edi
2892
2893 : "%eax", "%edx" // clobber list
2894#ifndef __PIC__
2895 , "%ebx"
2896#endif
2897 // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2898 // (seems to work fine without...)
2899 );
2900
2901 // now do the math for the rest of the row
2902 switch (bpp)
2903 {
2904 case 3:
2905 {
2906 _ActiveMask.use = 0x0000000000ffffffLL;
2907 _ShiftBpp.use = 24; // == 3 * 8
2908 _ShiftRem.use = 40; // == 64 - 24
2909
2910 __asm__ __volatile__ (
2911 // re-init address pointers and offset
2912 "movq _ActiveMask, %%mm7 \n\t"
2913 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2914 "movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
2915// preload "movl row, %%edi \n\t" // edi: Avg(x)
2916 "movq _HBClearMask, %%mm4 \n\t"
2917// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2918
2919 // prime the pump: load the first Raw(x-bpp) data set
2920 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2921 // (correct pos. in loop below)
2922 "avg_3lp: \n\t"
2923 "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2924 "movq %%mm5, %%mm3 \n\t"
2925 "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
2926 // data
2927 "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2928 "movq %%mm7, %%mm6 \n\t"
2929 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2930 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2931 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
2932 // byte
2933 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
2934 // each byte
2935 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2936 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2937 // LBCarrys
2938 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2939 // where both
2940 // lsb's were == 1 (only valid for active group)
2941 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2942 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2943 // byte
2944 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2945 // for each byte
2946 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
2947 // bytes to add to Avg
2948 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2949 // Avg for each Active
2950 // byte
2951 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2952 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
2953 // bytes 3-5
2954 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2955 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2956 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2957 // LBCarrys
2958 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2959 // where both
2960 // lsb's were == 1 (only valid for active group)
2961 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2962 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2963 // byte
2964 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2965 // for each byte
2966 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2967 // bytes to add to Avg
2968 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2969 // Avg for each Active
2970 // byte
2971
2972 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2973 "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
2974 // two
2975 // bytes
2976 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2977 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2978 // Data only needs to be shifted once here to
2979 // get the correct x-bpp offset.
2980 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2981 // LBCarrys
2982 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2983 // where both
2984 // lsb's were == 1 (only valid for active group)
2985 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2986 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2987 // byte
2988 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2989 // for each byte
2990 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2991 // bytes to add to Avg
2992 "addl $8, %%ecx \n\t"
2993 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2994 // Avg for each Active
2995 // byte
2996 // now ready to write back to memory
2997 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2998 // move updated Raw(x) to use as Raw(x-bpp) for next loop
2999 "cmpl _MMXLength, %%ecx \n\t"
3000 "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
3001 "jb avg_3lp \n\t"
3002
3003 : "=S" (dummy_value_S), // output regs (dummy)
3004 "=D" (dummy_value_D)
3005
3006 : "0" (prev_row), // esi // input regs
3007 "1" (row) // edi
3008
3009 : "%ecx" // clobber list
3010#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3011 , "%mm0", "%mm1", "%mm2", "%mm3"
3012 , "%mm4", "%mm5", "%mm6", "%mm7"
3013#endif
3014 );
3015 }
3016 break; // end 3 bpp
3017
3018 case 6:
3019 case 4:
3020 //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
3021 //case 5: // GRR BOGUS
3022 {
3023 _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
3024 // appropriate inactive bytes
3025 _ShiftBpp.use = bpp << 3;
3026 _ShiftRem.use = 64 - _ShiftBpp.use;
3027
3028 __asm__ __volatile__ (
3029 "movq _HBClearMask, %%mm4 \n\t"
3030
3031 // re-init address pointers and offset
3032 "movl _dif, %%ecx \n\t" // ecx: x = offset to
3033 // alignment boundary
3034
3035 // load _ActiveMask and clear all bytes except for 1st active group
3036 "movq _ActiveMask, %%mm7 \n\t"
3037// preload "movl row, %%edi \n\t" // edi: Avg(x)
3038 "psrlq _ShiftRem, %%mm7 \n\t"
3039// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3040 "movq %%mm7, %%mm6 \n\t"
3041 "movq _LBCarryMask, %%mm5 \n\t"
3042 "psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
3043 // group
3044
3045 // prime the pump: load the first Raw(x-bpp) data set
3046 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3047 // (we correct pos. in loop below)
3048 "avg_4lp: \n\t"
3049 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3050 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3051 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3052 // add (Prev_row/2) to average
3053 "movq %%mm5, %%mm3 \n\t"
3054 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3055 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3056 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3057 // byte
3058 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3059 // each byte
3060 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3061 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3062 // LBCarrys
3063 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3064 // where both
3065 // lsb's were == 1 (only valid for active group)
3066 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3067 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3068 // byte
3069 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3070 // for each byte
3071 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
3072 // bytes to add to Avg
3073 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3074 // for each Active
3075 // byte
3076 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3077 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3078 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3079 "addl $8, %%ecx \n\t"
3080 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3081 // LBCarrys
3082 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3083 // where both
3084 // lsb's were == 1 (only valid for active group)
3085 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3086 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3087 // byte
3088 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3089 // for each byte
3090 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3091 // bytes to add to Avg
3092 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3093 // Avg for each Active
3094 // byte
3095 "cmpl _MMXLength, %%ecx \n\t"
3096 // now ready to write back to memory
3097 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3098 // prep Raw(x-bpp) for next loop
3099 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3100 "jb avg_4lp \n\t"
3101
3102 : "=S" (dummy_value_S), // output regs (dummy)
3103 "=D" (dummy_value_D)
3104
3105 : "0" (prev_row), // esi // input regs
3106 "1" (row) // edi
3107
3108 : "%ecx" // clobber list
3109#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3110 , "%mm0", "%mm1", "%mm2", "%mm3"
3111 , "%mm4", "%mm5", "%mm6", "%mm7"
3112#endif
3113 );
3114 }
3115 break; // end 4,6 bpp
3116
3117 case 2:
3118 {
3119 _ActiveMask.use = 0x000000000000ffffLL;
3120 _ShiftBpp.use = 16; // == 2 * 8
3121 _ShiftRem.use = 48; // == 64 - 16
3122
3123 __asm__ __volatile__ (
3124 // load _ActiveMask
3125 "movq _ActiveMask, %%mm7 \n\t"
3126 // re-init address pointers and offset
3127 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
3128 // boundary
3129 "movq _LBCarryMask, %%mm5 \n\t"
3130// preload "movl row, %%edi \n\t" // edi: Avg(x)
3131 "movq _HBClearMask, %%mm4 \n\t"
3132// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3133
3134 // prime the pump: load the first Raw(x-bpp) data set
3135 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3136 // (we correct pos. in loop below)
3137 "avg_2lp: \n\t"
3138 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3139 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3140 "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
3141 // add (Prev_row/2) to average
3142 "movq %%mm5, %%mm3 \n\t"
3143 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3144 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3145 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3146 // byte
3147 "movq %%mm7, %%mm6 \n\t"
3148 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3149 // each byte
3150
3151 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3152 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3153 // LBCarrys
3154 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3155 // where both
3156 // lsb's were == 1 (only valid
3157 // for active group)
3158 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3159 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3160 // byte
3161 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3162 // for each byte
3163 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
3164 // bytes to add to Avg
3165 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3166 // for each Active byte
3167
3168 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3169 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3170 // bytes 2 & 3
3171 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3172 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3173 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3174 // LBCarrys
3175 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3176 // where both
3177 // lsb's were == 1 (only valid
3178 // for active group)
3179 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3180 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3181 // byte
3182 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3183 // for each byte
3184 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3185 // bytes to add to Avg
3186 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3187 // Avg for each Active byte
3188
3189 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3190 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3191 // bytes 4 & 5
3192 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3193 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3194 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3195 // LBCarrys
3196 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3197 // where both lsb's were == 1
3198 // (only valid for active group)
3199 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3200 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3201 // byte
3202 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3203 // for each byte
3204 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
3205 // bytes to add to Avg
3206 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3207 // Avg for each Active byte
3208
3209 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3210 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3211 // bytes 6 & 7
3212 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3213 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3214 "addl $8, %%ecx \n\t"
3215 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3216 // LBCarrys
3217 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3218 // where both
3219 // lsb's were == 1 (only valid
3220 // for active group)
3221 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3222 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3223 // byte
3224 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3225 // for each byte
3226 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 4
3227 // bytes to add to Avg
3228 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3229 // Avg for each Active byte
3230
3231 "cmpl _MMXLength, %%ecx \n\t"
3232 // now ready to write back to memory
3233 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3234 // prep Raw(x-bpp) for next loop
3235 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3236 "jb avg_2lp \n\t"
3237
3238 : "=S" (dummy_value_S), // output regs (dummy)
3239 "=D" (dummy_value_D)
3240
3241 : "0" (prev_row), // esi // input regs
3242 "1" (row) // edi
3243
3244 : "%ecx" // clobber list
3245#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3246 , "%mm0", "%mm1", "%mm2", "%mm3"
3247 , "%mm4", "%mm5", "%mm6", "%mm7"
3248#endif
3249 );
3250 }
3251 break; // end 2 bpp
3252
3253 case 1:
3254 {
3255 __asm__ __volatile__ (
3256 // re-init address pointers and offset
3257#ifdef __PIC__
3258 "pushl %%ebx \n\t" // save Global Offset Table index
3259#endif
3260 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
3261 // boundary
3262// preload "movl row, %%edi \n\t" // edi: Avg(x)
3263 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3264 "jnb avg_1end \n\t"
3265 // do Avg decode for remaining bytes
3266// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3267 "movl %%edi, %%edx \n\t"
3268// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3269 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3270 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3271 // in loop below
3272 "avg_1lp: \n\t"
3273 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3274 "xorl %%eax, %%eax \n\t"
3275 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3276 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3277 "addw %%cx, %%ax \n\t"
3278 "incl %%ebx \n\t"
3279 "shrw %%ax \n\t" // divide by 2
3280 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3281 // inc ebx
3282 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3283 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3284 // mov does not affect flags; -1 to offset inc ebx
3285 "jb avg_1lp \n\t"
3286
3287 "avg_1end: \n\t"
3288#ifdef __PIC__
3289 "popl %%ebx \n\t" // Global Offset Table index
3290#endif
3291
3292 : "=c" (dummy_value_c), // output regs (dummy)
3293 "=S" (dummy_value_S),
3294 "=D" (dummy_value_D)
3295
3296 : "0" (bpp), // ecx // input regs
3297 "1" (prev_row), // esi
3298 "2" (row) // edi
3299
3300 : "%eax", "%edx" // clobber list
3301#ifndef __PIC__
3302 , "%ebx"
3303#endif
3304 );
3305 }
3306 return; // end 1 bpp
3307
3308 case 8:
3309 {
3310 __asm__ __volatile__ (
3311 // re-init address pointers and offset
3312 "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
3313 "movq _LBCarryMask, %%mm5 \n\t" // boundary
3314// preload "movl row, %%edi \n\t" // edi: Avg(x)
3315 "movq _HBClearMask, %%mm4 \n\t"
3316// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3317
3318 // prime the pump: load the first Raw(x-bpp) data set
3319 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3320 // (NO NEED to correct pos. in loop below)
3321
3322 "avg_8lp: \n\t"
3323 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3324 "movq %%mm5, %%mm3 \n\t"
3325 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3326 "addl $8, %%ecx \n\t"
3327 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3328 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3329 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3330 // where both lsb's were == 1
3331 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3332 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3333 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3334 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3335 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
3336 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3337 "cmpl _MMXLength, %%ecx \n\t"
3338 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3339 "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3340 "jb avg_8lp \n\t"
3341
3342 : "=S" (dummy_value_S), // output regs (dummy)
3343 "=D" (dummy_value_D)
3344
3345 : "0" (prev_row), // esi // input regs
3346 "1" (row) // edi
3347
3348 : "%ecx" // clobber list
3349#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3350 , "%mm0", "%mm1", "%mm2"
3351 , "%mm3", "%mm4", "%mm5"
3352#endif
3353 );
3354 }
3355 break; // end 8 bpp
3356
3357 default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3358 {
3359
3360#ifdef PNG_DEBUG
3361 // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3362 png_debug(1,
3363 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3364#endif
3365
3366#if 0
3367 __asm__ __volatile__ (
3368 "movq _LBCarryMask, %%mm5 \n\t"
3369 // re-init address pointers and offset
3370 "movl _dif, %%ebx \n\t" // ebx: x = offset to
3371 // alignment boundary
3372 "movl row, %%edi \n\t" // edi: Avg(x)
3373 "movq _HBClearMask, %%mm4 \n\t"
3374 "movl %%edi, %%edx \n\t"
3375 "movl prev_row, %%esi \n\t" // esi: Prior(x)
3376 "subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3377 "avg_Alp: \n\t"
3378 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3379 "movq %%mm5, %%mm3 \n\t"
3380 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3381 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3382 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3383 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3384 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3385 // where both lsb's were == 1
3386 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3387 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3388 // byte
3389 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
3390 // byte
3391 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3392 // byte
3393 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3394 // each byte
3395 "addl $8, %%ebx \n\t"
3396 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3397 // byte
3398 "cmpl _MMXLength, %%ebx \n\t"
3399 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3400 "jb avg_Alp \n\t"
3401
3402 : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3403
3404 : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3405
3406 : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3407 );
3408#endif /* 0 - NEVER REACHED */
3409 }
3410 break;
3411
3412 } // end switch (bpp)
3413
3414 __asm__ __volatile__ (
3415 // MMX acceleration complete; now do clean-up
3416 // check if any remaining bytes left to decode
3417#ifdef __PIC__
3418 "pushl %%ebx \n\t" // save index to Global Offset Table
3419#endif
3420 "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3421//pre "movl row, %%edi \n\t" // edi: Avg(x)
3422 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3423 "jnb avg_end \n\t"
3424
3425 // do Avg decode for remaining bytes
3426//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3427 "movl %%edi, %%edx \n\t"
3428//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3429 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3430 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3431
3432 "avg_lp2: \n\t"
3433 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3434 "xorl %%eax, %%eax \n\t"
3435 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3436 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3437 "addw %%cx, %%ax \n\t"
3438 "incl %%ebx \n\t"
3439 "shrw %%ax \n\t" // divide by 2
3440 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3441 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3442 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3443 "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3444
3445 "avg_end: \n\t"
3446 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
3447#ifdef __PIC__
3448 "popl %%ebx \n\t" // restore index to Global Offset Table
3449#endif
3450
3451 : "=c" (dummy_value_c), // output regs (dummy)
3452 "=S" (dummy_value_S),
3453 "=D" (dummy_value_D)
3454
3455 : "0" (bpp), // ecx // input regs
3456 "1" (prev_row), // esi
3457 "2" (row) // edi
3458
3459 : "%eax", "%edx" // clobber list
3460#ifndef __PIC__
3461 , "%ebx"
3462#endif
3463 );
3464
3465} /* end png_read_filter_row_mmx_avg() */
3466#endif
3467
3468
3469
3470#ifdef PNG_THREAD_UNSAFE_OK
3471//===========================================================================//
3472// //
3473// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3474// //
3475//===========================================================================//
3476
3477// Optimized code for PNG Paeth filter decoder
3478
3479static void /* PRIVATE */
3480png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3481 png_bytep prev_row)
3482{
3483 int bpp;
3484 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3485 int dummy_value_S;
3486 int dummy_value_D;
3487
3488 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3489 _FullLength = row_info->rowbytes; // # of bytes to filter
3490
3491 __asm__ __volatile__ (
3492#ifdef __PIC__
3493 "pushl %%ebx \n\t" // save index to Global Offset Table
3494#endif
3495 "xorl %%ebx, %%ebx \n\t" // ebx: x offset
3496//pre "movl row, %%edi \n\t"
3497 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3498//pre "movl prev_row, %%esi \n\t"
3499 "xorl %%eax, %%eax \n\t"
3500
3501 // Compute the Raw value for the first bpp bytes
3502 // Note: the formula works out to be always
3503 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
3504 "paeth_rlp: \n\t"
3505 "movb (%%edi,%%ebx,), %%al \n\t"
3506 "addb (%%esi,%%ebx,), %%al \n\t"
3507 "incl %%ebx \n\t"
3508//pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3509 "cmpl %%ecx, %%ebx \n\t"
3510 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3511 "jb paeth_rlp \n\t"
3512 // get # of bytes to alignment
3513 "movl %%edi, _dif \n\t" // take start of row
3514 "addl %%ebx, _dif \n\t" // add bpp
3515 "xorl %%ecx, %%ecx \n\t"
3516 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
3517 // boundary
3518 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3519 "subl %%edi, _dif \n\t" // subtract from start ==> value ebx
3520 // at alignment
3521 "jz paeth_go \n\t"
3522 // fix alignment
3523
3524 "paeth_lp1: \n\t"
3525 "xorl %%eax, %%eax \n\t"
3526 // pav = p - a = (a + b - c) - a = b - c
3527 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3528 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3529 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3530 "movl %%eax, _patemp \n\t" // Save pav for later use
3531 "xorl %%eax, %%eax \n\t"
3532 // pbv = p - b = (a + b - c) - b = a - c
3533 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3534 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3535 "movl %%eax, %%ecx \n\t"
3536 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3537 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3538 // pc = abs(pcv)
3539 "testl $0x80000000, %%eax \n\t"
3540 "jz paeth_pca \n\t"
3541 "negl %%eax \n\t" // reverse sign of neg values
3542
3543 "paeth_pca: \n\t"
3544 "movl %%eax, _pctemp \n\t" // save pc for later use
3545 // pb = abs(pbv)
3546 "testl $0x80000000, %%ecx \n\t"
3547 "jz paeth_pba \n\t"
3548 "negl %%ecx \n\t" // reverse sign of neg values
3549
3550 "paeth_pba: \n\t"
3551 "movl %%ecx, _pbtemp \n\t" // save pb for later use
3552 // pa = abs(pav)
3553 "movl _patemp, %%eax \n\t"
3554 "testl $0x80000000, %%eax \n\t"
3555 "jz paeth_paa \n\t"
3556 "negl %%eax \n\t" // reverse sign of neg values
3557
3558 "paeth_paa: \n\t"
3559 "movl %%eax, _patemp \n\t" // save pa for later use
3560 // test if pa <= pb
3561 "cmpl %%ecx, %%eax \n\t"
3562 "jna paeth_abb \n\t"
3563 // pa > pb; now test if pb <= pc
3564 "cmpl _pctemp, %%ecx \n\t"
3565 "jna paeth_bbc \n\t"
3566 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3567 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3568 "jmp paeth_paeth \n\t"
3569
3570 "paeth_bbc: \n\t"
3571 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3572 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3573 "jmp paeth_paeth \n\t"
3574
3575 "paeth_abb: \n\t"
3576 // pa <= pb; now test if pa <= pc
3577 "cmpl _pctemp, %%eax \n\t"
3578 "jna paeth_abc \n\t"
3579 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3580 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3581 "jmp paeth_paeth \n\t"
3582
3583 "paeth_abc: \n\t"
3584 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3585 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3586
3587 "paeth_paeth: \n\t"
3588 "incl %%ebx \n\t"
3589 "incl %%edx \n\t"
3590 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3591 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3592 "cmpl _dif, %%ebx \n\t"
3593 "jb paeth_lp1 \n\t"
3594
3595 "paeth_go: \n\t"
3596 "movl _FullLength, %%ecx \n\t"
3597 "movl %%ecx, %%eax \n\t"
3598 "subl %%ebx, %%eax \n\t" // subtract alignment fix
3599 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3600 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
3601 "movl %%ecx, _MMXLength \n\t"
3602#ifdef __PIC__
3603 "popl %%ebx \n\t" // restore index to Global Offset Table
3604#endif
3605
3606 : "=c" (dummy_value_c), // output regs (dummy)
3607 "=S" (dummy_value_S),
3608 "=D" (dummy_value_D)
3609
3610 : "0" (bpp), // ecx // input regs
3611 "1" (prev_row), // esi
3612 "2" (row) // edi
3613
3614 : "%eax", "%edx" // clobber list
3615#ifndef __PIC__
3616 , "%ebx"
3617#endif
3618 );
3619
3620 // now do the math for the rest of the row
3621 switch (bpp)
3622 {
3623 case 3:
3624 {
3625 _ActiveMask.use = 0x0000000000ffffffLL;
3626 _ActiveMaskEnd.use = 0xffff000000000000LL;
3627 _ShiftBpp.use = 24; // == bpp(3) * 8
3628 _ShiftRem.use = 40; // == 64 - 24
3629
3630 __asm__ __volatile__ (
3631 "movl _dif, %%ecx \n\t"
3632// preload "movl row, %%edi \n\t"
3633// preload "movl prev_row, %%esi \n\t"
3634 "pxor %%mm0, %%mm0 \n\t"
3635 // prime the pump: load the first Raw(x-bpp) data set
3636 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3637 "paeth_3lp: \n\t"
3638 "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
3639 // 3 bytes
3640 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3641 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3642 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3643 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3644 "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
3645 // 3 bytes
3646 // pav = p - a = (a + b - c) - a = b - c
3647 "movq %%mm2, %%mm4 \n\t"
3648 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3649 // pbv = p - b = (a + b - c) - b = a - c
3650 "movq %%mm1, %%mm5 \n\t"
3651 "psubw %%mm3, %%mm4 \n\t"
3652 "pxor %%mm7, %%mm7 \n\t"
3653 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3654 "movq %%mm4, %%mm6 \n\t"
3655 "psubw %%mm3, %%mm5 \n\t"
3656
3657 // pa = abs(p-a) = abs(pav)
3658 // pb = abs(p-b) = abs(pbv)
3659 // pc = abs(p-c) = abs(pcv)
3660 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3661 "paddw %%mm5, %%mm6 \n\t"
3662 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3663 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3664 "psubw %%mm0, %%mm4 \n\t"
3665 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3666 "psubw %%mm0, %%mm4 \n\t"
3667 "psubw %%mm7, %%mm5 \n\t"
3668 "pxor %%mm0, %%mm0 \n\t"
3669 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3670 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3671 "psubw %%mm7, %%mm5 \n\t"
3672 "psubw %%mm0, %%mm6 \n\t"
3673 // test pa <= pb
3674 "movq %%mm4, %%mm7 \n\t"
3675 "psubw %%mm0, %%mm6 \n\t"
3676 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3677 "movq %%mm7, %%mm0 \n\t"
3678 // use mm7 mask to merge pa & pb
3679 "pand %%mm7, %%mm5 \n\t"
3680 // use mm0 mask copy to merge a & b
3681 "pand %%mm0, %%mm2 \n\t"
3682 "pandn %%mm4, %%mm7 \n\t"
3683 "pandn %%mm1, %%mm0 \n\t"
3684 "paddw %%mm5, %%mm7 \n\t"
3685 "paddw %%mm2, %%mm0 \n\t"
3686 // test ((pa <= pb)? pa:pb) <= pc
3687 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3688 "pxor %%mm1, %%mm1 \n\t"
3689 "pand %%mm7, %%mm3 \n\t"
3690 "pandn %%mm0, %%mm7 \n\t"
3691 "paddw %%mm3, %%mm7 \n\t"
3692 "pxor %%mm0, %%mm0 \n\t"
3693 "packuswb %%mm1, %%mm7 \n\t"
3694 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3695 "pand _ActiveMask, %%mm7 \n\t"
3696 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3697 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3698 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3699 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3700 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
3701 // Raw(x-bpp)
3702 // now do Paeth for 2nd set of bytes (3-5)
3703 "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3704 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3705 "pxor %%mm7, %%mm7 \n\t"
3706 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3707 // pbv = p - b = (a + b - c) - b = a - c
3708 "movq %%mm1, %%mm5 \n\t"
3709 // pav = p - a = (a + b - c) - a = b - c
3710 "movq %%mm2, %%mm4 \n\t"
3711 "psubw %%mm3, %%mm5 \n\t"
3712 "psubw %%mm3, %%mm4 \n\t"
3713 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3714 // pav + pbv = pbv + pav
3715 "movq %%mm5, %%mm6 \n\t"
3716 "paddw %%mm4, %%mm6 \n\t"
3717
3718 // pa = abs(p-a) = abs(pav)
3719 // pb = abs(p-b) = abs(pbv)
3720 // pc = abs(p-c) = abs(pcv)
3721 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3722 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3723 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3724 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3725 "psubw %%mm0, %%mm5 \n\t"
3726 "psubw %%mm7, %%mm4 \n\t"
3727 "psubw %%mm0, %%mm5 \n\t"
3728 "psubw %%mm7, %%mm4 \n\t"
3729 "pxor %%mm0, %%mm0 \n\t"
3730 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3731 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3732 "psubw %%mm0, %%mm6 \n\t"
3733 // test pa <= pb
3734 "movq %%mm4, %%mm7 \n\t"
3735 "psubw %%mm0, %%mm6 \n\t"
3736 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3737 "movq %%mm7, %%mm0 \n\t"
3738 // use mm7 mask to merge pa & pb
3739 "pand %%mm7, %%mm5 \n\t"
3740 // use mm0 mask copy to merge a & b
3741 "pand %%mm0, %%mm2 \n\t"
3742 "pandn %%mm4, %%mm7 \n\t"
3743 "pandn %%mm1, %%mm0 \n\t"
3744 "paddw %%mm5, %%mm7 \n\t"
3745 "paddw %%mm2, %%mm0 \n\t"
3746 // test ((pa <= pb)? pa:pb) <= pc
3747 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3748 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3749 "pand %%mm7, %%mm3 \n\t"
3750 "pandn %%mm0, %%mm7 \n\t"
3751 "pxor %%mm1, %%mm1 \n\t"
3752 "paddw %%mm3, %%mm7 \n\t"
3753 "pxor %%mm0, %%mm0 \n\t"
3754 "packuswb %%mm1, %%mm7 \n\t"
3755 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3756 "pand _ActiveMask, %%mm7 \n\t"
3757 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3758 "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
3759 // 3 bytes
3760 // pav = p - a = (a + b - c) - a = b - c
3761 "movq %%mm2, %%mm4 \n\t"
3762 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3763 "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3764 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3765 "movq %%mm7, %%mm1 \n\t"
3766 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3767 "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3768 // now mm1 will be used as Raw(x-bpp)
3769 // now do Paeth for 3rd, and final, set of bytes (6-7)
3770 "pxor %%mm7, %%mm7 \n\t"
3771 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3772 "psubw %%mm3, %%mm4 \n\t"
3773 // pbv = p - b = (a + b - c) - b = a - c
3774 "movq %%mm1, %%mm5 \n\t"
3775 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3776 "movq %%mm4, %%mm6 \n\t"
3777 "psubw %%mm3, %%mm5 \n\t"
3778 "pxor %%mm0, %%mm0 \n\t"
3779 "paddw %%mm5, %%mm6 \n\t"
3780
3781 // pa = abs(p-a) = abs(pav)
3782 // pb = abs(p-b) = abs(pbv)
3783 // pc = abs(p-c) = abs(pcv)
3784 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3785 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3786 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3787 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3788 "psubw %%mm0, %%mm4 \n\t"
3789 "psubw %%mm7, %%mm5 \n\t"
3790 "psubw %%mm0, %%mm4 \n\t"
3791 "psubw %%mm7, %%mm5 \n\t"
3792 "pxor %%mm0, %%mm0 \n\t"
3793 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3794 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3795 "psubw %%mm0, %%mm6 \n\t"
3796 // test pa <= pb
3797 "movq %%mm4, %%mm7 \n\t"
3798 "psubw %%mm0, %%mm6 \n\t"
3799 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3800 "movq %%mm7, %%mm0 \n\t"
3801 // use mm0 mask copy to merge a & b
3802 "pand %%mm0, %%mm2 \n\t"
3803 // use mm7 mask to merge pa & pb
3804 "pand %%mm7, %%mm5 \n\t"
3805 "pandn %%mm1, %%mm0 \n\t"
3806 "pandn %%mm4, %%mm7 \n\t"
3807 "paddw %%mm2, %%mm0 \n\t"
3808 "paddw %%mm5, %%mm7 \n\t"
3809 // test ((pa <= pb)? pa:pb) <= pc
3810 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3811 "pand %%mm7, %%mm3 \n\t"
3812 "pandn %%mm0, %%mm7 \n\t"
3813 "paddw %%mm3, %%mm7 \n\t"
3814 "pxor %%mm1, %%mm1 \n\t"
3815 "packuswb %%mm7, %%mm1 \n\t"
3816 // step ecx to next set of 8 bytes and repeat loop til done
3817 "addl $8, %%ecx \n\t"
3818 "pand _ActiveMaskEnd, %%mm1 \n\t"
3819 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3820 // Raw(x)
3821
3822 "cmpl _MMXLength, %%ecx \n\t"
3823 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3824 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3825 // mm1 will be used as Raw(x-bpp) next loop
3826 // mm3 ready to be used as Prior(x-bpp) next loop
3827 "jb paeth_3lp \n\t"
3828
3829 : "=S" (dummy_value_S), // output regs (dummy)
3830 "=D" (dummy_value_D)
3831
3832 : "0" (prev_row), // esi // input regs
3833 "1" (row) // edi
3834
3835 : "%ecx" // clobber list
3836#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3837 , "%mm0", "%mm1", "%mm2", "%mm3"
3838 , "%mm4", "%mm5", "%mm6", "%mm7"
3839#endif
3840 );
3841 }
3842 break; // end 3 bpp
3843
3844 case 6:
3845 //case 7: // GRR BOGUS
3846 //case 5: // GRR BOGUS
3847 {
3848 _ActiveMask.use = 0x00000000ffffffffLL;
3849 _ActiveMask2.use = 0xffffffff00000000LL;
3850 _ShiftBpp.use = bpp << 3; // == bpp * 8
3851 _ShiftRem.use = 64 - _ShiftBpp.use;
3852
3853 __asm__ __volatile__ (
3854 "movl _dif, %%ecx \n\t"
3855// preload "movl row, %%edi \n\t"
3856// preload "movl prev_row, %%esi \n\t"
3857 // prime the pump: load the first Raw(x-bpp) data set
3858 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3859 "pxor %%mm0, %%mm0 \n\t"
3860
3861 "paeth_6lp: \n\t"
3862 // must shift to position Raw(x-bpp) data
3863 "psrlq _ShiftRem, %%mm1 \n\t"
3864 // do first set of 4 bytes
3865 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3866 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3867 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3868 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3869 // must shift to position Prior(x-bpp) data
3870 "psrlq _ShiftRem, %%mm3 \n\t"
3871 // pav = p - a = (a + b - c) - a = b - c
3872 "movq %%mm2, %%mm4 \n\t"
3873 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3874 // pbv = p - b = (a + b - c) - b = a - c
3875 "movq %%mm1, %%mm5 \n\t"
3876 "psubw %%mm3, %%mm4 \n\t"
3877 "pxor %%mm7, %%mm7 \n\t"
3878 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3879 "movq %%mm4, %%mm6 \n\t"
3880 "psubw %%mm3, %%mm5 \n\t"
3881 // pa = abs(p-a) = abs(pav)
3882 // pb = abs(p-b) = abs(pbv)
3883 // pc = abs(p-c) = abs(pcv)
3884 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3885 "paddw %%mm5, %%mm6 \n\t"
3886 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3887 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3888 "psubw %%mm0, %%mm4 \n\t"
3889 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3890 "psubw %%mm0, %%mm4 \n\t"
3891 "psubw %%mm7, %%mm5 \n\t"
3892 "pxor %%mm0, %%mm0 \n\t"
3893 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3894 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3895 "psubw %%mm7, %%mm5 \n\t"
3896 "psubw %%mm0, %%mm6 \n\t"
3897 // test pa <= pb
3898 "movq %%mm4, %%mm7 \n\t"
3899 "psubw %%mm0, %%mm6 \n\t"
3900 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3901 "movq %%mm7, %%mm0 \n\t"
3902 // use mm7 mask to merge pa & pb
3903 "pand %%mm7, %%mm5 \n\t"
3904 // use mm0 mask copy to merge a & b
3905 "pand %%mm0, %%mm2 \n\t"
3906 "pandn %%mm4, %%mm7 \n\t"
3907 "pandn %%mm1, %%mm0 \n\t"
3908 "paddw %%mm5, %%mm7 \n\t"
3909 "paddw %%mm2, %%mm0 \n\t"
3910 // test ((pa <= pb)? pa:pb) <= pc
3911 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3912 "pxor %%mm1, %%mm1 \n\t"
3913 "pand %%mm7, %%mm3 \n\t"
3914 "pandn %%mm0, %%mm7 \n\t"
3915 "paddw %%mm3, %%mm7 \n\t"
3916 "pxor %%mm0, %%mm0 \n\t"
3917 "packuswb %%mm1, %%mm7 \n\t"
3918 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3919 "pand _ActiveMask, %%mm7 \n\t"
3920 "psrlq _ShiftRem, %%mm3 \n\t"
3921 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3922 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3923 "movq %%mm2, %%mm6 \n\t"
3924 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3925 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3926 "psllq _ShiftBpp, %%mm6 \n\t"
3927 "movq %%mm7, %%mm5 \n\t"
3928 "psrlq _ShiftRem, %%mm1 \n\t"
3929 "por %%mm6, %%mm3 \n\t"
3930 "psllq _ShiftBpp, %%mm5 \n\t"
3931 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3932 "por %%mm5, %%mm1 \n\t"
3933 // do second set of 4 bytes
3934 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3935 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3936 // pav = p - a = (a + b - c) - a = b - c
3937 "movq %%mm2, %%mm4 \n\t"
3938 // pbv = p - b = (a + b - c) - b = a - c
3939 "movq %%mm1, %%mm5 \n\t"
3940 "psubw %%mm3, %%mm4 \n\t"
3941 "pxor %%mm7, %%mm7 \n\t"
3942 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3943 "movq %%mm4, %%mm6 \n\t"
3944 "psubw %%mm3, %%mm5 \n\t"
3945 // pa = abs(p-a) = abs(pav)
3946 // pb = abs(p-b) = abs(pbv)
3947 // pc = abs(p-c) = abs(pcv)
3948 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3949 "paddw %%mm5, %%mm6 \n\t"
3950 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3951 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3952 "psubw %%mm0, %%mm4 \n\t"
3953 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3954 "psubw %%mm0, %%mm4 \n\t"
3955 "psubw %%mm7, %%mm5 \n\t"
3956 "pxor %%mm0, %%mm0 \n\t"
3957 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3958 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3959 "psubw %%mm7, %%mm5 \n\t"
3960 "psubw %%mm0, %%mm6 \n\t"
3961 // test pa <= pb
3962 "movq %%mm4, %%mm7 \n\t"
3963 "psubw %%mm0, %%mm6 \n\t"
3964 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3965 "movq %%mm7, %%mm0 \n\t"
3966 // use mm7 mask to merge pa & pb
3967 "pand %%mm7, %%mm5 \n\t"
3968 // use mm0 mask copy to merge a & b
3969 "pand %%mm0, %%mm2 \n\t"
3970 "pandn %%mm4, %%mm7 \n\t"
3971 "pandn %%mm1, %%mm0 \n\t"
3972 "paddw %%mm5, %%mm7 \n\t"
3973 "paddw %%mm2, %%mm0 \n\t"
3974 // test ((pa <= pb)? pa:pb) <= pc
3975 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3976 "pxor %%mm1, %%mm1 \n\t"
3977 "pand %%mm7, %%mm3 \n\t"
3978 "pandn %%mm0, %%mm7 \n\t"
3979 "pxor %%mm1, %%mm1 \n\t"
3980 "paddw %%mm3, %%mm7 \n\t"
3981 "pxor %%mm0, %%mm0 \n\t"
3982 // step ecx to next set of 8 bytes and repeat loop til done
3983 "addl $8, %%ecx \n\t"
3984 "packuswb %%mm7, %%mm1 \n\t"
3985 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3986 "cmpl _MMXLength, %%ecx \n\t"
3987 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3988 // mm1 will be used as Raw(x-bpp) next loop
3989 "jb paeth_6lp \n\t"
3990
3991 : "=S" (dummy_value_S), // output regs (dummy)
3992 "=D" (dummy_value_D)
3993
3994 : "0" (prev_row), // esi // input regs
3995 "1" (row) // edi
3996
3997 : "%ecx" // clobber list
3998#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3999 , "%mm0", "%mm1", "%mm2", "%mm3"
4000 , "%mm4", "%mm5", "%mm6", "%mm7"
4001#endif
4002 );
4003 }
4004 break; // end 6 bpp
4005
4006 case 4:
4007 {
4008 _ActiveMask.use = 0x00000000ffffffffLL;
4009
4010 __asm__ __volatile__ (
4011 "movl _dif, %%ecx \n\t"
4012// preload "movl row, %%edi \n\t"
4013// preload "movl prev_row, %%esi \n\t"
4014 "pxor %%mm0, %%mm0 \n\t"
4015 // prime the pump: load the first Raw(x-bpp) data set
4016 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4017 // a=Raw(x-bpp) bytes
4018 "paeth_4lp: \n\t"
4019 // do first set of 4 bytes
4020 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4021 "punpckhbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4022 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4023 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4024 // pav = p - a = (a + b - c) - a = b - c
4025 "movq %%mm2, %%mm4 \n\t"
4026 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4027 // pbv = p - b = (a + b - c) - b = a - c
4028 "movq %%mm1, %%mm5 \n\t"
4029 "psubw %%mm3, %%mm4 \n\t"
4030 "pxor %%mm7, %%mm7 \n\t"
4031 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4032 "movq %%mm4, %%mm6 \n\t"
4033 "psubw %%mm3, %%mm5 \n\t"
4034 // pa = abs(p-a) = abs(pav)
4035 // pb = abs(p-b) = abs(pbv)
4036 // pc = abs(p-c) = abs(pcv)
4037 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4038 "paddw %%mm5, %%mm6 \n\t"
4039 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4040 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4041 "psubw %%mm0, %%mm4 \n\t"
4042 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4043 "psubw %%mm0, %%mm4 \n\t"
4044 "psubw %%mm7, %%mm5 \n\t"
4045 "pxor %%mm0, %%mm0 \n\t"
4046 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4047 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4048 "psubw %%mm7, %%mm5 \n\t"
4049 "psubw %%mm0, %%mm6 \n\t"
4050 // test pa <= pb
4051 "movq %%mm4, %%mm7 \n\t"
4052 "psubw %%mm0, %%mm6 \n\t"
4053 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4054 "movq %%mm7, %%mm0 \n\t"
4055 // use mm7 mask to merge pa & pb
4056 "pand %%mm7, %%mm5 \n\t"
4057 // use mm0 mask copy to merge a & b
4058 "pand %%mm0, %%mm2 \n\t"
4059 "pandn %%mm4, %%mm7 \n\t"
4060 "pandn %%mm1, %%mm0 \n\t"
4061 "paddw %%mm5, %%mm7 \n\t"
4062 "paddw %%mm2, %%mm0 \n\t"
4063 // test ((pa <= pb)? pa:pb) <= pc
4064 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4065 "pxor %%mm1, %%mm1 \n\t"
4066 "pand %%mm7, %%mm3 \n\t"
4067 "pandn %%mm0, %%mm7 \n\t"
4068 "paddw %%mm3, %%mm7 \n\t"
4069 "pxor %%mm0, %%mm0 \n\t"
4070 "packuswb %%mm1, %%mm7 \n\t"
4071 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
4072 "pand _ActiveMask, %%mm7 \n\t"
4073 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4074 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4075 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4076 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4077 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
4078 // do second set of 4 bytes
4079 "punpckhbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4080 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4081 // pav = p - a = (a + b - c) - a = b - c
4082 "movq %%mm2, %%mm4 \n\t"
4083 // pbv = p - b = (a + b - c) - b = a - c
4084 "movq %%mm1, %%mm5 \n\t"
4085 "psubw %%mm3, %%mm4 \n\t"
4086 "pxor %%mm7, %%mm7 \n\t"
4087 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4088 "movq %%mm4, %%mm6 \n\t"
4089 "psubw %%mm3, %%mm5 \n\t"
4090 // pa = abs(p-a) = abs(pav)
4091 // pb = abs(p-b) = abs(pbv)
4092 // pc = abs(p-c) = abs(pcv)
4093 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4094 "paddw %%mm5, %%mm6 \n\t"
4095 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4096 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4097 "psubw %%mm0, %%mm4 \n\t"
4098 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4099 "psubw %%mm0, %%mm4 \n\t"
4100 "psubw %%mm7, %%mm5 \n\t"
4101 "pxor %%mm0, %%mm0 \n\t"
4102 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4103 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4104 "psubw %%mm7, %%mm5 \n\t"
4105 "psubw %%mm0, %%mm6 \n\t"
4106 // test pa <= pb
4107 "movq %%mm4, %%mm7 \n\t"
4108 "psubw %%mm0, %%mm6 \n\t"
4109 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4110 "movq %%mm7, %%mm0 \n\t"
4111 // use mm7 mask to merge pa & pb
4112 "pand %%mm7, %%mm5 \n\t"
4113 // use mm0 mask copy to merge a & b
4114 "pand %%mm0, %%mm2 \n\t"
4115 "pandn %%mm4, %%mm7 \n\t"
4116 "pandn %%mm1, %%mm0 \n\t"
4117 "paddw %%mm5, %%mm7 \n\t"
4118 "paddw %%mm2, %%mm0 \n\t"
4119 // test ((pa <= pb)? pa:pb) <= pc
4120 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4121 "pxor %%mm1, %%mm1 \n\t"
4122 "pand %%mm7, %%mm3 \n\t"
4123 "pandn %%mm0, %%mm7 \n\t"
4124 "pxor %%mm1, %%mm1 \n\t"
4125 "paddw %%mm3, %%mm7 \n\t"
4126 "pxor %%mm0, %%mm0 \n\t"
4127 // step ecx to next set of 8 bytes and repeat loop til done
4128 "addl $8, %%ecx \n\t"
4129 "packuswb %%mm7, %%mm1 \n\t"
4130 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4131 "cmpl _MMXLength, %%ecx \n\t"
4132 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4133 // mm1 will be used as Raw(x-bpp) next loop
4134 "jb paeth_4lp \n\t"
4135
4136 : "=S" (dummy_value_S), // output regs (dummy)
4137 "=D" (dummy_value_D)
4138
4139 : "0" (prev_row), // esi // input regs
4140 "1" (row) // edi
4141
4142 : "%ecx" // clobber list
4143#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4144 , "%mm0", "%mm1", "%mm2", "%mm3"
4145 , "%mm4", "%mm5", "%mm6", "%mm7"
4146#endif
4147 );
4148 }
4149 break; // end 4 bpp
4150
4151 case 8: // bpp == 8
4152 {
4153 _ActiveMask.use = 0x00000000ffffffffLL;
4154
4155 __asm__ __volatile__ (
4156 "movl _dif, %%ecx \n\t"
4157// preload "movl row, %%edi \n\t"
4158// preload "movl prev_row, %%esi \n\t"
4159 "pxor %%mm0, %%mm0 \n\t"
4160 // prime the pump: load the first Raw(x-bpp) data set
4161 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4162 // a=Raw(x-bpp) bytes
4163 "paeth_8lp: \n\t"
4164 // do first set of 4 bytes
4165 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4166 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4167 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4168 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4169 // pav = p - a = (a + b - c) - a = b - c
4170 "movq %%mm2, %%mm4 \n\t"
4171 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4172 // pbv = p - b = (a + b - c) - b = a - c
4173 "movq %%mm1, %%mm5 \n\t"
4174 "psubw %%mm3, %%mm4 \n\t"
4175 "pxor %%mm7, %%mm7 \n\t"
4176 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4177 "movq %%mm4, %%mm6 \n\t"
4178 "psubw %%mm3, %%mm5 \n\t"
4179 // pa = abs(p-a) = abs(pav)
4180 // pb = abs(p-b) = abs(pbv)
4181 // pc = abs(p-c) = abs(pcv)
4182 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4183 "paddw %%mm5, %%mm6 \n\t"
4184 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4185 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4186 "psubw %%mm0, %%mm4 \n\t"
4187 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4188 "psubw %%mm0, %%mm4 \n\t"
4189 "psubw %%mm7, %%mm5 \n\t"
4190 "pxor %%mm0, %%mm0 \n\t"
4191 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4192 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4193 "psubw %%mm7, %%mm5 \n\t"
4194 "psubw %%mm0, %%mm6 \n\t"
4195 // test pa <= pb
4196 "movq %%mm4, %%mm7 \n\t"
4197 "psubw %%mm0, %%mm6 \n\t"
4198 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4199 "movq %%mm7, %%mm0 \n\t"
4200 // use mm7 mask to merge pa & pb
4201 "pand %%mm7, %%mm5 \n\t"
4202 // use mm0 mask copy to merge a & b
4203 "pand %%mm0, %%mm2 \n\t"
4204 "pandn %%mm4, %%mm7 \n\t"
4205 "pandn %%mm1, %%mm0 \n\t"
4206 "paddw %%mm5, %%mm7 \n\t"
4207 "paddw %%mm2, %%mm0 \n\t"
4208 // test ((pa <= pb)? pa:pb) <= pc
4209 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4210 "pxor %%mm1, %%mm1 \n\t"
4211 "pand %%mm7, %%mm3 \n\t"
4212 "pandn %%mm0, %%mm7 \n\t"
4213 "paddw %%mm3, %%mm7 \n\t"
4214 "pxor %%mm0, %%mm0 \n\t"
4215 "packuswb %%mm1, %%mm7 \n\t"
4216 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4217 "pand _ActiveMask, %%mm7 \n\t"
4218 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4219 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4220 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4221 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4222 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4223
4224 // do second set of 4 bytes
4225 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4226 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4227 // pav = p - a = (a + b - c) - a = b - c
4228 "movq %%mm2, %%mm4 \n\t"
4229 // pbv = p - b = (a + b - c) - b = a - c
4230 "movq %%mm1, %%mm5 \n\t"
4231 "psubw %%mm3, %%mm4 \n\t"
4232 "pxor %%mm7, %%mm7 \n\t"
4233 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4234 "movq %%mm4, %%mm6 \n\t"
4235 "psubw %%mm3, %%mm5 \n\t"
4236 // pa = abs(p-a) = abs(pav)
4237 // pb = abs(p-b) = abs(pbv)
4238 // pc = abs(p-c) = abs(pcv)
4239 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4240 "paddw %%mm5, %%mm6 \n\t"
4241 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4242 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4243 "psubw %%mm0, %%mm4 \n\t"
4244 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4245 "psubw %%mm0, %%mm4 \n\t"
4246 "psubw %%mm7, %%mm5 \n\t"
4247 "pxor %%mm0, %%mm0 \n\t"
4248 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4249 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4250 "psubw %%mm7, %%mm5 \n\t"
4251 "psubw %%mm0, %%mm6 \n\t"
4252 // test pa <= pb
4253 "movq %%mm4, %%mm7 \n\t"
4254 "psubw %%mm0, %%mm6 \n\t"
4255 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4256 "movq %%mm7, %%mm0 \n\t"
4257 // use mm7 mask to merge pa & pb
4258 "pand %%mm7, %%mm5 \n\t"
4259 // use mm0 mask copy to merge a & b
4260 "pand %%mm0, %%mm2 \n\t"
4261 "pandn %%mm4, %%mm7 \n\t"
4262 "pandn %%mm1, %%mm0 \n\t"
4263 "paddw %%mm5, %%mm7 \n\t"
4264 "paddw %%mm2, %%mm0 \n\t"
4265 // test ((pa <= pb)? pa:pb) <= pc
4266 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4267 "pxor %%mm1, %%mm1 \n\t"
4268 "pand %%mm7, %%mm3 \n\t"
4269 "pandn %%mm0, %%mm7 \n\t"
4270 "pxor %%mm1, %%mm1 \n\t"
4271 "paddw %%mm3, %%mm7 \n\t"
4272 "pxor %%mm0, %%mm0 \n\t"
4273 // step ecx to next set of 8 bytes and repeat loop til done
4274 "addl $8, %%ecx \n\t"
4275 "packuswb %%mm7, %%mm1 \n\t"
4276 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4277 "cmpl _MMXLength, %%ecx \n\t"
4278 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4279 // mm1 will be used as Raw(x-bpp) next loop
4280 "jb paeth_8lp \n\t"
4281
4282 : "=S" (dummy_value_S), // output regs (dummy)
4283 "=D" (dummy_value_D)
4284
4285 : "0" (prev_row), // esi // input regs
4286 "1" (row) // edi
4287
4288 : "%ecx" // clobber list
4289#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4290 , "%mm0", "%mm1", "%mm2", "%mm3"
4291 , "%mm4", "%mm5", "%mm6", "%mm7"
4292#endif
4293 );
4294 }
4295 break; // end 8 bpp
4296
4297 case 1: // bpp = 1
4298 case 2: // bpp = 2
4299 default: // bpp > 8
4300 {
4301 __asm__ __volatile__ (
4302#ifdef __PIC__
4303 "pushl %%ebx \n\t" // save Global Offset Table index
4304#endif
4305 "movl _dif, %%ebx \n\t"
4306 "cmpl _FullLength, %%ebx \n\t"
4307 "jnb paeth_dend \n\t"
4308
4309// preload "movl row, %%edi \n\t"
4310// preload "movl prev_row, %%esi \n\t"
4311 // do Paeth decode for remaining bytes
4312 "movl %%ebx, %%edx \n\t"
4313// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4314 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4315 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
4316
4317 "paeth_dlp: \n\t"
4318 "xorl %%eax, %%eax \n\t"
4319 // pav = p - a = (a + b - c) - a = b - c
4320 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4321 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4322 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4323 "movl %%eax, _patemp \n\t" // Save pav for later use
4324 "xorl %%eax, %%eax \n\t"
4325 // pbv = p - b = (a + b - c) - b = a - c
4326 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4327 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4328 "movl %%eax, %%ecx \n\t"
4329 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4330 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4331 // pc = abs(pcv)
4332 "testl $0x80000000, %%eax \n\t"
4333 "jz paeth_dpca \n\t"
4334 "negl %%eax \n\t" // reverse sign of neg values
4335
4336 "paeth_dpca: \n\t"
4337 "movl %%eax, _pctemp \n\t" // save pc for later use
4338 // pb = abs(pbv)
4339 "testl $0x80000000, %%ecx \n\t"
4340 "jz paeth_dpba \n\t"
4341 "negl %%ecx \n\t" // reverse sign of neg values
4342
4343 "paeth_dpba: \n\t"
4344 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4345 // pa = abs(pav)
4346 "movl _patemp, %%eax \n\t"
4347 "testl $0x80000000, %%eax \n\t"
4348 "jz paeth_dpaa \n\t"
4349 "negl %%eax \n\t" // reverse sign of neg values
4350
4351 "paeth_dpaa: \n\t"
4352 "movl %%eax, _patemp \n\t" // save pa for later use
4353 // test if pa <= pb
4354 "cmpl %%ecx, %%eax \n\t"
4355 "jna paeth_dabb \n\t"
4356 // pa > pb; now test if pb <= pc
4357 "cmpl _pctemp, %%ecx \n\t"
4358 "jna paeth_dbbc \n\t"
4359 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4360 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4361 "jmp paeth_dpaeth \n\t"
4362
4363 "paeth_dbbc: \n\t"
4364 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4365 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4366 "jmp paeth_dpaeth \n\t"
4367
4368 "paeth_dabb: \n\t"
4369 // pa <= pb; now test if pa <= pc
4370 "cmpl _pctemp, %%eax \n\t"
4371 "jna paeth_dabc \n\t"
4372 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4373 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4374 "jmp paeth_dpaeth \n\t"
4375
4376 "paeth_dabc: \n\t"
4377 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4378 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4379
4380 "paeth_dpaeth: \n\t"
4381 "incl %%ebx \n\t"
4382 "incl %%edx \n\t"
4383 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4384 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4385 "cmpl _FullLength, %%ebx \n\t"
4386 "jb paeth_dlp \n\t"
4387
4388 "paeth_dend: \n\t"
4389#ifdef __PIC__
4390 "popl %%ebx \n\t" // index to Global Offset Table
4391#endif
4392
4393 : "=c" (dummy_value_c), // output regs (dummy)
4394 "=S" (dummy_value_S),
4395 "=D" (dummy_value_D)
4396
4397 : "0" (bpp), // ecx // input regs
4398 "1" (prev_row), // esi
4399 "2" (row) // edi
4400
4401 : "%eax", "%edx" // clobber list
4402#ifndef __PIC__
4403 , "%ebx"
4404#endif
4405 );
4406 }
4407 return; // No need to go further with this one
4408
4409 } // end switch (bpp)
4410
4411 __asm__ __volatile__ (
4412 // MMX acceleration complete; now do clean-up
4413 // check if any remaining bytes left to decode
4414#ifdef __PIC__
4415 "pushl %%ebx \n\t" // save index to Global Offset Table
4416#endif
4417 "movl _MMXLength, %%ebx \n\t"
4418 "cmpl _FullLength, %%ebx \n\t"
4419 "jnb paeth_end \n\t"
4420//pre "movl row, %%edi \n\t"
4421//pre "movl prev_row, %%esi \n\t"
4422 // do Paeth decode for remaining bytes
4423 "movl %%ebx, %%edx \n\t"
4424//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4425 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4426 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4427
4428 "paeth_lp2: \n\t"
4429 "xorl %%eax, %%eax \n\t"
4430 // pav = p - a = (a + b - c) - a = b - c
4431 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4432 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4433 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4434 "movl %%eax, _patemp \n\t" // Save pav for later use
4435 "xorl %%eax, %%eax \n\t"
4436 // pbv = p - b = (a + b - c) - b = a - c
4437 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4438 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4439 "movl %%eax, %%ecx \n\t"
4440 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4441 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4442 // pc = abs(pcv)
4443 "testl $0x80000000, %%eax \n\t"
4444 "jz paeth_pca2 \n\t"
4445 "negl %%eax \n\t" // reverse sign of neg values
4446
4447 "paeth_pca2: \n\t"
4448 "movl %%eax, _pctemp \n\t" // save pc for later use
4449 // pb = abs(pbv)
4450 "testl $0x80000000, %%ecx \n\t"
4451 "jz paeth_pba2 \n\t"
4452 "negl %%ecx \n\t" // reverse sign of neg values
4453
4454 "paeth_pba2: \n\t"
4455 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4456 // pa = abs(pav)
4457 "movl _patemp, %%eax \n\t"
4458 "testl $0x80000000, %%eax \n\t"
4459 "jz paeth_paa2 \n\t"
4460 "negl %%eax \n\t" // reverse sign of neg values
4461
4462 "paeth_paa2: \n\t"
4463 "movl %%eax, _patemp \n\t" // save pa for later use
4464 // test if pa <= pb
4465 "cmpl %%ecx, %%eax \n\t"
4466 "jna paeth_abb2 \n\t"
4467 // pa > pb; now test if pb <= pc
4468 "cmpl _pctemp, %%ecx \n\t"
4469 "jna paeth_bbc2 \n\t"
4470 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4471 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4472 "jmp paeth_paeth2 \n\t"
4473
4474 "paeth_bbc2: \n\t"
4475 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4476 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4477 "jmp paeth_paeth2 \n\t"
4478
4479 "paeth_abb2: \n\t"
4480 // pa <= pb; now test if pa <= pc
4481 "cmpl _pctemp, %%eax \n\t"
4482 "jna paeth_abc2 \n\t"
4483 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4484 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4485 "jmp paeth_paeth2 \n\t"
4486
4487 "paeth_abc2: \n\t"
4488 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4489 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4490
4491 "paeth_paeth2: \n\t"
4492 "incl %%ebx \n\t"
4493 "incl %%edx \n\t"
4494 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4495 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4496 "cmpl _FullLength, %%ebx \n\t"
4497 "jb paeth_lp2 \n\t"
4498
4499 "paeth_end: \n\t"
4500 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
4501#ifdef __PIC__
4502 "popl %%ebx \n\t" // restore index to Global Offset Table
4503#endif
4504
4505 : "=c" (dummy_value_c), // output regs (dummy)
4506 "=S" (dummy_value_S),
4507 "=D" (dummy_value_D)
4508
4509 : "0" (bpp), // ecx // input regs
4510 "1" (prev_row), // esi
4511 "2" (row) // edi
4512
4513 : "%eax", "%edx" // clobber list (no input regs!)
4514#ifndef __PIC__
4515 , "%ebx"
4516#endif
4517 );
4518
4519} /* end png_read_filter_row_mmx_paeth() */
4520#endif
4521
4522
4523
4524
4525#ifdef PNG_THREAD_UNSAFE_OK
4526//===========================================================================//
4527// //
4528// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4529// //
4530//===========================================================================//
4531
4532// Optimized code for PNG Sub filter decoder
4533
4534static void /* PRIVATE */
4535png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4536{
4537 int bpp;
4538 int dummy_value_a;
4539 int dummy_value_D;
4540
4541 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4542 _FullLength = row_info->rowbytes - bpp; // number of bytes to filter
4543
4544 __asm__ __volatile__ (
4545//pre "movl row, %%edi \n\t"
4546 "movl %%edi, %%esi \n\t" // lp = row
4547//pre "movl bpp, %%eax \n\t"
4548 "addl %%eax, %%edi \n\t" // rp = row + bpp
4549//irr "xorl %%eax, %%eax \n\t"
4550 // get # of bytes to alignment
4551 "movl %%edi, _dif \n\t" // take start of row
4552 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4553 // alignment boundary
4554 "xorl %%ecx, %%ecx \n\t"
4555 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4556 "subl %%edi, _dif \n\t" // subtract from start ==> value
4557 "jz sub_go \n\t" // ecx at alignment
4558
4559 "sub_lp1: \n\t" // fix alignment
4560 "movb (%%esi,%%ecx,), %%al \n\t"
4561 "addb %%al, (%%edi,%%ecx,) \n\t"
4562 "incl %%ecx \n\t"
4563 "cmpl _dif, %%ecx \n\t"
4564 "jb sub_lp1 \n\t"
4565
4566 "sub_go: \n\t"
4567 "movl _FullLength, %%eax \n\t"
4568 "movl %%eax, %%edx \n\t"
4569 "subl %%ecx, %%edx \n\t" // subtract alignment fix
4570 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4571 "subl %%edx, %%eax \n\t" // drop over bytes from length
4572 "movl %%eax, _MMXLength \n\t"
4573
4574 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4575 "=D" (dummy_value_D) // 1
4576
4577 : "0" (bpp), // eax // input regs
4578 "1" (row) // edi
4579
4580 : "%esi", "%ecx", "%edx" // clobber list
4581
4582#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4583 , "%mm0", "%mm1", "%mm2", "%mm3"
4584 , "%mm4", "%mm5", "%mm6", "%mm7"
4585#endif
4586 );
4587
4588 // now do the math for the rest of the row
4589 switch (bpp)
4590 {
4591 case 3:
4592 {
4593 _ActiveMask.use = 0x0000ffffff000000LL;
4594 _ShiftBpp.use = 24; // == 3 * 8
4595 _ShiftRem.use = 40; // == 64 - 24
4596
4597 __asm__ __volatile__ (
4598// preload "movl row, %%edi \n\t"
4599 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4600 // active byte group
4601 "movl %%edi, %%esi \n\t" // lp = row
4602// preload "movl bpp, %%eax \n\t"
4603 "addl %%eax, %%edi \n\t" // rp = row + bpp
4604 "movq %%mm7, %%mm6 \n\t"
4605 "movl _dif, %%edx \n\t"
4606 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4607 // 3rd active byte group
4608 // prime the pump: load the first Raw(x-bpp) data set
4609 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4610
4611 "sub_3lp: \n\t" // shift data for adding first
4612 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4613 // shift clears inactive bytes)
4614 // add 1st active group
4615 "movq (%%edi,%%edx,), %%mm0 \n\t"
4616 "paddb %%mm1, %%mm0 \n\t"
4617
4618 // add 2nd active group
4619 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4620 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4621 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4622 "paddb %%mm1, %%mm0 \n\t"
4623
4624 // add 3rd active group
4625 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4626 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4627 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4628 "addl $8, %%edx \n\t"
4629 "paddb %%mm1, %%mm0 \n\t"
4630
4631 "cmpl _MMXLength, %%edx \n\t"
4632 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4633 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4634 "jb sub_3lp \n\t"
4635
4636 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4637 "=D" (dummy_value_D) // 1
4638
4639 : "0" (bpp), // eax // input regs
4640 "1" (row) // edi
4641
4642 : "%edx", "%esi" // clobber list
4643#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4644 , "%mm0", "%mm1", "%mm6", "%mm7"
4645#endif
4646 );
4647 }
4648 break;
4649
4650 case 1:
4651 {
4652 __asm__ __volatile__ (
4653 "movl _dif, %%edx \n\t"
4654// preload "movl row, %%edi \n\t"
4655 "cmpl _FullLength, %%edx \n\t"
4656 "jnb sub_1end \n\t"
4657 "movl %%edi, %%esi \n\t" // lp = row
4658 "xorl %%eax, %%eax \n\t"
4659// preload "movl bpp, %%eax \n\t"
4660 "addl %%eax, %%edi \n\t" // rp = row + bpp
4661
4662 "sub_1lp: \n\t"
4663 "movb (%%esi,%%edx,), %%al \n\t"
4664 "addb %%al, (%%edi,%%edx,) \n\t"
4665 "incl %%edx \n\t"
4666 "cmpl _FullLength, %%edx \n\t"
4667 "jb sub_1lp \n\t"
4668
4669 "sub_1end: \n\t"
4670
4671 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4672 "=D" (dummy_value_D) // 1
4673
4674 : "0" (bpp), // eax // input regs
4675 "1" (row) // edi
4676
4677 : "%edx", "%esi" // clobber list
4678 );
4679 }
4680 return;
4681
4682 case 6:
4683 case 4:
4684 //case 7: // GRR BOGUS
4685 //case 5: // GRR BOGUS
4686 {
4687 _ShiftBpp.use = bpp << 3;
4688 _ShiftRem.use = 64 - _ShiftBpp.use;
4689
4690 __asm__ __volatile__ (
4691// preload "movl row, %%edi \n\t"
4692 "movl _dif, %%edx \n\t"
4693 "movl %%edi, %%esi \n\t" // lp = row
4694// preload "movl bpp, %%eax \n\t"
4695 "addl %%eax, %%edi \n\t" // rp = row + bpp
4696
4697 // prime the pump: load the first Raw(x-bpp) data set
4698 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4699
4700 "sub_4lp: \n\t" // shift data for adding first
4701 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4702 // shift clears inactive bytes)
4703 "movq (%%edi,%%edx,), %%mm0 \n\t"
4704 "paddb %%mm1, %%mm0 \n\t"
4705
4706 // add 2nd active group
4707 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4708 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4709 "addl $8, %%edx \n\t"
4710 "paddb %%mm1, %%mm0 \n\t"
4711
4712 "cmpl _MMXLength, %%edx \n\t"
4713 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4714 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4715 "jb sub_4lp \n\t"
4716
4717 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4718 "=D" (dummy_value_D) // 1
4719
4720 : "0" (bpp), // eax // input regs
4721 "1" (row) // edi
4722
4723 : "%edx", "%esi" // clobber list
4724#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4725 , "%mm0", "%mm1"
4726#endif
4727 );
4728 }
4729 break;
4730
4731 case 2:
4732 {
4733 _ActiveMask.use = 0x00000000ffff0000LL;
4734 _ShiftBpp.use = 16; // == 2 * 8
4735 _ShiftRem.use = 48; // == 64 - 16
4736
4737 __asm__ __volatile__ (
4738 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4739 // active byte group
4740 "movl _dif, %%edx \n\t"
4741 "movq %%mm7, %%mm6 \n\t"
4742// preload "movl row, %%edi \n\t"
4743 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4744 // 3rd active byte group
4745 "movl %%edi, %%esi \n\t" // lp = row
4746 "movq %%mm6, %%mm5 \n\t"
4747// preload "movl bpp, %%eax \n\t"
4748 "addl %%eax, %%edi \n\t" // rp = row + bpp
4749 "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4750 // 4th active byte group
4751 // prime the pump: load the first Raw(x-bpp) data set
4752 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4753
4754 "sub_2lp: \n\t" // shift data for adding first
4755 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4756 // shift clears inactive bytes)
4757 // add 1st active group
4758 "movq (%%edi,%%edx,), %%mm0 \n\t"
4759 "paddb %%mm1, %%mm0 \n\t"
4760
4761 // add 2nd active group
4762 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4763 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4764 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4765 "paddb %%mm1, %%mm0 \n\t"
4766
4767 // add 3rd active group
4768 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4769 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4770 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4771 "paddb %%mm1, %%mm0 \n\t"
4772
4773 // add 4th active group
4774 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4775 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4776 "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4777 "addl $8, %%edx \n\t"
4778 "paddb %%mm1, %%mm0 \n\t"
4779 "cmpl _MMXLength, %%edx \n\t"
4780 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4781 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4782 "jb sub_2lp \n\t"
4783
4784 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4785 "=D" (dummy_value_D) // 1
4786
4787 : "0" (bpp), // eax // input regs
4788 "1" (row) // edi
4789
4790 : "%edx", "%esi" // clobber list
4791#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4792 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4793#endif
4794 );
4795 }
4796 break;
4797
4798 case 8:
4799 {
4800 __asm__ __volatile__ (
4801// preload "movl row, %%edi \n\t"
4802 "movl _dif, %%edx \n\t"
4803 "movl %%edi, %%esi \n\t" // lp = row
4804// preload "movl bpp, %%eax \n\t"
4805 "addl %%eax, %%edi \n\t" // rp = row + bpp
4806 "movl _MMXLength, %%ecx \n\t"
4807
4808 // prime the pump: load the first Raw(x-bpp) data set
4809 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4810 "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4811
4812 "sub_8lp: \n\t"
4813 "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4814 "paddb %%mm7, %%mm0 \n\t"
4815 "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4816 "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4817
4818 // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4819 // This will be repeated for each group of 8 bytes with the 8th
4820 // group being used as the Raw(x-bpp) for the 1st group of the
4821 // next loop.
4822
4823 "paddb %%mm0, %%mm1 \n\t"
4824 "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4825 "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4826 "paddb %%mm1, %%mm2 \n\t"
4827 "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4828 "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4829 "paddb %%mm2, %%mm3 \n\t"
4830 "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4831 "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4832 "paddb %%mm3, %%mm4 \n\t"
4833 "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4834 "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4835 "paddb %%mm4, %%mm5 \n\t"
4836 "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4837 "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4838 "paddb %%mm5, %%mm6 \n\t"
4839 "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4840 "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4841 "addl $64, %%edx \n\t"
4842 "paddb %%mm6, %%mm7 \n\t"
4843 "cmpl %%ecx, %%edx \n\t"
4844 "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4845 "jb sub_8lp \n\t"
4846
4847 "cmpl _MMXLength, %%edx \n\t"
4848 "jnb sub_8lt8 \n\t"
4849
4850 "sub_8lpA: \n\t"
4851 "movq (%%edi,%%edx,), %%mm0 \n\t"
4852 "addl $8, %%edx \n\t"
4853 "paddb %%mm7, %%mm0 \n\t"
4854 "cmpl _MMXLength, %%edx \n\t"
4855 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4856 "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4857 // to mm1 to be new Raw(x-bpp)
4858 // for next loop
4859 "jb sub_8lpA \n\t"
4860
4861 "sub_8lt8: \n\t"
4862
4863 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4864 "=D" (dummy_value_D) // 1
4865
4866 : "0" (bpp), // eax // input regs
4867 "1" (row) // edi
4868
4869 : "%ecx", "%edx", "%esi" // clobber list
4870#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4871 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4872#endif
4873 );
4874 }
4875 break;
4876
4877 default: // bpp greater than 8 bytes GRR BOGUS
4878 {
4879 __asm__ __volatile__ (
4880 "movl _dif, %%edx \n\t"
4881// preload "movl row, %%edi \n\t"
4882 "movl %%edi, %%esi \n\t" // lp = row
4883// preload "movl bpp, %%eax \n\t"
4884 "addl %%eax, %%edi \n\t" // rp = row + bpp
4885
4886 "sub_Alp: \n\t"
4887 "movq (%%edi,%%edx,), %%mm0 \n\t"
4888 "movq (%%esi,%%edx,), %%mm1 \n\t"
4889 "addl $8, %%edx \n\t"
4890 "paddb %%mm1, %%mm0 \n\t"
4891 "cmpl _MMXLength, %%edx \n\t"
4892 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4893 // -8 to offset addl edx
4894 "jb sub_Alp \n\t"
4895
4896 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4897 "=D" (dummy_value_D) // 1
4898
4899 : "0" (bpp), // eax // input regs
4900 "1" (row) // edi
4901
4902 : "%edx", "%esi" // clobber list
4903#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4904 , "%mm0", "%mm1"
4905#endif
4906 );
4907 }
4908 break;
4909
4910 } // end switch (bpp)
4911
4912 __asm__ __volatile__ (
4913 "movl _MMXLength, %%edx \n\t"
4914//pre "movl row, %%edi \n\t"
4915 "cmpl _FullLength, %%edx \n\t"
4916 "jnb sub_end \n\t"
4917
4918 "movl %%edi, %%esi \n\t" // lp = row
4919//pre "movl bpp, %%eax \n\t"
4920 "addl %%eax, %%edi \n\t" // rp = row + bpp
4921 "xorl %%eax, %%eax \n\t"
4922
4923 "sub_lp2: \n\t"
4924 "movb (%%esi,%%edx,), %%al \n\t"
4925 "addb %%al, (%%edi,%%edx,) \n\t"
4926 "incl %%edx \n\t"
4927 "cmpl _FullLength, %%edx \n\t"
4928 "jb sub_lp2 \n\t"
4929
4930 "sub_end: \n\t"
4931 "EMMS \n\t" // end MMX instructions
4932
4933 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4934 "=D" (dummy_value_D) // 1
4935
4936 : "0" (bpp), // eax // input regs
4937 "1" (row) // edi
4938
4939 : "%edx", "%esi" // clobber list
4940 );
4941
4942} // end of png_read_filter_row_mmx_sub()
4943#endif
4944
4945
4946
4947
4948//===========================================================================//
4949// //
4950// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4951// //
4952//===========================================================================//
4953
4954// Optimized code for PNG Up filter decoder
4955
// png_read_filter_row_mmx_up:
//    Undo the PNG "Up" filter for one row, in place, using MMX:
//       Raw(x) = Up(x) + Prior(x)   (byte arithmetic, implicitly mod 256),
//    i.e. row[i] += prev_row[i] for every byte of the row.
//
//    Structure of the asm:
//      1. byte loop (up_lp1) until %edi (row) reaches 8-byte alignment;
//      2. unrolled MMX loop (up_loop) handling 64 bytes per iteration;
//      3. 8-byte MMX cleanup loop (up_lpA) for remaining multiples of 8;
//      4. byte loop (up_lp2) for the final 0-7 bytes;
//      5. EMMS to hand the FP register stack back to the FPU.
//    NOTE(review): alignment is computed from `row` only, so the movq
//    loads from `prev_row` may be unaligned (legal for movq, just slower).
//    Under __PIC__, %ebx holds the GOT pointer, so it is saved/restored
//    with push/pop instead of being listed in the clobber list.
static void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
                           png_bytep prev_row)
{
   png_uint_32 len;
   int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
   int dummy_value_S;
   int dummy_value_D;

   len = row_info->rowbytes;              // number of bytes to filter

   __asm__ __volatile__ (
//pre "movl row, %%edi              \n\t"
      // get # of bytes to alignment
#ifdef __PIC__
      "pushl %%ebx                 \n\t"
#endif
      "movl %%edi, %%ecx           \n\t"
      "xorl %%ebx, %%ebx           \n\t"   // ebx is the running byte offset
      "addl $0x7, %%ecx            \n\t"
      "xorl %%eax, %%eax           \n\t"
      "andl $0xfffffff8, %%ecx     \n\t"   // ecx = row rounded up to 8-byte bound
//pre "movl prev_row, %%esi         \n\t"
      "subl %%edi, %%ecx           \n\t"   // ecx = bytes needed for alignment
      "jz up_go                    \n\t"

   "up_lp1:                        \n\t"   // fix alignment
      "movb (%%edi,%%ebx,), %%al   \n\t"
      "addb (%%esi,%%ebx,), %%al   \n\t"
      "incl %%ebx                  \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t"   // mov does not affect flags; -1 to
      "jb up_lp1                   \n\t"   // offset incl ebx

   "up_go:                         \n\t"
//pre "movl len, %%edx              \n\t"
      "movl %%edx, %%ecx           \n\t"
      "subl %%ebx, %%edx           \n\t"   // subtract alignment fix
      "andl $0x0000003f, %%edx     \n\t"   // calc bytes over mult of 64
      "subl %%edx, %%ecx           \n\t"   // drop over bytes from length

      // unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
   "up_loop:                       \n\t"
      "movq (%%esi,%%ebx,), %%mm1  \n\t"
      "movq (%%edi,%%ebx,), %%mm0  \n\t"
      "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0          \n\t"
      "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, (%%edi,%%ebx,)  \n\t"
      "paddb %%mm3, %%mm2          \n\t"
      "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
      "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4          \n\t"
      "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
      "paddb %%mm7, %%mm6          \n\t"
      "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
      "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
      "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
      "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0          \n\t"
      "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
      "paddb %%mm3, %%mm2          \n\t"
      "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
      "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4          \n\t"
      "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
      "addl $64, %%ebx             \n\t"
      "paddb %%mm7, %%mm6          \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movq %%mm6, -8(%%edi,%%ebx,) \n\t"  // (+56)movq does not affect flags;
      "jb up_loop                  \n\t"   // -8 to offset addl ebx

      "cmpl $0, %%edx              \n\t"   // test for bytes over mult of 64
      "jz up_end                   \n\t"

      "cmpl $8, %%edx              \n\t"   // test for less than 8 bytes
      "jb up_lt8                   \n\t"   //  [added by lcreeve at netins.net]

      "addl %%edx, %%ecx           \n\t"
      "andl $0x00000007, %%edx     \n\t"   // calc bytes over mult of 8
      "subl %%edx, %%ecx           \n\t"   // drop over bytes from length
      "jz up_lt8                   \n\t"

   "up_lpA:                        \n\t"   // use MMX regs to update 8 bytes sim.
      "movq (%%esi,%%ebx,), %%mm1  \n\t"
      "movq (%%edi,%%ebx,), %%mm0  \n\t"
      "addl $8, %%ebx              \n\t"
      "paddb %%mm1, %%mm0          \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movq %%mm0, -8(%%edi,%%ebx,) \n\t"  // movq does not affect flags; -8 to
      "jb up_lpA                   \n\t"   // offset add ebx
      "cmpl $0, %%edx              \n\t"   // test for bytes over mult of 8
      "jz up_end                   \n\t"

   "up_lt8:                        \n\t"
      "xorl %%eax, %%eax           \n\t"
      "addl %%edx, %%ecx           \n\t"   // move over byte count into counter

   "up_lp2:                        \n\t"   // use x86 regs for remaining bytes
      "movb (%%edi,%%ebx,), %%al   \n\t"
      "addb (%%esi,%%ebx,), %%al   \n\t"
      "incl %%ebx                  \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t"   // mov does not affect flags; -1 to
      "jb up_lp2                   \n\t"   // offset inc ebx

   "up_end:                        \n\t"
      "EMMS                        \n\t"   // conversion of filtered row complete
#ifdef __PIC__
      "popl %%ebx                  \n\t"
#endif

      : "=d" (dummy_value_d),   // 0      // output regs (dummy)
        "=S" (dummy_value_S),   // 1
        "=D" (dummy_value_D)    // 2

      : "0" (len),              // edx    // input regs
        "1" (prev_row),         // esi
        "2" (row)               // edi

      : "%eax", "%ecx"            // clobber list (no input regs!)
#ifndef __PIC__
      , "%ebx"
#endif

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
      , "%mm0", "%mm1", "%mm2", "%mm3"
      , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
   );

} // end of png_read_filter_row_mmx_up()
5096
5097#endif /* PNG_MMX_CODE_SUPPORTED */
5098
5099
5100
5101
5102/*===========================================================================*/
5103/* */
5104/* P N G _ R E A D _ F I L T E R _ R O W */
5105/* */
5106/*===========================================================================*/
5107
5108
5109/* Optimized png_read_filter_row routines */
5110
/* png_read_filter_row:
 *    Undo the row filter (None/Sub/Up/Avg/Paeth, per the PNG spec) on one
 *    decoded row, in place.  `row` points at the filtered row data,
 *    `prev_row` at the already-reconstructed previous row, and `filter` is
 *    the filter-type byte that preceded the row.  When
 *    PNG_MMX_CODE_SUPPORTED is defined (and, for non-1.0.x builds, the
 *    per-filter bit in png_ptr->asm_flags is set and the pixel-depth /
 *    rowbytes thresholds are met), the work is dispatched to the MMX
 *    implementations in this file; otherwise the portable C loops below
 *    are used.  An unrecognized filter value produces a warning and
 *    zeroes the first row byte.
 */
void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];   /* longest value written is "Paeth-MMX" (9 chars + NUL) */
#endif

#if defined(PNG_MMX_CODE_SUPPORTED)
/* GRR:  these are superseded by png_ptr->asm_flags: */
#define UseMMX_sub    1   // GRR:  converted 20000730
#define UseMMX_up     1   // GRR:  converted 20000729
#define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
#define UseMMX_paeth  1   // GRR:  converted 20000828

   if (_mmx_supported == 2) {
       /* this should have happened in png_init_mmx_flags() already */
#if !defined(PNG_1_0_X)
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }
#endif /* PNG_MMX_CODE_SUPPORTED */

#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
   /* build a short "filter-impl" tag purely for the debug trace below */
   switch (filter)
   {
      case 0: sprintf(filnm, "none");
         break;
      case 1: sprintf(filnm, "sub-%s",
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
#endif
#endif
"x86");
         break;
      case 2: sprintf(filnm, "up-%s",
#ifdef PNG_MMX_CODE_SUPPORTED
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
#endif
#endif
 "x86");
         break;
      case 3: sprintf(filnm, "avg-%s",
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
#endif
#endif
 "x86");
         break;
      case 4: sprintf(filnm, "Paeth-%s",
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
#endif
#endif
"x86");
         break;
      default: sprintf(filnm, "unknw");
         break;
   }
   png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
#endif /* PNG_DEBUG */

   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            /* C fallback:  Raw(x) = Sub(x) + Raw(x-bpp), mod 256.
             * The first bpp bytes are already raw (no byte to their left). */
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_sub */
         break;

      case PNG_FILTER_VALUE_UP:
#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            /* C fallback:  Raw(x) = Up(x) + Prior(x), mod 256. */
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_up */
         break;

      case PNG_FILTER_VALUE_AVG:
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            /* C fallback:  Raw(x) = Avg(x) + floor((Raw(x-bpp)+Prior(x))/2),
             * mod 256.  For the first bpp bytes Raw(x-bpp) is taken as 0,
             * so only Prior(x)/2 is added. */
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_avg */
         break;

      case PNG_FILTER_VALUE_PAETH:
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
#endif /* PNG_MMX_CODE_SUPPORTED */
         {
            /* C fallback:  Raw(x) = Paeth(x) + PaethPredictor(a, b, c),
             * mod 256, where a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp).
             * For the first bpp bytes a and c are 0, so the predictor
             * reduces to b (= Prior(x)). */
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;
               pc = a - c;

#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif

               /*
                  if (pa <= pb && pa <= pc)
                     p = a;
                  else if (pb <= pc)
                     p = b;
                  else
                     p = c;
                */

               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         } /* end !UseMMX_paeth */
         break;

      default:
         png_warning(png_ptr, "Ignoring bad row-filter type");
         *row=0;
         break;
   }
}
5356
5357#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
5358
5359
5360/*===========================================================================*/
5361/* */
5362/* P N G _ M M X _ S U P P O R T */
5363/* */
5364/*===========================================================================*/
5365
5366/* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5367 * (2) all instructions compile with gcc 2.7.2.3 and later
5368 * (3) the function is moved down here to prevent gcc from
5369 * inlining it in multiple places and then barfing be-
5370 * cause the ".NOT_SUPPORTED" label is multiply defined
5371 * [is there a way to signal that a *single* function should
5372 * not be inlined? is there a way to modify the label for
5373 * each inlined instance, e.g., by appending _1, _2, etc.?
5374 * maybe if don't use leading "." in label name? (nope...sigh)]
5375 */
5376
/* png_mmx_support:
 *    Runtime probe for MMX capability on x86.  First toggles the ID bit
 *    (bit 21) of EFLAGS to determine whether the CPUID instruction exists
 *    at all; if so, executes CPUID function 1 and tests the MMX feature
 *    bit (EDX bit 23).  Stores 1 or 0 into the file-global _mmx_supported
 *    and returns that value.  When PNG_MMX_CODE_SUPPORTED is not defined,
 *    simply records and returns 0.  %ebx/%ecx/%edx are saved and restored
 *    manually around CPUID rather than listed as clobbers (see trailing
 *    commented-out clobber list).
 */
int PNGAPI
png_mmx_support(void)
{
#if defined(PNG_MMX_CODE_SUPPORTED)
    int result;
    __asm__ __volatile__ (
        "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
        "pushl %%ecx          \n\t"  // so does ecx...
        "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
//      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
//      "pushf                \n\t"  // 16-bit pushf
        "pushfl               \n\t"  // save Eflag to stack
        "popl %%eax           \n\t"  // get Eflag from stack into eax
        "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
        "pushl %%eax          \n\t"  // save modified Eflag back to stack
//      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
//      "popf                 \n\t"  // 16-bit popf
        "popfl                \n\t"  // restore modified value to Eflag reg
        "pushfl               \n\t"  // save Eflag to stack
        "popl %%eax           \n\t"  // get Eflag from stack
        "pushl %%ecx          \n\t"  // save original Eflag to stack
        "popfl                \n\t"  // restore original Eflag
        "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
        "jz 0f                \n\t"  // if same, CPUID instr. is not supported

        "xorl %%eax, %%eax    \n\t"  // set eax to zero
//      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
        "cpuid                \n\t"  // get the CPU identification info
        "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
        "jl 0f                \n\t"  // if eax is zero, MMX is not supported

        "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
        "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
                                     // faster than the instruction "mov eax, 1"
        "cpuid                \n\t"  // get the CPU identification info again
        "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
        "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
        "jz 0f                \n\t"  // if MMX bit clear, MMX is not supported

        "movl $1, %%eax       \n\t"  // set return value to 1
        "jmp  1f              \n\t"  // DONE:  have MMX support

    "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
        "movl $0, %%eax       \n\t"  // set return value to 0
    "1:                       \n\t"  // .RETURN: target label for jump instructions
        "popl %%edx           \n\t"  // restore edx
        "popl %%ecx           \n\t"  // restore ecx
        "popl %%ebx           \n\t"  // restore ebx

//      "ret                  \n\t"  // DONE:  no MMX support
                                     // (fall through to standard C "ret")

        : "=a" (result)              // output list

        :                            // any variables used on input (none)

                                     // no clobber list
//      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
//      , "memory"                   // if write to a variable gcc thought was in a reg
//      , "cc"                       // "condition codes" (flag bits)
    );
    _mmx_supported = result;
#else
    _mmx_supported = 0;
#endif /* PNG_MMX_CODE_SUPPORTED */

    return _mmx_supported;
}
5446
5447
5448#endif /* PNG_USE_PNGGCCRD */