blob: c46e3c16d1e9478ba5ff59e7d1ae5c2ab39252b1 [file] [log] [blame]
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
4 *
Glenn Randers-Pehrson82ae3832001-04-20 10:32:10 -05005 * libpng 1.0.11rc1 - April 20, 2001
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05006 * For conditions of distribution and use, see copyright notice in png.h
Glenn Randers-Pehrsonbe9de0f2001-01-22 08:52:16 -06007 * Copyright (c) 1998-2001 Glenn Randers-Pehrson
Glenn Randers-Pehrsond4366722000-06-04 14:29:29 -05008 * Copyright (c) 1998, Intel Corporation
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05009 *
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -060012 * Debugging and cleanup by Greg Roelofs, 2000, 2001
13 *
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -050019 *
Glenn Randers-Pehrson5e5c1e12000-11-10 12:26:19 -060020 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
21 *
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -050022 */
23
24#define PNG_INTERNAL
25#include "png.h"
26
27#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
28
29static int mmx_supported=2;
30
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -050031
int PNGAPI
png_mmx_support(void)
{
   /* Probe the CPU for MMX support.  Returns 1 if MMX is available, else 0,
    * and caches the answer in the file-static "mmx_supported" (whose initial
    * value of 2 means "not yet tested").  MSVC x86 inline assembly only.
    */
   int mmx_supported_local = 0;
   _asm {
      push ebx          //CPUID will trash these
      push ecx
      push edx
      pushfd            //Save Eflag to stack
      pop eax           //Get Eflag from stack into eax
      mov ecx, eax      //Make another copy of Eflag in ecx
      xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
      push eax          //Save modified Eflag back to stack

      popfd             //Restore modified value back to Eflag reg
      pushfd            //Save Eflag to stack
      pop eax           //Get Eflag from stack
      xor eax, ecx      //Compare the new Eflag with the original Eflag
      jz NOT_SUPPORTED  //If the same, CPUID instruction is not supported,
                        //skip following instructions and jump to
                        //NOT_SUPPORTED label

      xor eax, eax      //Set eax to zero (CPUID leaf 0: max supported leaf)

      _asm _emit 0x0f   //CPUID instruction (two bytes opcode)
      _asm _emit 0xa2

      cmp eax, 1        //make sure eax returned a non-zero value
      jl NOT_SUPPORTED  //If eax is zero, leaf 1 (feature flags) unavailable

      xor eax, eax      //set eax to zero
      inc eax           //Now increment eax to 1.  This instruction is
                        //faster than the instruction "mov eax, 1"

      _asm _emit 0x0f   //CPUID instruction (leaf 1: feature flags in edx)
      _asm _emit 0xa2

      and edx, 0x00800000 //mask out all bits but the MMX bit [EDX bit(23)]
      cmp edx, 0          //zero means the MMX feature bit is clear
      jz NOT_SUPPORTED    //jump if MMX is not supported

      mov mmx_supported_local, 1 //set return value to 1

NOT_SUPPORTED:
      mov eax, mmx_supported_local //move return value to eax
      pop edx           //CPUID trashed these
      pop ecx
      pop ebx
   }

   //mmx_supported_local=0; // test code for force don't support MMX
   //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);

   mmx_supported = mmx_supported_local;
   return mmx_supported_local;
}
88
89/* Combines the row recently read in with the previous row.
90 This routine takes care of alpha and transparency if requested.
91 This routine also handles the two methods of progressive display
92 of interlaced images, depending on the mask value.
93 The mask value describes which pixels are to be combined with
94 the row. The pattern always repeats every 8 pixels, so just 8
95 bits are needed. A one indicates the pixel is to be combined; a
96 zero indicates the pixel is to be skipped. This is in addition
97 to any alpha or transparency value associated with the pixel. If
98 you want all pixels to be combined, pass 0xff (255) in mask. */
99
100/* Use this routine for x86 platform - uses faster MMX routine if machine
101 supports MMX */
102
void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
   /* Merge the newly-read row (png_ptr->row_buf + 1) into "row", keeping
    * only the pixels whose bit is set in the repeating 8-bit "mask"
    * (0xff == keep everything).  Byte-multiple depths use MMX when the CPU
    * supports it; sub-byte depths and the fallbacks are plain C.
    */
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_combine_row_asm\n");

   /* Probe the CPU once; mmx_supported == 2 means "not yet tested". */
   if (mmx_supported == 2) {
       png_mmx_support();
   }

   if (mask == 0xff)
   {
      /* Trivial mask: every pixel is kept, so copy the whole row at once. */
      png_memcpy(row, png_ptr->row_buf + 1,
       (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
   }
   /* GRR:  add "else if (mask == 0)" case?
    *       or does png_combine_row() not even get called in that case? */
   else
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         /* Sub-byte depths (1/2/4 bpp) are handled in plain C: walk the row
          * pixel by pixel and splice in only the masked pixels' bits. */
         case 1:
         {
            png_bytep sp;        /* source: freshly read row */
            png_bytep dp;        /* dest: accumulated output row */
            int s_inc, s_start, s_end;
            int m;               /* rotating copy of the 8-bit mask */
            int shift;           /* current bit position within the byte */
            png_uint_32 i;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               /* pixels packed LSB-first within each byte */
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }
            else
#endif
            {
               /* normal PNG packing: MSB-first within each byte */
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;

                  /* clear the destination bit, then OR in the source bit */
                  value = (*sp >> shift) & 0x1;
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               /* rotate the mask one pixel to the right (period 8) */
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 2:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  /* splice a 2-bit pixel into the destination byte */
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 4:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  /* splice a 4-bit pixel into the destination byte */
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         /* 8 bpp: one mask byte selects 8 pixels == 8 bytes, so a single
          * quadword byte-select (pand/pandn/por) handles 8 pixels at once. */
         case 8:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int m;                 /* set below but unused by the asm path */
            int diff, unmask;

            /* per-byte select pattern: bit i of the mask replicated into
             * byte i of the quadword */
            __int64 mask0=0x0102040810204080;

            if ( mmx_supported )
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               m = 0x80;
               unmask = ~mask;
               len  = png_ptr->width &~7;  //reduce to multiple of 8
               diff = png_ptr->width & 7;  //amount lost

               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0

                  pand       mm0,mm7       //nonzero if keep byte
                  pcmpeqb    mm0,mm6       //zeros->1s, v versa

                  mov        ecx,len       //load length of line (pixels)
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest
                  cmp        ecx,0         //lcr
                  je         mainloop8end

mainloop8:
                  movq       mm4,[esi]     //select src bytes where mask set,
                  pand       mm4,mm0       //dest bytes where mask clear
                  movq       mm6,mm0
                  pandn      mm6,[ebx]
                  por        mm4,mm6
                  movq       [ebx],mm4

                  add        esi,8         //inc by 8 bytes processed
                  add        ebx,8
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop8
mainloop8end:

                  mov        ecx,diff      //handle the trailing 0..7 pixels
                  cmp        ecx,0
                  jz         end8

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte

secondloop8:
                  sal        edx,1         //move high bit to CF
                  jnc        skip8         //if CF = 0
                  mov        al,[esi]
                  mov        [ebx],al
skip8:
                  inc        esi
                  inc        ebx

                  dec        ecx
                  jnz        secondloop8
end8:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               /* copy one pixel every "disp" pixels, starting at the
                * current interlace pass's first pixel */
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  /* clamp the final copy so it cannot run past the row */
                  if (pixel_bytes > (png_size_t)(final_val-i))
                     pixel_bytes = (png_size_t)(final_val-i);
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 8 bpp

         /* 16 bpp: 8 pixels == 16 bytes, so two quadword selects per
          * iteration; mask0/mask1 replicate each mask bit over 2 bytes. */
         case 16:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;
            __int64 mask1=0x0101020204040808,
                    mask0=0x1010202040408080;

            if ( mmx_supported )
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1

                  pand       mm0,mm7
                  pand       mm1,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6

                  mov        ecx,len       //load length of line
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest
                  cmp        ecx,0         //lcr
                  jz         mainloop16end

mainloop16:
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  movq       mm7,[ebx]
                  pandn      mm6,mm7
                  por        mm4,mm6
                  movq       [ebx],mm4

                  movq       mm5,[esi+8]
                  pand       mm5,mm1
                  movq       mm7,mm1
                  movq       mm6,[ebx+8]
                  pandn      mm7,mm6
                  por        mm5,mm7
                  movq       [ebx+8],mm5

                  add        esi,16        //inc by 16 bytes processed
                  add        ebx,16
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop16

mainloop16end:
                  mov        ecx,diff      //handle the trailing 0..7 pixels
                  cmp        ecx,0
                  jz         end16

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte
secondloop16:
                  sal        edx,1         //move high bit to CF
                  jnc        skip16        //if CF = 0
                  mov        ax,[esi]      //copy one 2-byte pixel
                  mov        [ebx],ax
skip16:
                  add        esi,2
                  add        ebx,2

                  dec        ecx
                  jnz        secondloop16
end16:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  if (pixel_bytes > (png_size_t)(final_val-i))
                     pixel_bytes = (png_size_t)(final_val-i);
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 16 bpp

         /* 24 bpp: 8 pixels == 24 bytes == three quadwords; mask0..mask2
          * replicate each mask bit over the 3 bytes of its pixel. */
         case 24:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask2=0x0101010202020404,  //24bpp
                    mask1=0x0408080810101020,
                    mask0=0x2020404040808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

            if ( mmx_supported )
            {
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1
                  movq       mm2,mask2

                  pand       mm0,mm7
                  pand       mm1,mm7
                  pand       mm2,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6
                  pcmpeqb    mm2,mm6

                  mov        ecx,len       //load length of line
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest
                  cmp        ecx,0
                  jz         mainloop24end

mainloop24:
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  movq       mm7,[ebx]
                  pandn      mm6,mm7
                  por        mm4,mm6
                  movq       [ebx],mm4


                  movq       mm5,[esi+8]
                  pand       mm5,mm1
                  movq       mm7,mm1
                  movq       mm6,[ebx+8]
                  pandn      mm7,mm6
                  por        mm5,mm7
                  movq       [ebx+8],mm5

                  movq       mm6,[esi+16]
                  pand       mm6,mm2
                  movq       mm4,mm2
                  movq       mm7,[ebx+16]
                  pandn      mm4,mm7
                  por        mm6,mm4
                  movq       [ebx+16],mm6

                  add        esi,24        //inc by 24 bytes processed
                  add        ebx,24
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop24

mainloop24end:
                  mov        ecx,diff      //handle the trailing 0..7 pixels
                  cmp        ecx,0
                  jz         end24

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte
secondloop24:
                  sal        edx,1         //move high bit to CF
                  jnc        skip24        //if CF = 0
                  mov        ax,[esi]      //copy a 3-byte pixel: 2 bytes...
                  mov        [ebx],ax
                  xor        eax,eax
                  mov        al,[esi+2]    //...then the third byte
                  mov        [ebx+2],al
skip24:
                  add        esi,3
                  add        ebx,3

                  dec        ecx
                  jnz        secondloop24

end24:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  if (pixel_bytes > (png_size_t)(final_val-i))
                     pixel_bytes = (png_size_t)(final_val-i);
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 24 bpp

         /* 32 bpp: 8 pixels == 32 bytes == four quadwords; mask0..mask3
          * replicate each mask bit over the 4 bytes of its pixel. */
         case 32:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask3=0x0101010102020202,  //32bpp
                    mask2=0x0404040408080808,
                    mask1=0x1010101020202020,
                    mask0=0x4040404080808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

            if ( mmx_supported )
            {
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1
                  movq       mm2,mask2
                  movq       mm3,mask3

                  pand       mm0,mm7
                  pand       mm1,mm7
                  pand       mm2,mm7
                  pand       mm3,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6
                  pcmpeqb    mm2,mm6
                  pcmpeqb    mm3,mm6

                  mov        ecx,len       //load length of line
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest

                  cmp        ecx,0         //lcr
                  jz         mainloop32end

mainloop32:
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  movq       mm7,[ebx]
                  pandn      mm6,mm7
                  por        mm4,mm6
                  movq       [ebx],mm4

                  movq       mm5,[esi+8]
                  pand       mm5,mm1
                  movq       mm7,mm1
                  movq       mm6,[ebx+8]
                  pandn      mm7,mm6
                  por        mm5,mm7
                  movq       [ebx+8],mm5

                  movq       mm6,[esi+16]
                  pand       mm6,mm2
                  movq       mm4,mm2
                  movq       mm7,[ebx+16]
                  pandn      mm4,mm7
                  por        mm6,mm4
                  movq       [ebx+16],mm6

                  movq       mm7,[esi+24]
                  pand       mm7,mm3
                  movq       mm5,mm3
                  movq       mm4,[ebx+24]
                  pandn      mm5,mm4
                  por        mm7,mm5
                  movq       [ebx+24],mm7

                  add        esi,32        //inc by 32 bytes processed
                  add        ebx,32
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop32

mainloop32end:
                  mov        ecx,diff      //handle the trailing 0..7 pixels
                  cmp        ecx,0
                  jz         end32

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte
secondloop32:
                  sal        edx,1         //move high bit to CF
                  jnc        skip32        //if CF = 0
                  mov        eax,[esi]     //copy one 4-byte pixel
                  mov        [ebx],eax
skip32:
                  add        esi,4
                  add        ebx,4

                  dec        ecx
                  jnz        secondloop32

end32:
                  emms
               }
            }
            else /* mmx _not supported - Use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  if (pixel_bytes > (png_size_t)(final_val-i))
                     pixel_bytes = (png_size_t)(final_val-i);
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 32 bpp

         /* 48 bpp: 8 pixels == 48 bytes == six quadwords; mask0..mask5
          * replicate each mask bit over the 6 bytes of its pixel. */
         case 48:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask5=0x0101010101010202,
                    mask4=0x0202020204040404,
                    mask3=0x0404080808080808,
                    mask2=0x1010101010102020,
                    mask1=0x2020202040404040,
                    mask0=0x4040808080808080;

            if ( mmx_supported )
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1
                  movq       mm2,mask2
                  movq       mm3,mask3
                  movq       mm4,mask4
                  movq       mm5,mask5

                  pand       mm0,mm7
                  pand       mm1,mm7
                  pand       mm2,mm7
                  pand       mm3,mm7
                  pand       mm4,mm7
                  pand       mm5,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6
                  pcmpeqb    mm2,mm6
                  pcmpeqb    mm3,mm6
                  pcmpeqb    mm4,mm6
                  pcmpeqb    mm5,mm6

                  mov        ecx,len       //load length of line
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest

                  cmp        ecx,0
                  jz         mainloop48end

mainloop48:
                  movq       mm7,[esi]
                  pand       mm7,mm0
                  movq       mm6,mm0
                  pandn      mm6,[ebx]
                  por        mm7,mm6
                  movq       [ebx],mm7

                  movq       mm6,[esi+8]
                  pand       mm6,mm1
                  movq       mm7,mm1
                  pandn      mm7,[ebx+8]
                  por        mm6,mm7
                  movq       [ebx+8],mm6

                  movq       mm6,[esi+16]
                  pand       mm6,mm2
                  movq       mm7,mm2
                  pandn      mm7,[ebx+16]
                  por        mm6,mm7
                  movq       [ebx+16],mm6

                  movq       mm7,[esi+24]
                  pand       mm7,mm3
                  movq       mm6,mm3
                  pandn      mm6,[ebx+24]
                  por        mm7,mm6
                  movq       [ebx+24],mm7

                  movq       mm6,[esi+32]
                  pand       mm6,mm4
                  movq       mm7,mm4
                  pandn      mm7,[ebx+32]
                  por        mm6,mm7
                  movq       [ebx+32],mm6

                  movq       mm7,[esi+40]
                  pand       mm7,mm5
                  movq       mm6,mm5
                  pandn      mm6,[ebx+40]
                  por        mm7,mm6
                  movq       [ebx+40],mm7

                  add        esi,48        //inc by 48 bytes processed
                  add        ebx,48
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop48
mainloop48end:

                  mov        ecx,diff      //handle the trailing 0..7 pixels
                  cmp        ecx,0
                  jz         end48

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte

                  //NOTE(review): this cleanup loop copies and advances only
                  //4 bytes per iteration, but 48-bpp pixels are 6 bytes --
                  //looks wrong for the trailing pixels; confirm upstream.
secondloop48:
                  sal        edx,1         //move high bit to CF
                  jnc        skip48        //if CF = 0
                  mov        eax,[esi]
                  mov        [ebx],eax
skip48:
                  add        esi,4
                  add        ebx,4

                  dec        ecx
                  jnz        secondloop48

end48:
                  emms
               }
            }
            else /* mmx _not supported - Use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  if (pixel_bytes > (png_size_t)(final_val-i))
                     pixel_bytes = (png_size_t)(final_val-i);
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 48 bpp

         /* Any other depth (e.g. 64 bpp): generic interlace-aware C copy. */
         default:
         {
            png_bytep sptr;
            png_bytep dp;
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
            unsigned int i;
            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
            register unsigned int incr1, initial_val, final_val;

            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
               pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
            {
               /* clamp the final copy so it cannot run past the row */
               if (pixel_bytes > (png_size_t)(final_val-i))
                  pixel_bytes = (png_size_t)(final_val-i);
               png_memcpy(dp, sptr, pixel_bytes);
               sptr += incr1;
               dp += incr1;
            }
            break;
         }
      } /* end switch (png_ptr->row_info.pixel_depth) */
   } /* end if (non-trivial mask) */

} /* end png_combine_row() */
972
973
974#if defined(PNG_READ_INTERLACING_SUPPORTED)
975
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -0500976void /* PRIVATE */
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -0600977png_do_read_interlace(png_structp png_ptr)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500978{
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -0600979 png_row_infop row_info = &(png_ptr->row_info);
980 png_bytep row = png_ptr->row_buf + 1;
981 int pass = png_ptr->pass;
982 png_uint_32 transformations = png_ptr->transformations;
Glenn Randers-Pehrson074af5e1999-11-28 23:32:18 -0600983#ifdef PNG_USE_LOCAL_ARRAYS
Glenn Randers-Pehrson5379b241999-11-27 10:22:33 -0600984 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
Glenn Randers-Pehrson074af5e1999-11-28 23:32:18 -0600985#endif
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500986
987 png_debug(1,"in png_do_read_interlace\n");
988
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -0600989 if (mmx_supported == 2) {
990 png_mmx_support();
991 }
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500992
993 if (row != NULL && row_info != NULL)
994 {
995 png_uint_32 final_width;
996
997 final_width = row_info->width * png_pass_inc[pass];
998
999 switch (row_info->pixel_depth)
1000 {
1001 case 1:
1002 {
1003 png_bytep sp, dp;
1004 int sshift, dshift;
1005 int s_start, s_end, s_inc;
1006 png_byte v;
1007 png_uint_32 i;
1008 int j;
1009
1010 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1011 dp = row + (png_size_t)((final_width - 1) >> 3);
1012#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1013 if (transformations & PNG_PACKSWAP)
1014 {
1015 sshift = (int)((row_info->width + 7) & 7);
1016 dshift = (int)((final_width + 7) & 7);
1017 s_start = 7;
1018 s_end = 0;
1019 s_inc = -1;
1020 }
1021 else
1022#endif
1023 {
1024 sshift = 7 - (int)((row_info->width + 7) & 7);
1025 dshift = 7 - (int)((final_width + 7) & 7);
1026 s_start = 0;
1027 s_end = 7;
1028 s_inc = 1;
1029 }
1030
1031 for (i = row_info->width; i; i--)
1032 {
1033 v = (png_byte)((*sp >> sshift) & 0x1);
1034 for (j = 0; j < png_pass_inc[pass]; j++)
1035 {
1036 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1037 *dp |= (png_byte)(v << dshift);
1038 if (dshift == s_end)
1039 {
1040 dshift = s_start;
1041 dp--;
1042 }
1043 else
1044 dshift += s_inc;
1045 }
1046 if (sshift == s_end)
1047 {
1048 sshift = s_start;
1049 sp--;
1050 }
1051 else
1052 sshift += s_inc;
1053 }
1054 break;
1055 }
1056
1057 case 2:
1058 {
1059 png_bytep sp, dp;
1060 int sshift, dshift;
1061 int s_start, s_end, s_inc;
1062 png_uint_32 i;
1063
1064 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1065 dp = row + (png_size_t)((final_width - 1) >> 2);
1066#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1067 if (transformations & PNG_PACKSWAP)
1068 {
1069 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1070 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1071 s_start = 6;
1072 s_end = 0;
1073 s_inc = -2;
1074 }
1075 else
1076#endif
1077 {
1078 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1079 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1080 s_start = 0;
1081 s_end = 6;
1082 s_inc = 2;
1083 }
1084
1085 for (i = row_info->width; i; i--)
1086 {
1087 png_byte v;
1088 int j;
1089
1090 v = (png_byte)((*sp >> sshift) & 0x3);
1091 for (j = 0; j < png_pass_inc[pass]; j++)
1092 {
1093 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1094 *dp |= (png_byte)(v << dshift);
1095 if (dshift == s_end)
1096 {
1097 dshift = s_start;
1098 dp--;
1099 }
1100 else
1101 dshift += s_inc;
1102 }
1103 if (sshift == s_end)
1104 {
1105 sshift = s_start;
1106 sp--;
1107 }
1108 else
1109 sshift += s_inc;
1110 }
1111 break;
1112 }
1113
1114 case 4:
1115 {
1116 png_bytep sp, dp;
1117 int sshift, dshift;
1118 int s_start, s_end, s_inc;
1119 png_uint_32 i;
1120
1121 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1122 dp = row + (png_size_t)((final_width - 1) >> 1);
1123#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1124 if (transformations & PNG_PACKSWAP)
1125 {
1126 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1127 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1128 s_start = 4;
1129 s_end = 0;
1130 s_inc = -4;
1131 }
1132 else
1133#endif
1134 {
1135 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1136 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1137 s_start = 0;
1138 s_end = 4;
1139 s_inc = 4;
1140 }
1141
1142 for (i = row_info->width; i; i--)
1143 {
1144 png_byte v;
1145 int j;
1146
1147 v = (png_byte)((*sp >> sshift) & 0xf);
1148 for (j = 0; j < png_pass_inc[pass]; j++)
1149 {
1150 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1151 *dp |= (png_byte)(v << dshift);
1152 if (dshift == s_end)
1153 {
1154 dshift = s_start;
1155 dp--;
1156 }
1157 else
1158 dshift += s_inc;
1159 }
1160 if (sshift == s_end)
1161 {
1162 sshift = s_start;
1163 sp--;
1164 }
1165 else
1166 sshift += s_inc;
1167 }
1168 break;
1169 }
1170
1171 default: // This is the place where the routine is modified
1172 {
1173 __int64 const4 = 0x0000000000FFFFFF;
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001174 // __int64 const5 = 0x000000FFFFFF0000; // unused...
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001175 __int64 const6 = 0x00000000000000FF;
1176 png_bytep sptr, dp;
1177 png_uint_32 i;
1178 png_size_t pixel_bytes;
1179 int width = row_info->width;
1180
1181 pixel_bytes = (row_info->pixel_depth >> 3);
1182
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001183 sptr = row + (width - 1) * pixel_bytes;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001184 dp = row + (final_width - 1) * pixel_bytes;
1185 // New code by Nirav Chhatrapati - Intel Corporation
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001186 // sign fix by GRR
1187 // NOTE: there is NO MMX code for 48-bit and 64-bit images
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001188
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06001189 // use MMX routine if machine supports it
1190 if ( mmx_supported )
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001191 {
1192 if (pixel_bytes == 3)
1193 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001194 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001195 {
1196 _asm
1197 {
1198 mov esi, sptr
1199 mov edi, dp
1200 mov ecx, width
1201 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1202loop_pass0:
1203 movd mm0, [esi] ; X X X X X v2 v1 v0
1204 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1205 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1206 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1207 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1208 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1209 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1210 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1211 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1212 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1213 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1214 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1215 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1216 movq [edi+16] , mm4
1217 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1218 movq [edi+8] , mm3
1219 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1220 sub esi, 3
1221 movq [edi], mm0
1222 sub edi, 24
1223 //sub esi, 3
1224 dec ecx
1225 jnz loop_pass0
1226 EMMS
1227 }
1228 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001229 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001230 {
1231 _asm
1232 {
1233 mov esi, sptr
1234 mov edi, dp
1235 mov ecx, width
1236 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1237loop_pass2:
1238 movd mm0, [esi] ; X X X X X v2 v1 v0
1239 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1240 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1241 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1242 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1243 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1244 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1245 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1246 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1247 movq [edi+4], mm0 ; move to memory
1248 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1249 movd [edi], mm0 ; move to memory
1250 sub esi, 3
1251 sub edi, 12
1252 dec ecx
1253 jnz loop_pass2
1254 EMMS
1255 }
1256 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001257 else if (width) /* && ((pass == 4) || (pass == 5)) */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001258 {
1259 int width_mmx = ((width >> 1) << 1) - 8;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001260 if (width_mmx < 0)
1261 width_mmx = 0;
1262 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001263 if (width_mmx)
1264 {
1265 _asm
1266 {
1267 mov esi, sptr
1268 mov edi, dp
1269 mov ecx, width_mmx
1270 sub esi, 3
1271 sub edi, 9
1272loop_pass4:
1273 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1274 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1275 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1276 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1277 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1278 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1279 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1280 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1281 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1282 movq [edi], mm0 ; move quad to memory
1283 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1284 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1285 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1286 movd [edi+8], mm6 ; move double to memory
1287 sub esi, 6
1288 sub edi, 12
1289 sub ecx, 2
1290 jnz loop_pass4
1291 EMMS
1292 }
1293 }
1294
1295 sptr -= width_mmx*3;
1296 dp -= width_mmx*6;
1297 for (i = width; i; i--)
1298 {
1299 png_byte v[8];
1300 int j;
1301
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001302 png_memcpy(v, sptr, 3);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001303 for (j = 0; j < png_pass_inc[pass]; j++)
1304 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001305 png_memcpy(dp, v, 3);
1306 dp -= 3;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001307 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001308 sptr -= 3;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001309 }
1310 }
1311 } /* end of pixel_bytes == 3 */
1312
1313 else if (pixel_bytes == 1)
1314 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001315 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001316 {
1317 int width_mmx = ((width >> 2) << 2);
1318 width -= width_mmx;
1319 if (width_mmx)
1320 {
1321 _asm
1322 {
1323 mov esi, sptr
1324 mov edi, dp
1325 mov ecx, width_mmx
1326 sub edi, 31
1327 sub esi, 3
1328loop1_pass0:
1329 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1330 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1331 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1332 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1333 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1334 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1335 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1336 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1337 movq [edi], mm0 ; move to memory v3
1338 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1339 movq [edi+8], mm3 ; move to memory v2
1340 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1341 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1342 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1343 movq [edi+16], mm2 ; move to memory v1
1344 movq [edi+24], mm4 ; move to memory v0
1345 sub esi, 4
1346 sub edi, 32
1347 sub ecx, 4
1348 jnz loop1_pass0
1349 EMMS
1350 }
1351 }
1352
1353 sptr -= width_mmx;
1354 dp -= width_mmx*8;
1355 for (i = width; i; i--)
1356 {
1357 int j;
1358
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001359 /* I simplified this part in version 1.0.4e
1360 * here and in several other instances where
1361 * pixel_bytes == 1 -- GR-P
1362 *
1363 * Original code:
1364 *
1365 * png_byte v[8];
1366 * png_memcpy(v, sptr, pixel_bytes);
1367 * for (j = 0; j < png_pass_inc[pass]; j++)
1368 * {
1369 * png_memcpy(dp, v, pixel_bytes);
1370 * dp -= pixel_bytes;
1371 * }
1372 * sptr -= pixel_bytes;
1373 *
1374 * Replacement code is in the next three lines:
1375 */
1376
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001377 for (j = 0; j < png_pass_inc[pass]; j++)
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001378 *dp-- = *sptr;
1379 sptr--;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001380 }
1381 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001382 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001383 {
1384 int width_mmx = ((width >> 2) << 2);
1385 width -= width_mmx;
1386 if (width_mmx)
1387 {
1388 _asm
1389 {
1390 mov esi, sptr
1391 mov edi, dp
1392 mov ecx, width_mmx
1393 sub edi, 15
1394 sub esi, 3
1395loop1_pass2:
1396 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1397 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1398 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1399 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1400 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1401 movq [edi], mm0 ; move to memory v2 and v3
1402 sub esi, 4
1403 movq [edi+8], mm1 ; move to memory v1 and v0
1404 sub edi, 16
1405 sub ecx, 4
1406 jnz loop1_pass2
1407 EMMS
1408 }
1409 }
1410
1411 sptr -= width_mmx;
1412 dp -= width_mmx*4;
1413 for (i = width; i; i--)
1414 {
1415 int j;
1416
1417 for (j = 0; j < png_pass_inc[pass]; j++)
1418 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001419 *dp-- = *sptr;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001420 }
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001421 sptr --;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001422 }
1423 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001424 else if (width) /* && ((pass == 4) || (pass == 5))) */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001425 {
1426 int width_mmx = ((width >> 3) << 3);
1427 width -= width_mmx;
1428 if (width_mmx)
1429 {
1430 _asm
1431 {
1432 mov esi, sptr
1433 mov edi, dp
1434 mov ecx, width_mmx
1435 sub edi, 15
1436 sub esi, 7
1437loop1_pass4:
1438 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1439 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1440 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1441 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1442 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1443 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1444 sub esi, 8
1445 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1446 //sub esi, 4
1447 sub edi, 16
1448 sub ecx, 8
1449 jnz loop1_pass4
1450 EMMS
1451 }
1452 }
1453
1454 sptr -= width_mmx;
1455 dp -= width_mmx*2;
1456 for (i = width; i; i--)
1457 {
1458 int j;
1459
1460 for (j = 0; j < png_pass_inc[pass]; j++)
1461 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001462 *dp-- = *sptr;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001463 }
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001464 sptr --;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001465 }
1466 }
1467 } /* end of pixel_bytes == 1 */
1468
1469 else if (pixel_bytes == 2)
1470 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001471 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001472 {
1473 int width_mmx = ((width >> 1) << 1);
1474 width -= width_mmx;
1475 if (width_mmx)
1476 {
1477 _asm
1478 {
1479 mov esi, sptr
1480 mov edi, dp
1481 mov ecx, width_mmx
1482 sub esi, 2
1483 sub edi, 30
1484loop2_pass0:
1485 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1486 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1487 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1488 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1489 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1490 movq [edi], mm0
1491 movq [edi + 8], mm0
1492 movq [edi + 16], mm1
1493 movq [edi + 24], mm1
1494 sub esi, 4
1495 sub edi, 32
1496 sub ecx, 2
1497 jnz loop2_pass0
1498 EMMS
1499 }
1500 }
1501
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001502 sptr -= (width_mmx*2 - 2); // sign fixed
1503 dp -= (width_mmx*16 - 2); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001504 for (i = width; i; i--)
1505 {
1506 png_byte v[8];
1507 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001508 sptr -= 2;
1509 png_memcpy(v, sptr, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001510 for (j = 0; j < png_pass_inc[pass]; j++)
1511 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001512 dp -= 2;
1513 png_memcpy(dp, v, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001514 }
1515 }
1516 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001517 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001518 {
1519 int width_mmx = ((width >> 1) << 1) ;
1520 width -= width_mmx;
1521 if (width_mmx)
1522 {
1523 _asm
1524 {
1525 mov esi, sptr
1526 mov edi, dp
1527 mov ecx, width_mmx
1528 sub esi, 2
1529 sub edi, 14
1530loop2_pass2:
1531 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1532 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1533 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1534 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1535 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1536 movq [edi], mm0
1537 sub esi, 4
1538 movq [edi + 8], mm1
1539 //sub esi, 4
1540 sub edi, 16
1541 sub ecx, 2
1542 jnz loop2_pass2
1543 EMMS
1544 }
1545 }
1546
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001547 sptr -= (width_mmx*2 - 2); // sign fixed
1548 dp -= (width_mmx*8 - 2); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001549 for (i = width; i; i--)
1550 {
1551 png_byte v[8];
1552 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001553 sptr -= 2;
1554 png_memcpy(v, sptr, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001555 for (j = 0; j < png_pass_inc[pass]; j++)
1556 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001557 dp -= 2;
1558 png_memcpy(dp, v, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001559 }
1560 }
1561 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001562 else if (width) // pass == 4 or 5
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001563 {
1564 int width_mmx = ((width >> 1) << 1) ;
1565 width -= width_mmx;
1566 if (width_mmx)
1567 {
1568 _asm
1569 {
1570 mov esi, sptr
1571 mov edi, dp
1572 mov ecx, width_mmx
1573 sub esi, 2
1574 sub edi, 6
1575loop2_pass4:
1576 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1577 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1578 sub esi, 4
1579 movq [edi], mm0
1580 sub edi, 8
1581 sub ecx, 2
1582 jnz loop2_pass4
1583 EMMS
1584 }
1585 }
1586
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001587 sptr -= (width_mmx*2 - 2); // sign fixed
1588 dp -= (width_mmx*4 - 2); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001589 for (i = width; i; i--)
1590 {
1591 png_byte v[8];
1592 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001593 sptr -= 2;
1594 png_memcpy(v, sptr, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001595 for (j = 0; j < png_pass_inc[pass]; j++)
1596 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001597 dp -= 2;
1598 png_memcpy(dp, v, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001599 }
1600 }
1601 }
1602 } /* end of pixel_bytes == 2 */
1603
1604 else if (pixel_bytes == 4)
1605 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001606 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001607 {
1608 int width_mmx = ((width >> 1) << 1) ;
1609 width -= width_mmx;
1610 if (width_mmx)
1611 {
1612 _asm
1613 {
1614 mov esi, sptr
1615 mov edi, dp
1616 mov ecx, width_mmx
1617 sub esi, 4
1618 sub edi, 60
1619loop4_pass0:
1620 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1621 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1622 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1623 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1624 movq [edi], mm0
1625 movq [edi + 8], mm0
1626 movq [edi + 16], mm0
1627 movq [edi + 24], mm0
1628 movq [edi+32], mm1
1629 movq [edi + 40], mm1
1630 movq [edi+ 48], mm1
1631 sub esi, 8
1632 movq [edi + 56], mm1
1633 sub edi, 64
1634 sub ecx, 2
1635 jnz loop4_pass0
1636 EMMS
1637 }
1638 }
1639
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001640 sptr -= (width_mmx*4 - 4); // sign fixed
1641 dp -= (width_mmx*32 - 4); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001642 for (i = width; i; i--)
1643 {
1644 png_byte v[8];
1645 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001646 sptr -= 4;
1647 png_memcpy(v, sptr, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001648 for (j = 0; j < png_pass_inc[pass]; j++)
1649 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001650 dp -= 4;
1651 png_memcpy(dp, v, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001652 }
1653 }
1654 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001655 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001656 {
1657 int width_mmx = ((width >> 1) << 1) ;
1658 width -= width_mmx;
1659 if (width_mmx)
1660 {
1661 _asm
1662 {
1663 mov esi, sptr
1664 mov edi, dp
1665 mov ecx, width_mmx
1666 sub esi, 4
1667 sub edi, 28
1668loop4_pass2:
1669 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1670 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1671 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1672 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1673 movq [edi], mm0
1674 movq [edi + 8], mm0
1675 movq [edi+16], mm1
1676 movq [edi + 24], mm1
1677 sub esi, 8
1678 sub edi, 32
1679 sub ecx, 2
1680 jnz loop4_pass2
1681 EMMS
1682 }
1683 }
1684
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001685 sptr -= (width_mmx*4 - 4); // sign fixed
1686 dp -= (width_mmx*16 - 4); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001687 for (i = width; i; i--)
1688 {
1689 png_byte v[8];
1690 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001691 sptr -= 4;
1692 png_memcpy(v, sptr, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001693 for (j = 0; j < png_pass_inc[pass]; j++)
1694 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001695 dp -= 4;
1696 png_memcpy(dp, v, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001697 }
1698 }
1699 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001700 else if (width) // pass == 4 or 5
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001701 {
1702 int width_mmx = ((width >> 1) << 1) ;
1703 width -= width_mmx;
1704 if (width_mmx)
1705 {
1706 _asm
1707 {
1708 mov esi, sptr
1709 mov edi, dp
1710 mov ecx, width_mmx
1711 sub esi, 4
1712 sub edi, 12
1713loop4_pass4:
1714 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1715 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1716 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1717 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1718 movq [edi], mm0
1719 sub esi, 8
1720 movq [edi + 8], mm1
1721 sub edi, 16
1722 sub ecx, 2
1723 jnz loop4_pass4
1724 EMMS
1725 }
1726 }
1727
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001728 sptr -= (width_mmx*4 - 4); // sign fixed
1729 dp -= (width_mmx*8 - 4); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001730 for (i = width; i; i--)
1731 {
1732 png_byte v[8];
1733 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001734 sptr -= 4;
1735 png_memcpy(v, sptr, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001736 for (j = 0; j < png_pass_inc[pass]; j++)
1737 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001738 dp -= 4;
1739 png_memcpy(dp, v, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001740 }
1741 }
1742 }
1743
1744 } /* end of pixel_bytes == 4 */
1745
1746 else if (pixel_bytes == 6)
1747 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001748 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001749 {
1750 png_byte v[8];
1751 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001752 png_memcpy(v, sptr, 6);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001753 for (j = 0; j < png_pass_inc[pass]; j++)
1754 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001755 png_memcpy(dp, v, 6);
1756 dp -= 6;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001757 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001758 sptr -= 6;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001759 }
1760 } /* end of pixel_bytes == 6 */
1761
1762 else
1763 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001764 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001765 {
1766 png_byte v[8];
1767 int j;
1768 png_memcpy(v, sptr, pixel_bytes);
1769 for (j = 0; j < png_pass_inc[pass]; j++)
1770 {
1771 png_memcpy(dp, v, pixel_bytes);
1772 dp -= pixel_bytes;
1773 }
1774 sptr-= pixel_bytes;
1775 }
1776 }
1777 } /* end of mmx_supported */
1778
1779 else /* MMX not supported: use modified C code - takes advantage
Glenn Randers-Pehrsone1644472001-03-23 05:06:56 -06001780 * of inlining of png_memcpy for a constant */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001781 {
1782 if (pixel_bytes == 1)
1783 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001784 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001785 {
1786 int j;
1787 for (j = 0; j < png_pass_inc[pass]; j++)
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001788 *dp-- = *sptr;
1789 sptr--;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001790 }
1791 }
1792 else if (pixel_bytes == 3)
1793 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001794 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001795 {
1796 png_byte v[8];
1797 int j;
1798 png_memcpy(v, sptr, pixel_bytes);
1799 for (j = 0; j < png_pass_inc[pass]; j++)
1800 {
1801 png_memcpy(dp, v, pixel_bytes);
1802 dp -= pixel_bytes;
1803 }
1804 sptr -= pixel_bytes;
1805 }
1806 }
1807 else if (pixel_bytes == 2)
1808 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001809 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001810 {
1811 png_byte v[8];
1812 int j;
1813 png_memcpy(v, sptr, pixel_bytes);
1814 for (j = 0; j < png_pass_inc[pass]; j++)
1815 {
1816 png_memcpy(dp, v, pixel_bytes);
1817 dp -= pixel_bytes;
1818 }
1819 sptr -= pixel_bytes;
1820 }
1821 }
1822 else if (pixel_bytes == 4)
1823 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001824 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001825 {
1826 png_byte v[8];
1827 int j;
1828 png_memcpy(v, sptr, pixel_bytes);
1829 for (j = 0; j < png_pass_inc[pass]; j++)
1830 {
1831 png_memcpy(dp, v, pixel_bytes);
1832 dp -= pixel_bytes;
1833 }
1834 sptr -= pixel_bytes;
1835 }
1836 }
1837 else if (pixel_bytes == 6)
1838 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001839 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001840 {
1841 png_byte v[8];
1842 int j;
1843 png_memcpy(v, sptr, pixel_bytes);
1844 for (j = 0; j < png_pass_inc[pass]; j++)
1845 {
1846 png_memcpy(dp, v, pixel_bytes);
1847 dp -= pixel_bytes;
1848 }
1849 sptr -= pixel_bytes;
1850 }
1851 }
1852 else
1853 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001854 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001855 {
1856 png_byte v[8];
1857 int j;
1858 png_memcpy(v, sptr, pixel_bytes);
1859 for (j = 0; j < png_pass_inc[pass]; j++)
1860 {
1861 png_memcpy(dp, v, pixel_bytes);
1862 dp -= pixel_bytes;
1863 }
1864 sptr -= pixel_bytes;
1865 }
1866 }
1867
1868 } /* end of MMX not supported */
1869 break;
1870 }
1871 } /* end switch (row_info->pixel_depth) */
1872
1873 row_info->width = final_width;
1874 row_info->rowbytes = ((final_width *
1875 (png_uint_32)row_info->pixel_depth + 7) >> 3);
1876 }
1877
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001878}
1879
1880#endif /* PNG_READ_INTERLACING_SUPPORTED */
1881
1882
// These variables are utilized in the functions below. They are declared
// globally here to ensure alignment on 8-byte boundaries.

// A 64-bit integer overlaid with a double: the double member exists only to
// force 8-byte alignment of each object; the MMX code reads the '.use'
// member with movq (e.g. "movq mm5, LBCarryMask" in the filter routines).
union uAll {
   __int64 use;    // 64-bit bit pattern loaded into an MMX register
   double align;   // never read; present only to force 8-byte alignment
} LBCarryMask = {0x0101010101010101}, // lsb set in every byte: extracts per-byte
                                      // carry bits for the byte-wise averaging
  HBClearMask = {0x7f7f7f7f7f7f7f7f}, // clears bit 7 of each byte after the
                                       // psrlq-by-1 "divide each byte by 2" step
  ActiveMask,     // selects the active byte group; value set per filter/bpp case
  ActiveMask2,    // secondary group mask — presumably set by other filter
                  // routines in this file (not visible here); verify at use sites
  ActiveMaskEnd,  // end-of-row group mask — set elsewhere in this file; verify
  ShiftBpp,       // shift count in bits, set to bpp * 8 by the filter cases
  ShiftRem;       // complementary shift count, set to 64 - ShiftBpp.use
1892
1893
1894// Optimized code for PNG Average filter decoder
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -05001895void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001896png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1897 , png_bytep prev_row)
1898{
1899 int bpp;
1900 png_uint_32 FullLength;
1901 png_uint_32 MMXLength;
1902 //png_uint_32 len;
1903 int diff;
1904
1905 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1906 FullLength = row_info->rowbytes; // # of bytes to filter
1907 _asm {
1908 // Init address pointers and offset
1909 mov edi, row // edi ==> Avg(x)
1910 xor ebx, ebx // ebx ==> x
1911 mov edx, edi
1912 mov esi, prev_row // esi ==> Prior(x)
1913 sub edx, bpp // edx ==> Raw(x-bpp)
1914
1915 xor eax, eax
1916 // Compute the Raw value for the first bpp bytes
1917 // Raw(x) = Avg(x) + (Prior(x)/2)
1918davgrlp:
1919 mov al, [esi + ebx] // Load al with Prior(x)
1920 inc ebx
1921 shr al, 1 // divide by 2
1922 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1923 cmp ebx, bpp
1924 mov [edi+ebx-1], al // Write back Raw(x);
1925 // mov does not affect flags; -1 to offset inc ebx
1926 jb davgrlp
1927 // get # of bytes to alignment
1928 mov diff, edi // take start of row
1929 add diff, ebx // add bpp
1930 add diff, 0xf // add 7 + 8 to incr past alignment boundary
1931 and diff, 0xfffffff8 // mask to alignment boundary
1932 sub diff, edi // subtract from start ==> value ebx at alignment
1933 jz davggo
1934 // fix alignment
1935 // Compute the Raw value for the bytes upto the alignment boundary
1936 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1937 xor ecx, ecx
1938davglp1:
1939 xor eax, eax
1940 mov cl, [esi + ebx] // load cl with Prior(x)
1941 mov al, [edx + ebx] // load al with Raw(x-bpp)
1942 add ax, cx
1943 inc ebx
1944 shr ax, 1 // divide by 2
1945 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1946 cmp ebx, diff // Check if at alignment boundary
1947 mov [edi+ebx-1], al // Write back Raw(x);
1948 // mov does not affect flags; -1 to offset inc ebx
1949 jb davglp1 // Repeat until at alignment boundary
1950davggo:
1951 mov eax, FullLength
1952 mov ecx, eax
1953 sub eax, ebx // subtract alignment fix
1954 and eax, 0x00000007 // calc bytes over mult of 8
1955 sub ecx, eax // drop over bytes from original length
1956 mov MMXLength, ecx
1957 } // end _asm block
1958 // Now do the math for the rest of the row
1959 switch ( bpp )
1960 {
1961 case 3:
1962 {
1963 ActiveMask.use = 0x0000000000ffffff;
1964 ShiftBpp.use = 24; // == 3 * 8
1965 ShiftRem.use = 40; // == 64 - 24
1966 _asm {
1967 // Re-init address pointers and offset
1968 movq mm7, ActiveMask
1969 mov ebx, diff // ebx ==> x = offset to alignment boundary
1970 movq mm5, LBCarryMask
1971 mov edi, row // edi ==> Avg(x)
1972 movq mm4, HBClearMask
1973 mov esi, prev_row // esi ==> Prior(x)
1974 // PRIME the pump (load the first Raw(x-bpp) data set
1975 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
1976 // (we correct position in loop below)
1977davg3lp:
1978 movq mm0, [edi + ebx] // Load mm0 with Avg(x)
1979 // Add (Prev_row/2) to Average
1980 movq mm3, mm5
1981 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
1982 movq mm1, [esi + ebx] // Load mm1 with Prior(x)
1983 movq mm6, mm7
1984 pand mm3, mm1 // get lsb for each prev_row byte
1985 psrlq mm1, 1 // divide prev_row bytes by 2
1986 pand mm1, mm4 // clear invalid bit 7 of each byte
1987 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
1988 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
1989 movq mm1, mm3 // now use mm1 for getting LBCarrys
1990 pand mm1, mm2 // get LBCarrys for each byte where both
1991 // lsb's were == 1 (Only valid for active group)
1992 psrlq mm2, 1 // divide raw bytes by 2
1993 pand mm2, mm4 // clear invalid bit 7 of each byte
1994 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
1995 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
1996 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
1997 // byte
1998 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
1999 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2000 movq mm2, mm0 // mov updated Raws to mm2
2001 psllq mm2, ShiftBpp // shift data to position correctly
2002 movq mm1, mm3 // now use mm1 for getting LBCarrys
2003 pand mm1, mm2 // get LBCarrys for each byte where both
2004 // lsb's were == 1 (Only valid for active group)
2005 psrlq mm2, 1 // divide raw bytes by 2
2006 pand mm2, mm4 // clear invalid bit 7 of each byte
2007 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2008 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2009 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2010 // byte
2011
2012 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2013 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2014 // bytes
2015 movq mm2, mm0 // mov updated Raws to mm2
2016 psllq mm2, ShiftBpp // shift data to position correctly
2017 // Data only needs to be shifted once here to
2018 // get the correct x-bpp offset.
2019 movq mm1, mm3 // now use mm1 for getting LBCarrys
2020 pand mm1, mm2 // get LBCarrys for each byte where both
2021 // lsb's were == 1 (Only valid for active group)
2022 psrlq mm2, 1 // divide raw bytes by 2
2023 pand mm2, mm4 // clear invalid bit 7 of each byte
2024 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2025 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2026 add ebx, 8
2027 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2028 // byte
2029
2030 // Now ready to write back to memory
2031 movq [edi + ebx - 8], mm0
2032 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2033 cmp ebx, MMXLength
2034 movq mm2, mm0 // mov updated Raw(x) to mm2
2035 jb davg3lp
2036 } // end _asm block
2037 }
2038 break;
2039
2040 case 6:
2041 case 4:
2042 case 7:
2043 case 5:
2044 {
2045 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2046 // appropriate inactive bytes
2047 ShiftBpp.use = bpp << 3;
2048 ShiftRem.use = 64 - ShiftBpp.use;
2049 _asm {
2050 movq mm4, HBClearMask
2051 // Re-init address pointers and offset
2052 mov ebx, diff // ebx ==> x = offset to alignment boundary
2053 // Load ActiveMask and clear all bytes except for 1st active group
2054 movq mm7, ActiveMask
2055 mov edi, row // edi ==> Avg(x)
2056 psrlq mm7, ShiftRem
2057 mov esi, prev_row // esi ==> Prior(x)
2058 movq mm6, mm7
2059 movq mm5, LBCarryMask
2060 psllq mm6, ShiftBpp // Create mask for 2nd active group
2061 // PRIME the pump (load the first Raw(x-bpp) data set)
2062 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2063 // (we correct position in loop below)
2064davg4lp:
2065 movq mm0, [edi + ebx]
2066 psrlq mm2, ShiftRem // shift data to position correctly
2067 movq mm1, [esi + ebx]
2068 // Add (Prev_row/2) to Average
2069 movq mm3, mm5
2070 pand mm3, mm1 // get lsb for each prev_row byte
2071 psrlq mm1, 1 // divide prev_row bytes by 2
2072 pand mm1, mm4 // clear invalid bit 7 of each byte
2073 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2074 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2075 movq mm1, mm3 // now use mm1 for getting LBCarrys
2076 pand mm1, mm2 // get LBCarrys for each byte where both
2077 // lsb's were == 1 (Only valid for active group)
2078 psrlq mm2, 1 // divide raw bytes by 2
2079 pand mm2, mm4 // clear invalid bit 7 of each byte
2080 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2081 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2082 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2083 // byte
2084 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2085 movq mm2, mm0 // mov updated Raws to mm2
2086 psllq mm2, ShiftBpp // shift data to position correctly
2087 add ebx, 8
2088 movq mm1, mm3 // now use mm1 for getting LBCarrys
2089 pand mm1, mm2 // get LBCarrys for each byte where both
2090 // lsb's were == 1 (Only valid for active group)
2091 psrlq mm2, 1 // divide raw bytes by 2
2092 pand mm2, mm4 // clear invalid bit 7 of each byte
2093 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2094 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2095 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2096 // byte
2097 cmp ebx, MMXLength
2098 // Now ready to write back to memory
2099 movq [edi + ebx - 8], mm0
2100 // Prep Raw(x-bpp) for next loop
2101 movq mm2, mm0 // mov updated Raws to mm2
2102 jb davg4lp
2103 } // end _asm block
2104 }
2105 break;
2106 case 2:
2107 {
2108 ActiveMask.use = 0x000000000000ffff;
Glenn Randers-Pehrson5e5c1e12000-11-10 12:26:19 -06002109 ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
2110 ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05002111 _asm {
2112 // Load ActiveMask
2113 movq mm7, ActiveMask
2114 // Re-init address pointers and offset
2115 mov ebx, diff // ebx ==> x = offset to alignment boundary
2116 movq mm5, LBCarryMask
2117 mov edi, row // edi ==> Avg(x)
2118 movq mm4, HBClearMask
2119 mov esi, prev_row // esi ==> Prior(x)
2120 // PRIME the pump (load the first Raw(x-bpp) data set)
2121 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2122 // (we correct position in loop below)
2123davg2lp:
2124 movq mm0, [edi + ebx]
Glenn Randers-Pehrson5e5c1e12000-11-10 12:26:19 -06002125 psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05002126 movq mm1, [esi + ebx]
2127 // Add (Prev_row/2) to Average
2128 movq mm3, mm5
2129 pand mm3, mm1 // get lsb for each prev_row byte
2130 psrlq mm1, 1 // divide prev_row bytes by 2
2131 pand mm1, mm4 // clear invalid bit 7 of each byte
2132 movq mm6, mm7
2133 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2134 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2135 movq mm1, mm3 // now use mm1 for getting LBCarrys
2136 pand mm1, mm2 // get LBCarrys for each byte where both
2137 // lsb's were == 1 (Only valid for active group)
2138 psrlq mm2, 1 // divide raw bytes by 2
2139 pand mm2, mm4 // clear invalid bit 7 of each byte
2140 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2141 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2142 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2143 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2144 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2145 movq mm2, mm0 // mov updated Raws to mm2
2146 psllq mm2, ShiftBpp // shift data to position correctly
2147 movq mm1, mm3 // now use mm1 for getting LBCarrys
2148 pand mm1, mm2 // get LBCarrys for each byte where both
2149 // lsb's were == 1 (Only valid for active group)
2150 psrlq mm2, 1 // divide raw bytes by 2
2151 pand mm2, mm4 // clear invalid bit 7 of each byte
2152 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2153 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2154 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2155
2156 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2157 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2158 movq mm2, mm0 // mov updated Raws to mm2
2159 psllq mm2, ShiftBpp // shift data to position correctly
2160 // Data only needs to be shifted once here to
2161 // get the correct x-bpp offset.
2162 movq mm1, mm3 // now use mm1 for getting LBCarrys
2163 pand mm1, mm2 // get LBCarrys for each byte where both
2164 // lsb's were == 1 (Only valid for active group)
2165 psrlq mm2, 1 // divide raw bytes by 2
2166 pand mm2, mm4 // clear invalid bit 7 of each byte
2167 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2168 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2169 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2170
2171 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2172 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2173 movq mm2, mm0 // mov updated Raws to mm2
2174 psllq mm2, ShiftBpp // shift data to position correctly
2175 // Data only needs to be shifted once here to
2176 // get the correct x-bpp offset.
2177 add ebx, 8
2178 movq mm1, mm3 // now use mm1 for getting LBCarrys
2179 pand mm1, mm2 // get LBCarrys for each byte where both
2180 // lsb's were == 1 (Only valid for active group)
2181 psrlq mm2, 1 // divide raw bytes by 2
2182 pand mm2, mm4 // clear invalid bit 7 of each byte
2183 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2184 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2185 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2186
2187 cmp ebx, MMXLength
2188 // Now ready to write back to memory
2189 movq [edi + ebx - 8], mm0
2190 // Prep Raw(x-bpp) for next loop
2191 movq mm2, mm0 // mov updated Raws to mm2
2192 jb davg2lp
2193 } // end _asm block
2194 }
2195 break;
2196
2197 case 1: // bpp == 1
2198 {
2199 _asm {
2200 // Re-init address pointers and offset
2201 mov ebx, diff // ebx ==> x = offset to alignment boundary
2202 mov edi, row // edi ==> Avg(x)
2203 cmp ebx, FullLength // Test if offset at end of array
2204 jnb davg1end
2205 // Do Paeth decode for remaining bytes
2206 mov esi, prev_row // esi ==> Prior(x)
2207 mov edx, edi
2208 xor ecx, ecx // zero ecx before using cl & cx in loop below
2209 sub edx, bpp // edx ==> Raw(x-bpp)
2210davg1lp:
2211 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2212 xor eax, eax
2213 mov cl, [esi + ebx] // load cl with Prior(x)
2214 mov al, [edx + ebx] // load al with Raw(x-bpp)
2215 add ax, cx
2216 inc ebx
2217 shr ax, 1 // divide by 2
2218 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2219 cmp ebx, FullLength // Check if at end of array
2220 mov [edi+ebx-1], al // Write back Raw(x);
2221 // mov does not affect flags; -1 to offset inc ebx
2222 jb davg1lp
2223davg1end:
2224 } // end _asm block
2225 }
2226 return;
2227
2228 case 8: // bpp == 8
2229 {
2230 _asm {
2231 // Re-init address pointers and offset
2232 mov ebx, diff // ebx ==> x = offset to alignment boundary
2233 movq mm5, LBCarryMask
2234 mov edi, row // edi ==> Avg(x)
2235 movq mm4, HBClearMask
2236 mov esi, prev_row // esi ==> Prior(x)
2237 // PRIME the pump (load the first Raw(x-bpp) data set
2238 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2239 // (NO NEED to correct position in loop below)
2240davg8lp:
2241 movq mm0, [edi + ebx]
2242 movq mm3, mm5
2243 movq mm1, [esi + ebx]
2244 add ebx, 8
2245 pand mm3, mm1 // get lsb for each prev_row byte
2246 psrlq mm1, 1 // divide prev_row bytes by 2
2247 pand mm3, mm2 // get LBCarrys for each byte where both
2248 // lsb's were == 1
2249 psrlq mm2, 1 // divide raw bytes by 2
2250 pand mm1, mm4 // clear invalid bit 7 of each byte
2251 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2252 pand mm2, mm4 // clear invalid bit 7 of each byte
2253 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2254 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2255 cmp ebx, MMXLength
2256 movq [edi + ebx - 8], mm0
2257 movq mm2, mm0 // reuse as Raw(x-bpp)
2258 jb davg8lp
2259 } // end _asm block
2260 }
2261 break;
2262 default: // bpp greater than 8
2263 {
2264 _asm {
2265 movq mm5, LBCarryMask
2266 // Re-init address pointers and offset
2267 mov ebx, diff // ebx ==> x = offset to alignment boundary
2268 mov edi, row // edi ==> Avg(x)
2269 movq mm4, HBClearMask
2270 mov edx, edi
2271 mov esi, prev_row // esi ==> Prior(x)
2272 sub edx, bpp // edx ==> Raw(x-bpp)
2273davgAlp:
2274 movq mm0, [edi + ebx]
2275 movq mm3, mm5
2276 movq mm1, [esi + ebx]
2277 pand mm3, mm1 // get lsb for each prev_row byte
2278 movq mm2, [edx + ebx]
2279 psrlq mm1, 1 // divide prev_row bytes by 2
2280 pand mm3, mm2 // get LBCarrys for each byte where both
2281 // lsb's were == 1
2282 psrlq mm2, 1 // divide raw bytes by 2
2283 pand mm1, mm4 // clear invalid bit 7 of each byte
2284 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2285 pand mm2, mm4 // clear invalid bit 7 of each byte
2286 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2287 add ebx, 8
2288 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2289 cmp ebx, MMXLength
2290 movq [edi + ebx - 8], mm0
2291 jb davgAlp
2292 } // end _asm block
2293 }
2294 break;
2295 } // end switch ( bpp )
2296
2297 _asm {
2298 // MMX acceleration complete now do clean-up
2299 // Check if any remaining bytes left to decode
2300 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2301 mov edi, row // edi ==> Avg(x)
2302 cmp ebx, FullLength // Test if offset at end of array
2303 jnb davgend
2304 // Do Paeth decode for remaining bytes
2305 mov esi, prev_row // esi ==> Prior(x)
2306 mov edx, edi
2307 xor ecx, ecx // zero ecx before using cl & cx in loop below
2308 sub edx, bpp // edx ==> Raw(x-bpp)
2309davglp2:
2310 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2311 xor eax, eax
2312 mov cl, [esi + ebx] // load cl with Prior(x)
2313 mov al, [edx + ebx] // load al with Raw(x-bpp)
2314 add ax, cx
2315 inc ebx
2316 shr ax, 1 // divide by 2
2317 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2318 cmp ebx, FullLength // Check if at end of array
2319 mov [edi+ebx-1], al // Write back Raw(x);
2320 // mov does not affect flags; -1 to offset inc ebx
2321 jb davglp2
2322davgend:
2323 emms // End MMX instructions; prep for possible FP instrs.
2324 } // end _asm block
2325}
2326
2327// Optimized code for PNG Paeth filter decoder
/* png_read_filter_row_mmx_paeth: reverse the PNG "Paeth" row filter in place.
 *
 *   row_info - row geometry; only pixel_depth and rowbytes are read here.
 *   row      - the filtered row data; on return it holds reconstructed bytes.
 *   prev_row - the already-reconstructed previous row (read-only).
 *
 * Three-phase strategy, visible in the asm below:
 *   1) scalar asm decodes the first bpp bytes (where the predictor reduces
 *      to Prior(x)) and any bytes up to the 8-byte alignment boundary;
 *   2) one MMX loop per bpp value (switch below) decodes the aligned middle
 *      of the row 8 bytes at a time; bpp of 1, 2, or >8 falls through to a
 *      purely scalar path that returns early;
 *   3) scalar asm decodes the sub-8-byte tail, then EMMS restores FP state.
 *
 * NOTE(review): in several of the MMX comments below the register names
 * (e.g. "in mm7" vs "in mm0") are swapped relative to the actual opcodes —
 * trust the instructions, not the comments. Also note abs() is computed by
 * masking the negative lanes and subtracting them TWICE (x - x - x = -x),
 * so the repeated psubw pairs are intentional, not copy-paste errors.
 */
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -05002328void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05002329png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2330 png_bytep prev_row)
2331{
2332 png_uint_32 FullLength;
2333 png_uint_32 MMXLength;
2334 //png_uint_32 len;
2335 int bpp;
2336 int diff;
2337 //int ptemp;
2338 int patemp, pbtemp, pctemp;
2339
2340 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2341 FullLength = row_info->rowbytes; // # of bytes to filter
2342 _asm
2343 {
2344 xor ebx, ebx // ebx ==> x offset
2345 mov edi, row
2346 xor edx, edx // edx ==> x-bpp offset
2347 mov esi, prev_row
2348 xor eax, eax
2349
2350 // Compute the Raw value for the first bpp bytes
2351 // Note: the formula works out to be always
2352 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
2353dpthrlp:
2354 mov al, [edi + ebx]
2355 add al, [esi + ebx]
2356 inc ebx
2357 cmp ebx, bpp
2358 mov [edi + ebx - 1], al
2359 jb dpthrlp
2360 // get # of bytes to alignment
2361 mov diff, edi // take start of row
2362 add diff, ebx // add bpp
2363 xor ecx, ecx
2364 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2365 and diff, 0xfffffff8 // mask to alignment boundary
2366 sub diff, edi // subtract from start ==> value ebx at alignment
2367 jz dpthgo
2368 // fix alignment
2369dpthlp1:
2370 xor eax, eax
2371 // pav = p - a = (a + b - c) - a = b - c
2372 mov al, [esi + ebx] // load Prior(x) into al
2373 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2374 sub eax, ecx // subtract Prior(x-bpp)
2375 mov patemp, eax // Save pav for later use
2376 xor eax, eax
2377 // pbv = p - b = (a + b - c) - b = a - c
2378 mov al, [edi + edx] // load Raw(x-bpp) into al
2379 sub eax, ecx // subtract Prior(x-bpp)
2380 mov ecx, eax
2381 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2382 add eax, patemp // pcv = pav + pbv
2383 // pc = abs(pcv)
2384 test eax, 0x80000000
2385 jz dpthpca
2386 neg eax // reverse sign of neg values
2387dpthpca:
2388 mov pctemp, eax // save pc for later use
2389 // pb = abs(pbv)
2390 test ecx, 0x80000000
2391 jz dpthpba
2392 neg ecx // reverse sign of neg values
2393dpthpba:
2394 mov pbtemp, ecx // save pb for later use
2395 // pa = abs(pav)
2396 mov eax, patemp
2397 test eax, 0x80000000
2398 jz dpthpaa
2399 neg eax // reverse sign of neg values
2400dpthpaa:
2401 mov patemp, eax // save pa for later use
2402 // test if pa <= pb
2403 cmp eax, ecx
2404 jna dpthabb
2405 // pa > pb; now test if pb <= pc
2406 cmp ecx, pctemp
2407 jna dpthbbc
2408 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2409 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2410 jmp dpthpaeth
2411dpthbbc:
2412 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2413 mov cl, [esi + ebx] // load Prior(x) into cl
2414 jmp dpthpaeth
2415dpthabb:
2416 // pa <= pb; now test if pa <= pc
2417 cmp eax, pctemp
2418 jna dpthabc
2419 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2420 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2421 jmp dpthpaeth
2422dpthabc:
2423 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2424 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2425dpthpaeth:
2426 inc ebx
2427 inc edx
2428 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2429 add [edi + ebx - 1], cl
2430 cmp ebx, diff
2431 jb dpthlp1
2432dpthgo:
2433 mov ecx, FullLength
2434 mov eax, ecx
2435 sub eax, ebx // subtract alignment fix
2436 and eax, 0x00000007 // calc bytes over mult of 8
2437 sub ecx, eax // drop over bytes from original length
2438 mov MMXLength, ecx
2439 } // end _asm block
2440 // Now do the math for the rest of the row
2441 switch ( bpp )
2442 {
2443 case 3:
2444 {
2445 ActiveMask.use = 0x0000000000ffffff;
2446 ActiveMaskEnd.use = 0xffff000000000000;
2447 ShiftBpp.use = 24; // == bpp(3) * 8
2448 ShiftRem.use = 40; // == 64 - 24
2449 _asm
2450 {
2451 mov ebx, diff
2452 mov edi, row
2453 mov esi, prev_row
2454 pxor mm0, mm0
2455 // PRIME the pump (load the first Raw(x-bpp) data set
2456 movq mm1, [edi+ebx-8]
2457dpth3lp:
2458 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2459 movq mm2, [esi + ebx] // load b=Prior(x)
2460 punpcklbw mm1, mm0 // Unpack High bytes of a
2461 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2462 punpcklbw mm2, mm0 // Unpack High bytes of b
2463 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2464 // pav = p - a = (a + b - c) - a = b - c
2465 movq mm4, mm2
2466 punpcklbw mm3, mm0 // Unpack High bytes of c
2467 // pbv = p - b = (a + b - c) - b = a - c
2468 movq mm5, mm1
2469 psubw mm4, mm3
2470 pxor mm7, mm7
2471 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2472 movq mm6, mm4
2473 psubw mm5, mm3
2474
2475 // pa = abs(p-a) = abs(pav)
2476 // pb = abs(p-b) = abs(pbv)
2477 // pc = abs(p-c) = abs(pcv)
2478 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2479 paddw mm6, mm5
2480 pand mm0, mm4 // Only pav bytes < 0 in mm7
2481 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2482 psubw mm4, mm0
2483 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2484 psubw mm4, mm0
2485 psubw mm5, mm7
2486 pxor mm0, mm0
2487 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2488 pand mm0, mm6 // Only pav bytes < 0 in mm7
2489 psubw mm5, mm7
2490 psubw mm6, mm0
2491 // test pa <= pb
2492 movq mm7, mm4
2493 psubw mm6, mm0
2494 pcmpgtw mm7, mm5 // pa > pb?
2495 movq mm0, mm7
2496 // use mm7 mask to merge pa & pb
2497 pand mm5, mm7
2498 // use mm0 mask copy to merge a & b
2499 pand mm2, mm0
2500 pandn mm7, mm4
2501 pandn mm0, mm1
2502 paddw mm7, mm5
2503 paddw mm0, mm2
2504 // test ((pa <= pb)? pa:pb) <= pc
2505 pcmpgtw mm7, mm6 // pab > pc?
2506 pxor mm1, mm1
2507 pand mm3, mm7
2508 pandn mm7, mm0
2509 paddw mm7, mm3
2510 pxor mm0, mm0
2511 packuswb mm7, mm1
2512 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2513 pand mm7, ActiveMask
2514 movq mm2, mm3 // load b=Prior(x) step 1
2515 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2516 punpcklbw mm3, mm0 // Unpack High bytes of c
2517 movq [edi + ebx], mm7 // write back updated value
2518 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2519 // Now do Paeth for 2nd set of bytes (3-5)
2520 psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2521 punpcklbw mm1, mm0 // Unpack High bytes of a
2522 pxor mm7, mm7
2523 punpcklbw mm2, mm0 // Unpack High bytes of b
2524 // pbv = p - b = (a + b - c) - b = a - c
2525 movq mm5, mm1
2526 // pav = p - a = (a + b - c) - a = b - c
2527 movq mm4, mm2
2528 psubw mm5, mm3
2529 psubw mm4, mm3
2530 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2531 // pav + pbv = pbv + pav
2532 movq mm6, mm5
2533 paddw mm6, mm4
2534
2535 // pa = abs(p-a) = abs(pav)
2536 // pb = abs(p-b) = abs(pbv)
2537 // pc = abs(p-c) = abs(pcv)
2538 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2539 pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2540 pand mm0, mm5 // Only pbv bytes < 0 in mm0
2541 pand mm7, mm4 // Only pav bytes < 0 in mm7
2542 psubw mm5, mm0
2543 psubw mm4, mm7
2544 psubw mm5, mm0
2545 psubw mm4, mm7
2546 pxor mm0, mm0
2547 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2548 pand mm0, mm6 // Only pav bytes < 0 in mm7
2549 psubw mm6, mm0
2550 // test pa <= pb
2551 movq mm7, mm4
2552 psubw mm6, mm0
2553 pcmpgtw mm7, mm5 // pa > pb?
2554 movq mm0, mm7
2555 // use mm7 mask to merge pa & pb
2556 pand mm5, mm7
2557 // use mm0 mask copy to merge a & b
2558 pand mm2, mm0
2559 pandn mm7, mm4
2560 pandn mm0, mm1
2561 paddw mm7, mm5
2562 paddw mm0, mm2
2563 // test ((pa <= pb)? pa:pb) <= pc
2564 pcmpgtw mm7, mm6 // pab > pc?
2565 movq mm2, [esi + ebx] // load b=Prior(x)
2566 pand mm3, mm7
2567 pandn mm7, mm0
2568 pxor mm1, mm1
2569 paddw mm7, mm3
2570 pxor mm0, mm0
2571 packuswb mm7, mm1
2572 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2573 pand mm7, ActiveMask
2574 punpckhbw mm2, mm0 // Unpack High bytes of b
2575 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2576 // pav = p - a = (a + b - c) - a = b - c
2577 movq mm4, mm2
2578 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2579 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2580 movq [edi + ebx], mm7 // write back updated value
2581 movq mm1, mm7
2582 punpckhbw mm3, mm0 // Unpack High bytes of c
2583 psllq mm1, ShiftBpp // Shift bytes
2584 // Now mm1 will be used as Raw(x-bpp)
2585 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2586 pxor mm7, mm7
2587 punpckhbw mm1, mm0 // Unpack High bytes of a
2588 psubw mm4, mm3
2589 // pbv = p - b = (a + b - c) - b = a - c
2590 movq mm5, mm1
2591 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2592 movq mm6, mm4
2593 psubw mm5, mm3
2594 pxor mm0, mm0
2595 paddw mm6, mm5
2596
2597 // pa = abs(p-a) = abs(pav)
2598 // pb = abs(p-b) = abs(pbv)
2599 // pc = abs(p-c) = abs(pcv)
2600 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2601 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2602 pand mm0, mm4 // Only pav bytes < 0 in mm7
2603 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2604 psubw mm4, mm0
2605 psubw mm5, mm7
2606 psubw mm4, mm0
2607 psubw mm5, mm7
2608 pxor mm0, mm0
2609 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2610 pand mm0, mm6 // Only pav bytes < 0 in mm7
2611 psubw mm6, mm0
2612 // test pa <= pb
2613 movq mm7, mm4
2614 psubw mm6, mm0
2615 pcmpgtw mm7, mm5 // pa > pb?
2616 movq mm0, mm7
2617 // use mm0 mask copy to merge a & b
2618 pand mm2, mm0
2619 // use mm7 mask to merge pa & pb
2620 pand mm5, mm7
2621 pandn mm0, mm1
2622 pandn mm7, mm4
2623 paddw mm0, mm2
2624 paddw mm7, mm5
2625 // test ((pa <= pb)? pa:pb) <= pc
2626 pcmpgtw mm7, mm6 // pab > pc?
2627 pand mm3, mm7
2628 pandn mm7, mm0
2629 paddw mm7, mm3
2630 pxor mm1, mm1
2631 packuswb mm1, mm7
2632 // Step ebx to next set of 8 bytes and repeat loop til done
2633 add ebx, 8
2634 pand mm1, ActiveMaskEnd
2635 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2636
2637 cmp ebx, MMXLength
2638 pxor mm0, mm0 // pxor does not affect flags
2639 movq [edi + ebx - 8], mm1 // write back updated value
2640 // mm1 will be used as Raw(x-bpp) next loop
2641 // mm3 ready to be used as Prior(x-bpp) next loop
2642 jb dpth3lp
2643 } // end _asm block
2644 }
2645 break;
2646
2647 case 6:
2648 case 7:
2649 case 5:
2650 {
2651 ActiveMask.use = 0x00000000ffffffff;
2652 ActiveMask2.use = 0xffffffff00000000;
2653 ShiftBpp.use = bpp << 3; // == bpp * 8
2654 ShiftRem.use = 64 - ShiftBpp.use;
2655 _asm
2656 {
2657 mov ebx, diff
2658 mov edi, row
2659 mov esi, prev_row
2660 // PRIME the pump (load the first Raw(x-bpp) data set
2661 movq mm1, [edi+ebx-8]
2662 pxor mm0, mm0
2663dpth6lp:
2664 // Must shift to position Raw(x-bpp) data
2665 psrlq mm1, ShiftRem
2666 // Do first set of 4 bytes
2667 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2668 punpcklbw mm1, mm0 // Unpack Low bytes of a
2669 movq mm2, [esi + ebx] // load b=Prior(x)
2670 punpcklbw mm2, mm0 // Unpack Low bytes of b
2671 // Must shift to position Prior(x-bpp) data
2672 psrlq mm3, ShiftRem
2673 // pav = p - a = (a + b - c) - a = b - c
2674 movq mm4, mm2
2675 punpcklbw mm3, mm0 // Unpack Low bytes of c
2676 // pbv = p - b = (a + b - c) - b = a - c
2677 movq mm5, mm1
2678 psubw mm4, mm3
2679 pxor mm7, mm7
2680 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2681 movq mm6, mm4
2682 psubw mm5, mm3
2683 // pa = abs(p-a) = abs(pav)
2684 // pb = abs(p-b) = abs(pbv)
2685 // pc = abs(p-c) = abs(pcv)
2686 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2687 paddw mm6, mm5
2688 pand mm0, mm4 // Only pav bytes < 0 in mm7
2689 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2690 psubw mm4, mm0
2691 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2692 psubw mm4, mm0
2693 psubw mm5, mm7
2694 pxor mm0, mm0
2695 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2696 pand mm0, mm6 // Only pav bytes < 0 in mm7
2697 psubw mm5, mm7
2698 psubw mm6, mm0
2699 // test pa <= pb
2700 movq mm7, mm4
2701 psubw mm6, mm0
2702 pcmpgtw mm7, mm5 // pa > pb?
2703 movq mm0, mm7
2704 // use mm7 mask to merge pa & pb
2705 pand mm5, mm7
2706 // use mm0 mask copy to merge a & b
2707 pand mm2, mm0
2708 pandn mm7, mm4
2709 pandn mm0, mm1
2710 paddw mm7, mm5
2711 paddw mm0, mm2
2712 // test ((pa <= pb)? pa:pb) <= pc
2713 pcmpgtw mm7, mm6 // pab > pc?
2714 pxor mm1, mm1
2715 pand mm3, mm7
2716 pandn mm7, mm0
2717 paddw mm7, mm3
2718 pxor mm0, mm0
2719 packuswb mm7, mm1
2720 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2721 pand mm7, ActiveMask
2722 psrlq mm3, ShiftRem
2723 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2724 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2725 movq mm6, mm2
2726 movq [edi + ebx], mm7 // write back updated value
2727 movq mm1, [edi+ebx-8]
2728 psllq mm6, ShiftBpp
2729 movq mm5, mm7
2730 psrlq mm1, ShiftRem
2731 por mm3, mm6
2732 psllq mm5, ShiftBpp
2733 punpckhbw mm3, mm0 // Unpack High bytes of c
2734 por mm1, mm5
2735 // Do second set of 4 bytes
2736 punpckhbw mm2, mm0 // Unpack High bytes of b
2737 punpckhbw mm1, mm0 // Unpack High bytes of a
2738 // pav = p - a = (a + b - c) - a = b - c
2739 movq mm4, mm2
2740 // pbv = p - b = (a + b - c) - b = a - c
2741 movq mm5, mm1
2742 psubw mm4, mm3
2743 pxor mm7, mm7
2744 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2745 movq mm6, mm4
2746 psubw mm5, mm3
2747 // pa = abs(p-a) = abs(pav)
2748 // pb = abs(p-b) = abs(pbv)
2749 // pc = abs(p-c) = abs(pcv)
2750 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2751 paddw mm6, mm5
2752 pand mm0, mm4 // Only pav bytes < 0 in mm7
2753 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2754 psubw mm4, mm0
2755 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2756 psubw mm4, mm0
2757 psubw mm5, mm7
2758 pxor mm0, mm0
2759 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2760 pand mm0, mm6 // Only pav bytes < 0 in mm7
2761 psubw mm5, mm7
2762 psubw mm6, mm0
2763 // test pa <= pb
2764 movq mm7, mm4
2765 psubw mm6, mm0
2766 pcmpgtw mm7, mm5 // pa > pb?
2767 movq mm0, mm7
2768 // use mm7 mask to merge pa & pb
2769 pand mm5, mm7
2770 // use mm0 mask copy to merge a & b
2771 pand mm2, mm0
2772 pandn mm7, mm4
2773 pandn mm0, mm1
2774 paddw mm7, mm5
2775 paddw mm0, mm2
2776 // test ((pa <= pb)? pa:pb) <= pc
2777 pcmpgtw mm7, mm6 // pab > pc?
2778 pxor mm1, mm1
2779 pand mm3, mm7
2780 pandn mm7, mm0
2781 pxor mm1, mm1
2782 paddw mm7, mm3
2783 pxor mm0, mm0
2784 // Step ebx to next set of 8 bytes and repeat loop til done
2785 add ebx, 8
2786 packuswb mm1, mm7
2787 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2788 cmp ebx, MMXLength
2789 movq [edi + ebx - 8], mm1 // write back updated value
2790 // mm1 will be used as Raw(x-bpp) next loop
2791 jb dpth6lp
2792 } // end _asm block
2793 }
2794 break;
2795
2796 case 4:
2797 {
2798 ActiveMask.use = 0x00000000ffffffff;
2799 _asm {
2800 mov ebx, diff
2801 mov edi, row
2802 mov esi, prev_row
2803 pxor mm0, mm0
2804 // PRIME the pump (load the first Raw(x-bpp) data set
2805 movq mm1, [edi+ebx-8] // Only time should need to read
2806 // a=Raw(x-bpp) bytes
2807dpth4lp:
2808 // Do first set of 4 bytes
2809 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2810 punpckhbw mm1, mm0 // Unpack Low bytes of a
2811 movq mm2, [esi + ebx] // load b=Prior(x)
2812 punpcklbw mm2, mm0 // Unpack High bytes of b
2813 // pav = p - a = (a + b - c) - a = b - c
2814 movq mm4, mm2
2815 punpckhbw mm3, mm0 // Unpack High bytes of c
2816 // pbv = p - b = (a + b - c) - b = a - c
2817 movq mm5, mm1
2818 psubw mm4, mm3
2819 pxor mm7, mm7
2820 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2821 movq mm6, mm4
2822 psubw mm5, mm3
2823 // pa = abs(p-a) = abs(pav)
2824 // pb = abs(p-b) = abs(pbv)
2825 // pc = abs(p-c) = abs(pcv)
2826 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2827 paddw mm6, mm5
2828 pand mm0, mm4 // Only pav bytes < 0 in mm7
2829 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2830 psubw mm4, mm0
2831 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2832 psubw mm4, mm0
2833 psubw mm5, mm7
2834 pxor mm0, mm0
2835 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2836 pand mm0, mm6 // Only pav bytes < 0 in mm7
2837 psubw mm5, mm7
2838 psubw mm6, mm0
2839 // test pa <= pb
2840 movq mm7, mm4
2841 psubw mm6, mm0
2842 pcmpgtw mm7, mm5 // pa > pb?
2843 movq mm0, mm7
2844 // use mm7 mask to merge pa & pb
2845 pand mm5, mm7
2846 // use mm0 mask copy to merge a & b
2847 pand mm2, mm0
2848 pandn mm7, mm4
2849 pandn mm0, mm1
2850 paddw mm7, mm5
2851 paddw mm0, mm2
2852 // test ((pa <= pb)? pa:pb) <= pc
2853 pcmpgtw mm7, mm6 // pab > pc?
2854 pxor mm1, mm1
2855 pand mm3, mm7
2856 pandn mm7, mm0
2857 paddw mm7, mm3
2858 pxor mm0, mm0
2859 packuswb mm7, mm1
2860 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2861 pand mm7, ActiveMask
2862 movq mm2, mm3 // load b=Prior(x) step 1
2863 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2864 punpcklbw mm3, mm0 // Unpack High bytes of c
2865 movq [edi + ebx], mm7 // write back updated value
2866 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2867 // Do second set of 4 bytes
2868 punpckhbw mm2, mm0 // Unpack Low bytes of b
2869 punpcklbw mm1, mm0 // Unpack Low bytes of a
2870 // pav = p - a = (a + b - c) - a = b - c
2871 movq mm4, mm2
2872 // pbv = p - b = (a + b - c) - b = a - c
2873 movq mm5, mm1
2874 psubw mm4, mm3
2875 pxor mm7, mm7
2876 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2877 movq mm6, mm4
2878 psubw mm5, mm3
2879 // pa = abs(p-a) = abs(pav)
2880 // pb = abs(p-b) = abs(pbv)
2881 // pc = abs(p-c) = abs(pcv)
2882 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2883 paddw mm6, mm5
2884 pand mm0, mm4 // Only pav bytes < 0 in mm7
2885 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2886 psubw mm4, mm0
2887 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2888 psubw mm4, mm0
2889 psubw mm5, mm7
2890 pxor mm0, mm0
2891 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2892 pand mm0, mm6 // Only pav bytes < 0 in mm7
2893 psubw mm5, mm7
2894 psubw mm6, mm0
2895 // test pa <= pb
2896 movq mm7, mm4
2897 psubw mm6, mm0
2898 pcmpgtw mm7, mm5 // pa > pb?
2899 movq mm0, mm7
2900 // use mm7 mask to merge pa & pb
2901 pand mm5, mm7
2902 // use mm0 mask copy to merge a & b
2903 pand mm2, mm0
2904 pandn mm7, mm4
2905 pandn mm0, mm1
2906 paddw mm7, mm5
2907 paddw mm0, mm2
2908 // test ((pa <= pb)? pa:pb) <= pc
2909 pcmpgtw mm7, mm6 // pab > pc?
2910 pxor mm1, mm1
2911 pand mm3, mm7
2912 pandn mm7, mm0
2913 pxor mm1, mm1
2914 paddw mm7, mm3
2915 pxor mm0, mm0
2916 // Step ebx to next set of 8 bytes and repeat loop til done
2917 add ebx, 8
2918 packuswb mm1, mm7
2919 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2920 cmp ebx, MMXLength
2921 movq [edi + ebx - 8], mm1 // write back updated value
2922 // mm1 will be used as Raw(x-bpp) next loop
2923 jb dpth4lp
2924 } // end _asm block
2925 }
2926 break;
2927 case 8: // bpp == 8
2928 {
2929 ActiveMask.use = 0x00000000ffffffff;
2930 _asm {
2931 mov ebx, diff
2932 mov edi, row
2933 mov esi, prev_row
2934 pxor mm0, mm0
2935 // PRIME the pump (load the first Raw(x-bpp) data set
2936 movq mm1, [edi+ebx-8] // Only time should need to read
2937 // a=Raw(x-bpp) bytes
2938dpth8lp:
2939 // Do first set of 4 bytes
2940 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2941 punpcklbw mm1, mm0 // Unpack Low bytes of a
2942 movq mm2, [esi + ebx] // load b=Prior(x)
2943 punpcklbw mm2, mm0 // Unpack Low bytes of b
2944 // pav = p - a = (a + b - c) - a = b - c
2945 movq mm4, mm2
2946 punpcklbw mm3, mm0 // Unpack Low bytes of c
2947 // pbv = p - b = (a + b - c) - b = a - c
2948 movq mm5, mm1
2949 psubw mm4, mm3
2950 pxor mm7, mm7
2951 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2952 movq mm6, mm4
2953 psubw mm5, mm3
2954 // pa = abs(p-a) = abs(pav)
2955 // pb = abs(p-b) = abs(pbv)
2956 // pc = abs(p-c) = abs(pcv)
2957 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2958 paddw mm6, mm5
2959 pand mm0, mm4 // Only pav bytes < 0 in mm7
2960 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2961 psubw mm4, mm0
2962 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2963 psubw mm4, mm0
2964 psubw mm5, mm7
2965 pxor mm0, mm0
2966 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2967 pand mm0, mm6 // Only pav bytes < 0 in mm7
2968 psubw mm5, mm7
2969 psubw mm6, mm0
2970 // test pa <= pb
2971 movq mm7, mm4
2972 psubw mm6, mm0
2973 pcmpgtw mm7, mm5 // pa > pb?
2974 movq mm0, mm7
2975 // use mm7 mask to merge pa & pb
2976 pand mm5, mm7
2977 // use mm0 mask copy to merge a & b
2978 pand mm2, mm0
2979 pandn mm7, mm4
2980 pandn mm0, mm1
2981 paddw mm7, mm5
2982 paddw mm0, mm2
2983 // test ((pa <= pb)? pa:pb) <= pc
2984 pcmpgtw mm7, mm6 // pab > pc?
2985 pxor mm1, mm1
2986 pand mm3, mm7
2987 pandn mm7, mm0
2988 paddw mm7, mm3
2989 pxor mm0, mm0
2990 packuswb mm7, mm1
2991 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2992 pand mm7, ActiveMask
2993 movq mm2, [esi + ebx] // load b=Prior(x)
2994 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2995 punpckhbw mm3, mm0 // Unpack High bytes of c
2996 movq [edi + ebx], mm7 // write back updated value
2997 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
2998
2999 // Do second set of 4 bytes
3000 punpckhbw mm2, mm0 // Unpack High bytes of b
3001 punpckhbw mm1, mm0 // Unpack High bytes of a
3002 // pav = p - a = (a + b - c) - a = b - c
3003 movq mm4, mm2
3004 // pbv = p - b = (a + b - c) - b = a - c
3005 movq mm5, mm1
3006 psubw mm4, mm3
3007 pxor mm7, mm7
3008 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3009 movq mm6, mm4
3010 psubw mm5, mm3
3011 // pa = abs(p-a) = abs(pav)
3012 // pb = abs(p-b) = abs(pbv)
3013 // pc = abs(p-c) = abs(pcv)
3014 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3015 paddw mm6, mm5
3016 pand mm0, mm4 // Only pav bytes < 0 in mm7
3017 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3018 psubw mm4, mm0
3019 pand mm7, mm5 // Only pbv bytes < 0 in mm0
3020 psubw mm4, mm0
3021 psubw mm5, mm7
3022 pxor mm0, mm0
3023 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3024 pand mm0, mm6 // Only pav bytes < 0 in mm7
3025 psubw mm5, mm7
3026 psubw mm6, mm0
3027 // test pa <= pb
3028 movq mm7, mm4
3029 psubw mm6, mm0
3030 pcmpgtw mm7, mm5 // pa > pb?
3031 movq mm0, mm7
3032 // use mm7 mask to merge pa & pb
3033 pand mm5, mm7
3034 // use mm0 mask copy to merge a & b
3035 pand mm2, mm0
3036 pandn mm7, mm4
3037 pandn mm0, mm1
3038 paddw mm7, mm5
3039 paddw mm0, mm2
3040 // test ((pa <= pb)? pa:pb) <= pc
3041 pcmpgtw mm7, mm6 // pab > pc?
3042 pxor mm1, mm1
3043 pand mm3, mm7
3044 pandn mm7, mm0
3045 pxor mm1, mm1
3046 paddw mm7, mm3
3047 pxor mm0, mm0
3048 // Step ebx to next set of 8 bytes and repeat loop til done
3049 add ebx, 8
3050 packuswb mm1, mm7
3051 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3052 cmp ebx, MMXLength
3053 movq [edi + ebx - 8], mm1 // write back updated value
3054 // mm1 will be used as Raw(x-bpp) next loop
3055 jb dpth8lp
3056 } // end _asm block
3057 }
3058 break;
3059
3060 case 1: // bpp = 1
3061 case 2: // bpp = 2
3062 default: // bpp > 8
3063 {
3064 _asm {
3065 mov ebx, diff
3066 cmp ebx, FullLength
3067 jnb dpthdend
3068 mov edi, row
3069 mov esi, prev_row
3070 // Do Paeth decode for remaining bytes
3071 mov edx, ebx
3072 xor ecx, ecx // zero ecx before using cl & cx in loop below
3073 sub edx, bpp // Set edx = ebx - bpp
3074dpthdlp:
3075 xor eax, eax
3076 // pav = p - a = (a + b - c) - a = b - c
3077 mov al, [esi + ebx] // load Prior(x) into al
3078 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3079 sub eax, ecx // subtract Prior(x-bpp)
3080 mov patemp, eax // Save pav for later use
3081 xor eax, eax
3082 // pbv = p - b = (a + b - c) - b = a - c
3083 mov al, [edi + edx] // load Raw(x-bpp) into al
3084 sub eax, ecx // subtract Prior(x-bpp)
3085 mov ecx, eax
3086 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3087 add eax, patemp // pcv = pav + pbv
3088 // pc = abs(pcv)
3089 test eax, 0x80000000
3090 jz dpthdpca
3091 neg eax // reverse sign of neg values
3092dpthdpca:
3093 mov pctemp, eax // save pc for later use
3094 // pb = abs(pbv)
3095 test ecx, 0x80000000
3096 jz dpthdpba
3097 neg ecx // reverse sign of neg values
3098dpthdpba:
3099 mov pbtemp, ecx // save pb for later use
3100 // pa = abs(pav)
3101 mov eax, patemp
3102 test eax, 0x80000000
3103 jz dpthdpaa
3104 neg eax // reverse sign of neg values
3105dpthdpaa:
3106 mov patemp, eax // save pa for later use
3107 // test if pa <= pb
3108 cmp eax, ecx
3109 jna dpthdabb
3110 // pa > pb; now test if pb <= pc
3111 cmp ecx, pctemp
3112 jna dpthdbbc
3113 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3114 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3115 jmp dpthdpaeth
3116dpthdbbc:
3117 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3118 mov cl, [esi + ebx] // load Prior(x) into cl
3119 jmp dpthdpaeth
3120dpthdabb:
3121 // pa <= pb; now test if pa <= pc
3122 cmp eax, pctemp
3123 jna dpthdabc
3124 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3125 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3126 jmp dpthdpaeth
3127dpthdabc:
3128 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3129 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3130dpthdpaeth:
3131 inc ebx
3132 inc edx
3133 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3134 add [edi + ebx - 1], cl
3135 cmp ebx, FullLength
3136 jb dpthdlp
3137dpthdend:
3138 } // end _asm block
3139 }
3140 return; // No need to go further with this one
3141 } // end switch ( bpp )
3142 _asm
3143 {
3144 // MMX acceleration complete now do clean-up
3145 // Check if any remaining bytes left to decode
3146 mov ebx, MMXLength
3147 cmp ebx, FullLength
3148 jnb dpthend
3149 mov edi, row
3150 mov esi, prev_row
3151 // Do Paeth decode for remaining bytes
3152 mov edx, ebx
3153 xor ecx, ecx // zero ecx before using cl & cx in loop below
3154 sub edx, bpp // Set edx = ebx - bpp
3155dpthlp2:
3156 xor eax, eax
3157 // pav = p - a = (a + b - c) - a = b - c
3158 mov al, [esi + ebx] // load Prior(x) into al
3159 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3160 sub eax, ecx // subtract Prior(x-bpp)
3161 mov patemp, eax // Save pav for later use
3162 xor eax, eax
3163 // pbv = p - b = (a + b - c) - b = a - c
3164 mov al, [edi + edx] // load Raw(x-bpp) into al
3165 sub eax, ecx // subtract Prior(x-bpp)
3166 mov ecx, eax
3167 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3168 add eax, patemp // pcv = pav + pbv
3169 // pc = abs(pcv)
3170 test eax, 0x80000000
3171 jz dpthpca2
3172 neg eax // reverse sign of neg values
3173dpthpca2:
3174 mov pctemp, eax // save pc for later use
3175 // pb = abs(pbv)
3176 test ecx, 0x80000000
3177 jz dpthpba2
3178 neg ecx // reverse sign of neg values
3179dpthpba2:
3180 mov pbtemp, ecx // save pb for later use
3181 // pa = abs(pav)
3182 mov eax, patemp
3183 test eax, 0x80000000
3184 jz dpthpaa2
3185 neg eax // reverse sign of neg values
3186dpthpaa2:
3187 mov patemp, eax // save pa for later use
3188 // test if pa <= pb
3189 cmp eax, ecx
3190 jna dpthabb2
3191 // pa > pb; now test if pb <= pc
3192 cmp ecx, pctemp
3193 jna dpthbbc2
3194 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3195 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3196 jmp dpthpaeth2
3197dpthbbc2:
3198 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3199 mov cl, [esi + ebx] // load Prior(x) into cl
3200 jmp dpthpaeth2
3201dpthabb2:
3202 // pa <= pb; now test if pa <= pc
3203 cmp eax, pctemp
3204 jna dpthabc2
3205 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3206 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3207 jmp dpthpaeth2
3208dpthabc2:
3209 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3210 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3211dpthpaeth2:
3212 inc ebx
3213 inc edx
3214 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3215 add [edi + ebx - 1], cl
3216 cmp ebx, FullLength
3217 jb dpthlp2
3218dpthend:
3219 emms // End MMX instructions; prep for possible FP instrs.
3220 } // end _asm block
3221}
3222
// Optimized code for PNG Sub filter decoder
//
// Undoes the PNG "Sub" filter in place:  Raw(x) = Sub(x) + Raw(x-bpp),
// with all arithmetic modulo 256.  The first bpp bytes of the row have
// no left neighbor and are already raw, so filtering starts at row+bpp.
// The row is processed in up to three phases:
//   1) a byte-at-a-time x86 loop up to the next 8-byte-aligned address,
//   2) an MMX loop (one variant per bpp, selected by the switch below)
//      over the aligned middle of the row (MMXLength bytes),
//   3) a byte-at-a-time x86 loop for the tail (MMXLength..FullLength).
// bpp == 1 is handled entirely by a plain x86 loop (no MMX win there).
// NOTE(review): ActiveMask/ShiftBpp/ShiftRem are file-scope union
// temporaries declared earlier in this file.
void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
   //int test;
   int bpp;                 // bytes per pixel
   png_uint_32 FullLength;  // total number of bytes to filter
   png_uint_32 MMXLength;   // number of bytes covered by the MMX loops
   int diff;                // byte count needed to reach 8-byte alignment

   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
   FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
   _asm {
      mov edi, row
      mov esi, edi               // lp = row
      add edi, bpp               // rp = row + bpp
      xor eax, eax
      // get # of bytes to alignment
      mov diff, edi              // take start of row
      add diff, 0xf              // add 7 + 8 to incr past
                                 // alignment boundary
      xor ebx, ebx
      and diff, 0xfffffff8       // mask to alignment boundary
      sub diff, edi              // subtract from start ==> value
                                 //  ebx at alignment
      jz dsubgo
      // fix alignment: plain byte loop until [edi+ebx] is 8-byte aligned
dsublp1:
      mov al, [esi+ebx]
      add [edi+ebx], al
      inc ebx
      cmp ebx, diff
      jb dsublp1
dsubgo:
      mov ecx, FullLength
      mov edx, ecx
      sub edx, ebx               // subtract alignment fix
      and edx, 0x00000007        // calc bytes over mult of 8
      sub ecx, edx               // drop over bytes from length
      mov MMXLength, ecx
   } // end _asm block

   // Now do the math for the rest of the row
   switch ( bpp )
   {
      case 3:
      {
         // Three pixel groups (3+3+2 partial bytes) fit in one quadword;
         // propagate the running sums across the quadword with shifted
         // masked adds, then the last pixel feeds the next iteration.
         ActiveMask.use  = 0x0000ffffff000000;
         ShiftBpp.use = 24;    // == 3 * 8
         ShiftRem.use = 40;    // == 64 - 24
         _asm {
            mov edi, row
            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            movq mm6, mm7
            mov ebx, diff
            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
                                  // byte group
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm1, [edi+ebx-8]
dsub3lp:
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
                          // no need for mask; shift clears inactive bytes
            // Add 1st active group
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm7         // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm6         // mask to use only 3rd active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // Write updated Raws back to array
            // Prep for doing 1st add at top of loop
            movq mm1, mm0
            jb dsub3lp
         } // end _asm block
      }
      break;

      case 1:
      {
         // 1 byte per pixel: MMX buys nothing, so decode the whole
         // remaining row with the plain x86 loop and return directly
         // (skipping the shared MMX-cleanup loop at the bottom).
         //
         // Placed here just in case this is a duplicate of the
         // non-MMX code for the SUB filter in png_read_filter_row below
         //
         //         png_bytep rp;
         //         png_bytep lp;
         //         png_uint_32 i;
         //         bpp = (row_info->pixel_depth + 7) >> 3;
         //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
         //            i < row_info->rowbytes; i++, rp++, lp++)
         //         {
         //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
         //         }
         _asm {
            mov ebx, diff
            mov edi, row
            cmp ebx, FullLength
            jnb dsub1end
            mov esi, edi          // lp = row
            xor eax, eax
            add edi, bpp          // rp = row + bpp
dsub1lp:
            mov al, [esi+ebx]
            add [edi+ebx], al
            inc ebx
            cmp ebx, FullLength
            jb dsub1lp
dsub1end:
         } // end _asm block
      }
      return;

      case 6:
      case 7:
      case 4:
      case 5:
      {
         // 4-7 bytes per pixel: at most two pixel groups per quadword;
         // a single shifted add propagates the sum, no masks needed
         // because the shifts clear the inactive bytes.
         ShiftBpp.use = bpp << 3;
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm1, [edi+ebx-8]
dsub4lp:
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
                          // no need for mask; shift clears inactive bytes
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
                                  // there is no need for any mask
                                  // since shift clears inactive bits/bytes
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0
            movq mm1, mm0         // Prep for doing 1st add at top of loop
            jb dsub4lp
         } // end _asm block
      }
      break;

      case 2:
      {
         // 2 bytes per pixel: four pixel groups per quadword, so the
         // running sum is propagated with three shifted masked adds.
         ActiveMask.use = 0x00000000ffff0000;
         ShiftBpp.use = 16;    // == 2 * 8
         ShiftRem.use = 48;    // == 64 - 16
         _asm {
            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
            mov ebx, diff
            movq mm6, mm7
            mov edi, row
            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
                                  //  byte group
            mov esi, edi          // lp = row
            movq mm5, mm6
            add edi, bpp          // rp = row + bpp
            psllq mm5, ShiftBpp   // Move mask in mm5 to cover 4th active
                                  //  byte group
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm1, [edi+ebx-8]
dsub2lp:
            // Add 1st active group
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
                                  // no need for mask; shift clears inactive
                                  //  bytes
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm7         // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm6         // mask to use only 3rd active group
            paddb mm0, mm1
            // Add 4th active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm5         // mask to use only 4th active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // Write updated Raws back to array
            movq mm1, mm0         // Prep for doing 1st add at top of loop
            jb dsub2lp
         } // end _asm block
      }
      break;
      case 8:
      {
         // 8 bytes per pixel: each quadword IS one pixel, so the decode
         // is a chain of quadword adds, unrolled 8x (64 bytes per pass)
         // with a single-quadword loop (dsub8lpA) for the remainder.
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            mov ecx, MMXLength
            movq mm7, [edi+ebx-8] // PRIME the pump (load the first
                                  // Raw(x-bpp) data set
            and ecx, 0x0000003f   // calc bytes over mult of 64
dsub8lp:
            movq mm0, [edi+ebx]   // Load Sub(x) for 1st 8 bytes
            paddb mm0, mm7
            movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
            movq [edi+ebx], mm0   // Write Raw(x) for 1st 8 bytes
                                  // Now mm0 will be used as Raw(x-bpp) for
                                  // the 2nd group of 8 bytes.  This will be
                                  // repeated for each group of 8 bytes with
                                  // the 8th group being used as the Raw(x-bpp)
                                  // for the 1st group of the next loop.
            paddb mm1, mm0
            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
            paddb mm2, mm1
            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
            paddb mm3, mm2
            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
            paddb mm4, mm3
            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
            paddb mm5, mm4
            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
            paddb mm6, mm5
            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
            add ebx, 64
            paddb mm7, mm6
            cmp ebx, ecx
            movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
            jb dsub8lp
            cmp ebx, MMXLength
            jnb dsub8lt8
dsub8lpA:
            movq mm0, [edi+ebx]
            add ebx, 8
            paddb mm0, mm7
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
            movq mm7, mm0         // Move calculated Raw(x) data to mm1 to
                                  // be the new Raw(x-bpp) for the next loop
            jb dsub8lpA
dsub8lt8:
         } // end _asm block
      }
      break;

      default:                // bpp greater than 8 bytes
      {
         // Neighboring pixels never share a quadword, so the quadwords
         // at rp and lp can simply be added pairwise.
         _asm {
            mov ebx, diff
            mov edi, row
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
dsubAlp:
            movq mm0, [edi+ebx]
            movq mm1, [esi+ebx]
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
                                  //  add ebx
            jb dsubAlp
         } // end _asm block
      }
      break;

   } // end switch ( bpp )

   _asm {
      // MMX acceleration complete; decode any remaining tail bytes
      // (MMXLength..FullLength) with a plain x86 byte loop.
      mov ebx, MMXLength
      mov edi, row
      cmp ebx, FullLength
      jnb dsubend
      mov esi, edi               // lp = row
      xor eax, eax
      add edi, bpp               // rp = row + bpp
dsublp2:
      mov al, [esi+ebx]
      add [edi+ebx], al
      inc ebx
      cmp ebx, FullLength
      jb dsublp2
dsubend:
      emms             // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
3526
// Optimized code for PNG Up filter decoder
//
// Undoes the PNG "Up" filter in place:  Raw(x) = Up(x) + Prior(x),
// modulo 256, where Prior(x) is the byte directly above in prev_row.
// Every byte is independent of its neighbors in the same row, so the
// whole row is just a bytewise add of the two rows.  Phases:
//   1) x86 byte loop up to the next 8-byte-aligned address in row,
//   2) 64-bytes-per-iteration MMX loop (all eight MMX regs interleaved),
//   3) 8-bytes-per-iteration MMX loop for a sub-64-byte remainder,
//   4) x86 byte loop for the final sub-8-byte remainder.
void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
{
   png_uint_32 len;
   len  = row_info->rowbytes;       // # of bytes to filter
   _asm {
      mov edi, row
      // get # of bytes to alignment
      mov ecx, edi
      xor ebx, ebx
      add ecx, 0x7
      xor eax, eax
      and ecx, 0xfffffff8
      mov esi, prev_row
      sub ecx, edi
      jz dupgo
      // fix alignment: byte loop until [edi+ebx] is 8-byte aligned
duplp1:
      mov al, [edi+ebx]
      add al, [esi+ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
      jb duplp1
dupgo:
      mov ecx, len
      mov edx, ecx
      sub edx, ebx                  // subtract alignment fix
      and edx, 0x0000003f           // calc bytes over mult of 64
      sub ecx, edx                  // drop over bytes from length
      // Unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
duploop:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      movq mm3, [esi+ebx+8]
      paddb mm0, mm1
      movq mm2, [edi+ebx+8]
      movq [edi+ebx], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+16]
      movq [edi+ebx+8], mm2
      movq mm4, [edi+ebx+16]
      movq mm7, [esi+ebx+24]
      paddb mm4, mm5
      movq mm6, [edi+ebx+24]
      movq [edi+ebx+16], mm4
      paddb mm6, mm7
      movq mm1, [esi+ebx+32]
      movq [edi+ebx+24], mm6
      movq mm0, [edi+ebx+32]
      movq mm3, [esi+ebx+40]
      paddb mm0, mm1
      movq mm2, [edi+ebx+40]
      movq [edi+ebx+32], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+48]
      movq [edi+ebx+40], mm2
      movq mm4, [edi+ebx+48]
      movq mm7, [esi+ebx+56]
      paddb mm4, mm5
      movq mm6, [edi+ebx+56]
      movq [edi+ebx+48], mm4
      add ebx, 64
      paddb mm6, mm7
      cmp ebx, ecx
      movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
                            // -8 to offset add ebx
      jb duploop

      cmp edx, 0                   // Test for bytes over mult of 64
      jz dupend


      // 2 lines added by lcreeve@netins.net
      // (mail 11 Jul 98 in png-implement list)
      cmp edx, 8 //test for less than 8 bytes
      jb duplt8


      add ecx, edx
      and edx, 0x00000007           // calc bytes over mult of 8
      sub ecx, edx                  // drop over bytes from length
      jz duplt8
      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
duplpA:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      add ebx, 8
      paddb mm0, mm1
      cmp ebx, ecx
      movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
      jb duplpA
      cmp edx, 0            // Test for bytes over mult of 8
      jz dupend
duplt8:
      xor eax, eax
      add ecx, edx          // move over byte count into counter
      // Loop using x86 registers to update remaining bytes
duplp2:
      mov al, [edi + ebx]
      add al, [esi + ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
      jb duplp2
dupend:
      // Conversion of filtered row completed
      emms          // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
3640
3641
3642// Optimized png_read_filter_row routines
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -05003643void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003644png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3645 row, png_bytep prev_row, int filter)
3646{
3647#ifdef PNG_DEBUG
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003648 char filnm[10];
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003649#endif
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003650
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003651 if (mmx_supported == 2) {
3652 png_mmx_support();
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003653 }
3654
3655#ifdef PNG_DEBUG
3656 png_debug(1, "in png_read_filter_row\n");
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003657 switch (filter)
3658 {
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003659 case 0: sprintf(filnm, "none");
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003660 break;
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003661 case 1: sprintf(filnm, "sub-%s", "MMX");
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003662 break;
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003663 case 2: sprintf(filnm, "up-%s", "MMX");
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003664 break;
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003665 case 3: sprintf(filnm, "avg-%s", "MMX");
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003666 break;
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003667 case 4: sprintf(filnm, "Paeth-%s", "MMX");
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003668 break;
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003669 default: sprintf(filnm, "unknw");
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003670 break;
3671 }
3672 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3673 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3674 (int)((row_info->pixel_depth + 7) >> 3));
3675 png_debug1(0,"len=%8d, ", row_info->rowbytes);
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003676#endif /* PNG_DEBUG */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003677
3678 switch (filter)
3679 {
3680 case PNG_FILTER_VALUE_NONE:
3681 break;
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003682
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003683 case PNG_FILTER_VALUE_SUB:
3684 {
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003685 if (
3686 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
3687 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003688 {
3689 png_read_filter_row_mmx_sub(row_info, row);
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003690 }
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003691 else
3692 {
3693 png_uint_32 i;
3694 png_uint_32 istop = row_info->rowbytes;
3695 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3696 png_bytep rp = row + bpp;
3697 png_bytep lp = row;
3698
3699 for (i = bpp; i < istop; i++)
3700 {
3701 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3702 rp++;
3703 }
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003704 }
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003705 break;
3706 }
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003707
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003708 case PNG_FILTER_VALUE_UP:
3709 {
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003710 if (
3711 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
3712 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003713 {
3714 png_read_filter_row_mmx_up(row_info, row, prev_row);
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003715 }
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003716 else
3717 {
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003718 png_uint_32 i;
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003719 png_uint_32 istop = row_info->rowbytes;
3720 png_bytep rp = row;
3721 png_bytep pp = prev_row;
3722
3723 for (i = 0; i < istop; ++i)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003724 {
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003725 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3726 rp++;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003727 }
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003728 }
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003729 break;
3730 }
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003731
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003732 case PNG_FILTER_VALUE_AVG:
3733 {
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003734 if (
3735 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
3736 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003737 {
3738 png_read_filter_row_mmx_avg(row_info, row, prev_row);
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003739 }
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003740 else
3741 {
3742 png_uint_32 i;
3743 png_bytep rp = row;
3744 png_bytep pp = prev_row;
3745 png_bytep lp = row;
3746 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3747 png_uint_32 istop = row_info->rowbytes - bpp;
3748
3749 for (i = 0; i < bpp; i++)
3750 {
3751 *rp = (png_byte)(((int)(*rp) +
3752 ((int)(*pp++) >> 1)) & 0xff);
3753 rp++;
3754 }
3755
3756 for (i = 0; i < istop; i++)
3757 {
3758 *rp = (png_byte)(((int)(*rp) +
3759 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3760 rp++;
3761 }
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003762 }
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003763 break;
3764 }
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003765
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003766 case PNG_FILTER_VALUE_PAETH:
3767 {
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003768 if (
3769 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
3770 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003771 {
3772 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003773 }
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003774 else
3775 {
3776 png_uint_32 i;
3777 png_bytep rp = row;
3778 png_bytep pp = prev_row;
3779 png_bytep lp = row;
3780 png_bytep cp = prev_row;
3781 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3782 png_uint_32 istop=row_info->rowbytes - bpp;
3783
3784 for (i = 0; i < bpp; i++)
3785 {
3786 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3787 rp++;
3788 }
3789
3790 for (i = 0; i < istop; i++) // use leftover rp,pp
3791 {
3792 int a, b, c, pa, pb, pc, p;
3793
3794 a = *lp++;
3795 b = *pp++;
3796 c = *cp++;
3797
3798 p = b - c;
3799 pc = a - c;
3800
3801#ifdef PNG_USE_ABS
3802 pa = abs(p);
3803 pb = abs(pc);
3804 pc = abs(p + pc);
3805#else
3806 pa = p < 0 ? -p : p;
3807 pb = pc < 0 ? -pc : pc;
3808 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3809#endif
3810
3811 /*
3812 if (pa <= pb && pa <= pc)
3813 p = a;
3814 else if (pb <= pc)
3815 p = b;
3816 else
3817 p = c;
3818 */
3819
3820 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3821
3822 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3823 rp++;
3824 }
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003825 }
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003826 break;
3827 }
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003828
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003829 default:
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003830 png_warning(png_ptr, "Ignoring bad row filter type");
Glenn Randers-Pehrsonec61c232000-05-16 06:17:36 -05003831 *row=0;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003832 break;
3833 }
3834}
Glenn Randers-Pehrson231e6872001-01-12 15:13:06 -06003835
3836#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */