blob: ae48a78bc886c72d9b0201c55a056d567b692d4d [file] [log] [blame]
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
4 *
Glenn Randers-Pehrson5379b241999-11-27 10:22:33 -06005 * libpng 1.0.5c - November 27, 1999
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05006 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998, Intel Corporation
8 * Copyright (c) 1998, 1999 Glenn Randers-Pehrson
9 *
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
12 *
13 */
14
15#define PNG_INTERNAL
16#include "png.h"
17
18#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
19
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -050020/*
21 One of these might need to be defined.
22#define DISABLE_PNGVCRD_COMBINE
23#define DISABLE_PNGVCRD_INTERLACE
24*/
25
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -050026static int mmx_supported=2;
27
28void
29png_read_filter_row_c(png_structp png_ptr, png_row_infop row_info,
30 png_bytep row, png_bytep prev_row, int filter);
31
32static int mmxsupport()
33{
34 int mmx_supported_local = 0;
35 _asm {
36 pushfd //Save Eflag to stack
37 pop eax //Get Eflag from stack into eax
38 mov ecx, eax //Make another copy of Eflag in ecx
39 xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
40 push eax //Save modified Eflag back to stack
41
42 popfd //Restored modified value back to Eflag reg
43 pushfd //Save Eflag to stack
44 pop eax //Get Eflag from stack
45 xor eax, ecx //Compare the new Eflag with the original Eflag
46 jz NOT_SUPPORTED //If the same, CPUID instruction is not supported,
47 //skip following instructions and jump to
48 //NOT_SUPPORTED label
49
50 xor eax, eax //Set eax to zero
51
52 _asm _emit 0x0f //CPUID instruction (two bytes opcode)
53 _asm _emit 0xa2
54
55 cmp eax, 1 //make sure eax return non-zero value
56 jl NOT_SUPPORTED //If eax is zero, mmx not supported
57
58 xor eax, eax //set eax to zero
59 inc eax //Now increment eax to 1. This instruction is
60 //faster than the instruction "mov eax, 1"
61
62 _asm _emit 0x0f //CPUID instruction
63 _asm _emit 0xa2
64
65 and edx, 0x00800000 //mask out all bits but mmx bit(24)
66 cmp edx, 0 // 0 = mmx not supported
67 jz NOT_SUPPORTED // non-zero = Yes, mmx IS supported
68
69 mov mmx_supported_local, 1 //set return value to 1
70
71NOT_SUPPORTED:
72 mov eax, mmx_supported_local //move return value to eax
73
74 }
75
76 //mmx_supported_local=0; // test code for force don't support MMX
77 //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
78
79 return mmx_supported_local;
80}
81
82/* Combines the row recently read in with the previous row.
83 This routine takes care of alpha and transparency if requested.
84 This routine also handles the two methods of progressive display
85 of interlaced images, depending on the mask value.
86 The mask value describes which pixels are to be combined with
87 the row. The pattern always repeats every 8 pixels, so just 8
88 bits are needed. A one indicates the pixel is to be combined; a
89 zero indicates the pixel is to be skipped. This is in addition
90 to any alpha or transparency value associated with the pixel. If
91 you want all pixels to be combined, pass 0xff (255) in mask. */
92
93/* Use this routine for x86 platform - uses faster MMX routine if machine
94 supports MMX */
95
96void
97png_combine_row(png_structp png_ptr, png_bytep row, int mask)
98{
Glenn Randers-Pehrson5379b241999-11-27 10:22:33 -060099 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500100#ifdef DISABLE_PNGVCRD_COMBINE
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500101 int save_mmx_supported = mmx_supported;
102#endif
103
104 png_debug(1,"in png_combine_row_asm\n");
105
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500106#ifdef DISABLE_PNGVCRD_COMBINE
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500107 if ((png_ptr->transformations & PNG_INTERLACE) && png_ptr->pass != 6)
108 mmx_supported = 0;
109 else
110#endif
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500111 if (mmx_supported == 2)
112 mmx_supported = mmxsupport();
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500113
114 if (mask == 0xff)
115 {
116 png_memcpy(row, png_ptr->row_buf + 1,
117 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
118 }
119 /* GRR: add "else if (mask == 0)" case?
120 * or does png_combine_row() not even get called in that case? */
121 else
122 {
123 switch (png_ptr->row_info.pixel_depth)
124 {
125 case 1:
126 {
127 png_bytep sp;
128 png_bytep dp;
129 int s_inc, s_start, s_end;
130 int m;
131 int shift;
132 png_uint_32 i;
133
134 sp = png_ptr->row_buf + 1;
135 dp = row;
136 m = 0x80;
137#if defined(PNG_READ_PACKSWAP_SUPPORTED)
138 if (png_ptr->transformations & PNG_PACKSWAP)
139 {
140 s_start = 0;
141 s_end = 7;
142 s_inc = 1;
143 }
144 else
145#endif
146 {
147 s_start = 7;
148 s_end = 0;
149 s_inc = -1;
150 }
151
152 shift = s_start;
153
154 for (i = 0; i < png_ptr->width; i++)
155 {
156 if (m & mask)
157 {
158 int value;
159
160 value = (*sp >> shift) & 0x1;
161 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
162 *dp |= (png_byte)(value << shift);
163 }
164
165 if (shift == s_end)
166 {
167 shift = s_start;
168 sp++;
169 dp++;
170 }
171 else
172 shift += s_inc;
173
174 if (m == 1)
175 m = 0x80;
176 else
177 m >>= 1;
178 }
179 break;
180 }
181
182 case 2:
183 {
184 png_bytep sp;
185 png_bytep dp;
186 int s_start, s_end, s_inc;
187 int m;
188 int shift;
189 png_uint_32 i;
190 int value;
191
192 sp = png_ptr->row_buf + 1;
193 dp = row;
194 m = 0x80;
195#if defined(PNG_READ_PACKSWAP_SUPPORTED)
196 if (png_ptr->transformations & PNG_PACKSWAP)
197 {
198 s_start = 0;
199 s_end = 6;
200 s_inc = 2;
201 }
202 else
203#endif
204 {
205 s_start = 6;
206 s_end = 0;
207 s_inc = -2;
208 }
209
210 shift = s_start;
211
212 for (i = 0; i < png_ptr->width; i++)
213 {
214 if (m & mask)
215 {
216 value = (*sp >> shift) & 0x3;
217 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
218 *dp |= (png_byte)(value << shift);
219 }
220
221 if (shift == s_end)
222 {
223 shift = s_start;
224 sp++;
225 dp++;
226 }
227 else
228 shift += s_inc;
229 if (m == 1)
230 m = 0x80;
231 else
232 m >>= 1;
233 }
234 break;
235 }
236
237 case 4:
238 {
239 png_bytep sp;
240 png_bytep dp;
241 int s_start, s_end, s_inc;
242 int m;
243 int shift;
244 png_uint_32 i;
245 int value;
246
247 sp = png_ptr->row_buf + 1;
248 dp = row;
249 m = 0x80;
250#if defined(PNG_READ_PACKSWAP_SUPPORTED)
251 if (png_ptr->transformations & PNG_PACKSWAP)
252 {
253 s_start = 0;
254 s_end = 4;
255 s_inc = 4;
256 }
257 else
258#endif
259 {
260 s_start = 4;
261 s_end = 0;
262 s_inc = -4;
263 }
264 shift = s_start;
265
266 for (i = 0; i < png_ptr->width; i++)
267 {
268 if (m & mask)
269 {
270 value = (*sp >> shift) & 0xf;
271 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
272 *dp |= (png_byte)(value << shift);
273 }
274
275 if (shift == s_end)
276 {
277 shift = s_start;
278 sp++;
279 dp++;
280 }
281 else
282 shift += s_inc;
283 if (m == 1)
284 m = 0x80;
285 else
286 m >>= 1;
287 }
288 break;
289 }
290
291 case 8:
292 {
293 png_bytep srcptr;
294 png_bytep dstptr;
295 png_uint_32 len;
296 int m;
297 int diff, unmask;
298
299 __int64 mask0=0x0102040810204080;
300
301 if (mmx_supported)
302 {
303 srcptr = png_ptr->row_buf + 1;
304 dstptr = row;
305 m = 0x80;
306 unmask = ~mask;
307 len = png_ptr->width &~7; //reduce to multiple of 8
308 diff = png_ptr->width & 7; //amount lost
309
310 _asm
311 {
312 movd mm7, unmask //load bit pattern
313 psubb mm6,mm6 //zero mm6
314 punpcklbw mm7,mm7
315 punpcklwd mm7,mm7
316 punpckldq mm7,mm7 //fill register with 8 masks
317
318 movq mm0,mask0
319
320 pand mm0,mm7 //nonzero if keep byte
321 pcmpeqb mm0,mm6 //zeros->1s, v versa
322
323 mov ecx,len //load length of line (pixels)
324 mov esi,srcptr //load source
325 mov ebx,dstptr //load dest
326 cmp ecx,0 //lcr
327 je mainloop8end
328
329mainloop8:
330 movq mm4,[esi]
331 pand mm4,mm0
332 movq mm6,mm0
333 pandn mm6,[ebx]
334 por mm4,mm6
335 movq [ebx],mm4
336
337 add esi,8 //inc by 8 bytes processed
338 add ebx,8
339 sub ecx,8 //dec by 8 pixels processed
340
341 ja mainloop8
342mainloop8end:
343
344 mov ecx,diff
345 cmp ecx,0
346 jz end8
347
348 mov edx,mask
349 sal edx,24 //make low byte the high byte
350
351secondloop8:
352 sal edx,1 //move high bit to CF
353 jnc skip8 //if CF = 0
354 mov al,[esi]
355 mov [ebx],al
356skip8:
357 inc esi
358 inc ebx
359
360 dec ecx
361 jnz secondloop8
362end8:
363 emms
364 }
365 }
366 else /* mmx not supported - use modified C routine */
367 {
368 register unsigned int incr1, initial_val, final_val;
369 png_size_t pixel_bytes;
370 png_uint_32 i;
371 register int disp = png_pass_inc[png_ptr->pass];
372 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
373
374 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
375 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
376 pixel_bytes;
377 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
378 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
379 final_val = png_ptr->width*pixel_bytes;
380 incr1 = (disp)*pixel_bytes;
381 for (i = initial_val; i < final_val; i += incr1)
382 {
383 png_memcpy(dstptr, srcptr, pixel_bytes);
384 srcptr += incr1;
385 dstptr += incr1;
386 }
387 } /* end of else */
388
389 break;
390 } // end 8 bpp
391
392 case 16:
393 {
394 png_bytep srcptr;
395 png_bytep dstptr;
396 png_uint_32 len;
397 int unmask, diff;
398 __int64 mask1=0x0101020204040808,
399 mask0=0x1010202040408080;
400
401 if (mmx_supported)
402 {
403 srcptr = png_ptr->row_buf + 1;
404 dstptr = row;
405
406 unmask = ~mask;
407 len = (png_ptr->width)&~7;
408 diff = (png_ptr->width)&7;
409 _asm
410 {
411 movd mm7, unmask //load bit pattern
412 psubb mm6,mm6 //zero mm6
413 punpcklbw mm7,mm7
414 punpcklwd mm7,mm7
415 punpckldq mm7,mm7 //fill register with 8 masks
416
417 movq mm0,mask0
418 movq mm1,mask1
419
420 pand mm0,mm7
421 pand mm1,mm7
422
423 pcmpeqb mm0,mm6
424 pcmpeqb mm1,mm6
425
426 mov ecx,len //load length of line
427 mov esi,srcptr //load source
428 mov ebx,dstptr //load dest
429 cmp ecx,0 //lcr
430 jz mainloop16end
431
432mainloop16:
433 movq mm4,[esi]
434 pand mm4,mm0
435 movq mm6,mm0
436 movq mm7,[ebx]
437 pandn mm6,mm7
438 por mm4,mm6
439 movq [ebx],mm4
440
441 movq mm5,[esi+8]
442 pand mm5,mm1
443 movq mm7,mm1
444 movq mm6,[ebx+8]
445 pandn mm7,mm6
446 por mm5,mm7
447 movq [ebx+8],mm5
448
449 add esi,16 //inc by 16 bytes processed
450 add ebx,16
451 sub ecx,8 //dec by 8 pixels processed
452
453 ja mainloop16
454
455mainloop16end:
456 mov ecx,diff
457 cmp ecx,0
458 jz end16
459
460 mov edx,mask
461 sal edx,24 //make low byte the high byte
462secondloop16:
463 sal edx,1 //move high bit to CF
464 jnc skip16 //if CF = 0
465 mov ax,[esi]
466 mov [ebx],ax
467skip16:
468 add esi,2
469 add ebx,2
470
471 dec ecx
472 jnz secondloop16
473end16:
474 emms
475 }
476 }
477 else /* mmx not supported - use modified C routine */
478 {
479 register unsigned int incr1, initial_val, final_val;
480 png_size_t pixel_bytes;
481 png_uint_32 i;
482 register int disp = png_pass_inc[png_ptr->pass];
483 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
484
485 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
486 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
487 pixel_bytes;
488 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
489 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
490 final_val = png_ptr->width*pixel_bytes;
491 incr1 = (disp)*pixel_bytes;
492 for (i = initial_val; i < final_val; i += incr1)
493 {
494 png_memcpy(dstptr, srcptr, pixel_bytes);
495 srcptr += incr1;
496 dstptr += incr1;
497 }
498 } /* end of else */
499
500 break;
501 } // end 16 bpp
502
503 case 24:
504 {
505 png_bytep srcptr;
506 png_bytep dstptr;
507 png_uint_32 len;
508 int unmask, diff;
509
510 __int64 mask2=0x0101010202020404, //24bpp
511 mask1=0x0408080810101020,
512 mask0=0x2020404040808080;
513
514 srcptr = png_ptr->row_buf + 1;
515 dstptr = row;
516
517 unmask = ~mask;
518 len = (png_ptr->width)&~7;
519 diff = (png_ptr->width)&7;
520
521 if (mmx_supported)
522 {
523 _asm
524 {
525 movd mm7, unmask //load bit pattern
526 psubb mm6,mm6 //zero mm6
527 punpcklbw mm7,mm7
528 punpcklwd mm7,mm7
529 punpckldq mm7,mm7 //fill register with 8 masks
530
531 movq mm0,mask0
532 movq mm1,mask1
533 movq mm2,mask2
534
535 pand mm0,mm7
536 pand mm1,mm7
537 pand mm2,mm7
538
539 pcmpeqb mm0,mm6
540 pcmpeqb mm1,mm6
541 pcmpeqb mm2,mm6
542
543 mov ecx,len //load length of line
544 mov esi,srcptr //load source
545 mov ebx,dstptr //load dest
546 cmp ecx,0
547 jz mainloop24end
548
549mainloop24:
550 movq mm4,[esi]
551 pand mm4,mm0
552 movq mm6,mm0
553 movq mm7,[ebx]
554 pandn mm6,mm7
555 por mm4,mm6
556 movq [ebx],mm4
557
558
559 movq mm5,[esi+8]
560 pand mm5,mm1
561 movq mm7,mm1
562 movq mm6,[ebx+8]
563 pandn mm7,mm6
564 por mm5,mm7
565 movq [ebx+8],mm5
566
567 movq mm6,[esi+16]
568 pand mm6,mm2
569 movq mm4,mm2
570 movq mm7,[ebx+16]
571 pandn mm4,mm7
572 por mm6,mm4
573 movq [ebx+16],mm6
574
575 add esi,24 //inc by 24 bytes processed
576 add ebx,24
577 sub ecx,8 //dec by 8 pixels processed
578
579 ja mainloop24
580
581mainloop24end:
582 mov ecx,diff
583 cmp ecx,0
584 jz end24
585
586 mov edx,mask
587 sal edx,24 //make low byte the high byte
588secondloop24:
589 sal edx,1 //move high bit to CF
590 jnc skip24 //if CF = 0
591 mov ax,[esi]
592 mov [ebx],ax
593 xor eax,eax
594 mov al,[esi+2]
595 mov [ebx+2],al
596skip24:
597 add esi,3
598 add ebx,3
599
600 dec ecx
601 jnz secondloop24
602
603end24:
604 emms
605 }
606 }
607 else /* mmx not supported - use modified C routine */
608 {
609 register unsigned int incr1, initial_val, final_val;
610 png_size_t pixel_bytes;
611 png_uint_32 i;
612 register int disp = png_pass_inc[png_ptr->pass];
613 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
614
615 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
616 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
617 pixel_bytes;
618 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
619 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
620 final_val = png_ptr->width*pixel_bytes;
621 incr1 = (disp)*pixel_bytes;
622 for (i = initial_val; i < final_val; i += incr1)
623 {
624 png_memcpy(dstptr, srcptr, pixel_bytes);
625 srcptr += incr1;
626 dstptr += incr1;
627 }
628 } /* end of else */
629
630 break;
631 } // end 24 bpp
632
633 case 32:
634 {
635 png_bytep srcptr;
636 png_bytep dstptr;
637 png_uint_32 len;
638 int unmask, diff;
639
640 __int64 mask3=0x0101010102020202, //32bpp
641 mask2=0x0404040408080808,
642 mask1=0x1010101020202020,
643 mask0=0x4040404080808080;
644
645 srcptr = png_ptr->row_buf + 1;
646 dstptr = row;
647
648 unmask = ~mask;
649 len = (png_ptr->width)&~7;
650 diff = (png_ptr->width)&7;
651
652 if (mmx_supported)
653 {
654 _asm
655 {
656 movd mm7, unmask //load bit pattern
657 psubb mm6,mm6 //zero mm6
658 punpcklbw mm7,mm7
659 punpcklwd mm7,mm7
660 punpckldq mm7,mm7 //fill register with 8 masks
661
662 movq mm0,mask0
663 movq mm1,mask1
664 movq mm2,mask2
665 movq mm3,mask3
666
667 pand mm0,mm7
668 pand mm1,mm7
669 pand mm2,mm7
670 pand mm3,mm7
671
672 pcmpeqb mm0,mm6
673 pcmpeqb mm1,mm6
674 pcmpeqb mm2,mm6
675 pcmpeqb mm3,mm6
676
677 mov ecx,len //load length of line
678 mov esi,srcptr //load source
679 mov ebx,dstptr //load dest
680
681 cmp ecx,0 //lcr
682 jz mainloop32end
683
684mainloop32:
685 movq mm4,[esi]
686 pand mm4,mm0
687 movq mm6,mm0
688 movq mm7,[ebx]
689 pandn mm6,mm7
690 por mm4,mm6
691 movq [ebx],mm4
692
693 movq mm5,[esi+8]
694 pand mm5,mm1
695 movq mm7,mm1
696 movq mm6,[ebx+8]
697 pandn mm7,mm6
698 por mm5,mm7
699 movq [ebx+8],mm5
700
701 movq mm6,[esi+16]
702 pand mm6,mm2
703 movq mm4,mm2
704 movq mm7,[ebx+16]
705 pandn mm4,mm7
706 por mm6,mm4
707 movq [ebx+16],mm6
708
709 movq mm7,[esi+24]
710 pand mm7,mm3
711 movq mm5,mm3
712 movq mm4,[ebx+24]
713 pandn mm5,mm4
714 por mm7,mm5
715 movq [ebx+24],mm7
716
717 add esi,32 //inc by 32 bytes processed
718 add ebx,32
719 sub ecx,8 //dec by 8 pixels processed
720
721 ja mainloop32
722
723mainloop32end:
724 mov ecx,diff
725 cmp ecx,0
726 jz end32
727
728 mov edx,mask
729 sal edx,24 //make low byte the high byte
730secondloop32:
731 sal edx,1 //move high bit to CF
732 jnc skip32 //if CF = 0
733 mov eax,[esi]
734 mov [ebx],eax
735skip32:
736 add esi,4
737 add ebx,4
738
739 dec ecx
740 jnz secondloop32
741
742end32:
743 emms
744 }
745 }
746 else /* mmx _not supported - Use modified C routine */
747 {
748 register unsigned int incr1, initial_val, final_val;
749 png_size_t pixel_bytes;
750 png_uint_32 i;
751 register int disp = png_pass_inc[png_ptr->pass];
752 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
753
754 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
755 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
756 pixel_bytes;
757 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
758 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
759 final_val = png_ptr->width*pixel_bytes;
760 incr1 = (disp)*pixel_bytes;
761 for (i = initial_val; i < final_val; i += incr1)
762 {
763 png_memcpy(dstptr, srcptr, pixel_bytes);
764 srcptr += incr1;
765 dstptr += incr1;
766 }
767 } /* end of else */
768
769 break;
770 } // end 32 bpp
771
772 case 48:
773 {
774 png_bytep srcptr;
775 png_bytep dstptr;
776 png_uint_32 len;
777 int unmask, diff;
778
779 __int64 mask5=0x0101010101010202,
780 mask4=0x0202020204040404,
781 mask3=0x0404080808080808,
782 mask2=0x1010101010102020,
783 mask1=0x2020202040404040,
784 mask0=0x4040808080808080;
785
786 if (mmx_supported)
787 {
788 srcptr = png_ptr->row_buf + 1;
789 dstptr = row;
790
791 unmask = ~mask;
792 len = (png_ptr->width)&~7;
793 diff = (png_ptr->width)&7;
794 _asm
795 {
796 movd mm7, unmask //load bit pattern
797 psubb mm6,mm6 //zero mm6
798 punpcklbw mm7,mm7
799 punpcklwd mm7,mm7
800 punpckldq mm7,mm7 //fill register with 8 masks
801
802 movq mm0,mask0
803 movq mm1,mask1
804 movq mm2,mask2
805 movq mm3,mask3
806 movq mm4,mask4
807 movq mm5,mask5
808
809 pand mm0,mm7
810 pand mm1,mm7
811 pand mm2,mm7
812 pand mm3,mm7
813 pand mm4,mm7
814 pand mm5,mm7
815
816 pcmpeqb mm0,mm6
817 pcmpeqb mm1,mm6
818 pcmpeqb mm2,mm6
819 pcmpeqb mm3,mm6
820 pcmpeqb mm4,mm6
821 pcmpeqb mm5,mm6
822
823 mov ecx,len //load length of line
824 mov esi,srcptr //load source
825 mov ebx,dstptr //load dest
826
827 cmp ecx,0
828 jz mainloop48end
829
830mainloop48:
831 movq mm7,[esi]
832 pand mm7,mm0
833 movq mm6,mm0
834 pandn mm6,[ebx]
835 por mm7,mm6
836 movq [ebx],mm7
837
838 movq mm6,[esi+8]
839 pand mm6,mm1
840 movq mm7,mm1
841 pandn mm7,[ebx+8]
842 por mm6,mm7
843 movq [ebx+8],mm6
844
845 movq mm6,[esi+16]
846 pand mm6,mm2
847 movq mm7,mm2
848 pandn mm7,[ebx+16]
849 por mm6,mm7
850 movq [ebx+16],mm6
851
852 movq mm7,[esi+24]
853 pand mm7,mm3
854 movq mm6,mm3
855 pandn mm6,[ebx+24]
856 por mm7,mm6
857 movq [ebx+24],mm7
858
859 movq mm6,[esi+32]
860 pand mm6,mm4
861 movq mm7,mm4
862 pandn mm7,[ebx+32]
863 por mm6,mm7
864 movq [ebx+32],mm6
865
866 movq mm7,[esi+40]
867 pand mm7,mm5
868 movq mm6,mm5
869 pandn mm6,[ebx+40]
870 por mm7,mm6
871 movq [ebx+40],mm7
872
873 add esi,48 //inc by 32 bytes processed
874 add ebx,48
875 sub ecx,8 //dec by 8 pixels processed
876
877 ja mainloop48
878mainloop48end:
879
880 mov ecx,diff
881 cmp ecx,0
882 jz end48
883
884 mov edx,mask
885 sal edx,24 //make low byte the high byte
886
887secondloop48:
888 sal edx,1 //move high bit to CF
889 jnc skip48 //if CF = 0
890 mov eax,[esi]
891 mov [ebx],eax
892skip48:
893 add esi,4
894 add ebx,4
895
896 dec ecx
897 jnz secondloop48
898
899end48:
900 emms
901 }
902 }
903 else /* mmx _not supported - Use modified C routine */
904 {
905 register unsigned int incr1, initial_val, final_val;
906 png_size_t pixel_bytes;
907 png_uint_32 i;
908 register int disp = png_pass_inc[png_ptr->pass];
909 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
910
911 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
912 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
913 pixel_bytes;
914 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
915 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
916 final_val = png_ptr->width*pixel_bytes;
917 incr1 = (disp)*pixel_bytes;
918 for (i = initial_val; i < final_val; i += incr1)
919 {
920 png_memcpy(dstptr, srcptr, pixel_bytes);
921 srcptr += incr1;
922 dstptr += incr1;
923 }
924 } /* end of else */
925
926 break;
927 } // end 48 bpp
928
929 default:
930 {
931 png_bytep sptr;
932 png_bytep dp;
933 png_size_t pixel_bytes;
934 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
935 unsigned int i;
936 register int disp = png_pass_inc[png_ptr->pass]; // get the offset
937 register unsigned int incr1, initial_val, final_val;
938
939 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
940 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
941 pixel_bytes;
942 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
943 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
944 final_val = png_ptr->width*pixel_bytes;
945 incr1 = (disp)*pixel_bytes;
946 for (i = initial_val; i < final_val; i += incr1)
947 {
948 png_memcpy(dp, sptr, pixel_bytes);
949 sptr += incr1;
950 dp += incr1;
951 }
952 break;
953 }
954 } /* end switch (png_ptr->row_info.pixel_depth) */
955 } /* end if (non-trivial mask) */
956
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500957#ifdef DISABLE_PNGVCRD_COMBINE
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500958 mmx_supported = save_mmx_supported;
959#endif
960
961} /* end png_combine_row() */
962
963
964#if defined(PNG_READ_INTERLACING_SUPPORTED)
965
966void
967png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass,
968 png_uint_32 transformations)
969{
Glenn Randers-Pehrson5379b241999-11-27 10:22:33 -0600970 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500971#ifdef DISABLE_PNGVCRD_INTERLACE
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500972 int save_mmx_supported = mmx_supported;
973#endif
974
975 png_debug(1,"in png_do_read_interlace\n");
976
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500977#ifdef DISABLE_PNGVCRD_INTERLACE
978 /* In libpng versions 1.0.3a through 1.0.4d,
979 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
980 * in bad pixels at the beginning of some rows of some images, and also
981 * (due to out-of-range memory reads and writes) caused heap corruption
982 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e,
983 * and the code appears to work completely correctly, so it is enabled
984 * by default.
985 */
986 if (1) /* all passes caused a heap problem in the old code */
987 mmx_supported = 0;
988 else
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500989#endif
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500990 if (mmx_supported == 2)
991 mmx_supported = mmxsupport();
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500992
993 if (row != NULL && row_info != NULL)
994 {
995 png_uint_32 final_width;
996
997 final_width = row_info->width * png_pass_inc[pass];
998
999 switch (row_info->pixel_depth)
1000 {
1001 case 1:
1002 {
1003 png_bytep sp, dp;
1004 int sshift, dshift;
1005 int s_start, s_end, s_inc;
1006 png_byte v;
1007 png_uint_32 i;
1008 int j;
1009
1010 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1011 dp = row + (png_size_t)((final_width - 1) >> 3);
1012#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1013 if (transformations & PNG_PACKSWAP)
1014 {
1015 sshift = (int)((row_info->width + 7) & 7);
1016 dshift = (int)((final_width + 7) & 7);
1017 s_start = 7;
1018 s_end = 0;
1019 s_inc = -1;
1020 }
1021 else
1022#endif
1023 {
1024 sshift = 7 - (int)((row_info->width + 7) & 7);
1025 dshift = 7 - (int)((final_width + 7) & 7);
1026 s_start = 0;
1027 s_end = 7;
1028 s_inc = 1;
1029 }
1030
1031 for (i = row_info->width; i; i--)
1032 {
1033 v = (png_byte)((*sp >> sshift) & 0x1);
1034 for (j = 0; j < png_pass_inc[pass]; j++)
1035 {
1036 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1037 *dp |= (png_byte)(v << dshift);
1038 if (dshift == s_end)
1039 {
1040 dshift = s_start;
1041 dp--;
1042 }
1043 else
1044 dshift += s_inc;
1045 }
1046 if (sshift == s_end)
1047 {
1048 sshift = s_start;
1049 sp--;
1050 }
1051 else
1052 sshift += s_inc;
1053 }
1054 break;
1055 }
1056
1057 case 2:
1058 {
1059 png_bytep sp, dp;
1060 int sshift, dshift;
1061 int s_start, s_end, s_inc;
1062 png_uint_32 i;
1063
1064 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1065 dp = row + (png_size_t)((final_width - 1) >> 2);
1066#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1067 if (transformations & PNG_PACKSWAP)
1068 {
1069 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1070 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1071 s_start = 6;
1072 s_end = 0;
1073 s_inc = -2;
1074 }
1075 else
1076#endif
1077 {
1078 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1079 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1080 s_start = 0;
1081 s_end = 6;
1082 s_inc = 2;
1083 }
1084
1085 for (i = row_info->width; i; i--)
1086 {
1087 png_byte v;
1088 int j;
1089
1090 v = (png_byte)((*sp >> sshift) & 0x3);
1091 for (j = 0; j < png_pass_inc[pass]; j++)
1092 {
1093 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1094 *dp |= (png_byte)(v << dshift);
1095 if (dshift == s_end)
1096 {
1097 dshift = s_start;
1098 dp--;
1099 }
1100 else
1101 dshift += s_inc;
1102 }
1103 if (sshift == s_end)
1104 {
1105 sshift = s_start;
1106 sp--;
1107 }
1108 else
1109 sshift += s_inc;
1110 }
1111 break;
1112 }
1113
1114 case 4:
1115 {
1116 png_bytep sp, dp;
1117 int sshift, dshift;
1118 int s_start, s_end, s_inc;
1119 png_uint_32 i;
1120
1121 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1122 dp = row + (png_size_t)((final_width - 1) >> 1);
1123#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1124 if (transformations & PNG_PACKSWAP)
1125 {
1126 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1127 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1128 s_start = 4;
1129 s_end = 0;
1130 s_inc = -4;
1131 }
1132 else
1133#endif
1134 {
1135 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1136 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1137 s_start = 0;
1138 s_end = 4;
1139 s_inc = 4;
1140 }
1141
1142 for (i = row_info->width; i; i--)
1143 {
1144 png_byte v;
1145 int j;
1146
1147 v = (png_byte)((*sp >> sshift) & 0xf);
1148 for (j = 0; j < png_pass_inc[pass]; j++)
1149 {
1150 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1151 *dp |= (png_byte)(v << dshift);
1152 if (dshift == s_end)
1153 {
1154 dshift = s_start;
1155 dp--;
1156 }
1157 else
1158 dshift += s_inc;
1159 }
1160 if (sshift == s_end)
1161 {
1162 sshift = s_start;
1163 sp--;
1164 }
1165 else
1166 sshift += s_inc;
1167 }
1168 break;
1169 }
1170
1171 default: // This is the place where the routine is modified
1172 {
1173 __int64 const4 = 0x0000000000FFFFFF;
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001174 // __int64 const5 = 0x000000FFFFFF0000; // unused...
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001175 __int64 const6 = 0x00000000000000FF;
1176 png_bytep sptr, dp;
1177 png_uint_32 i;
1178 png_size_t pixel_bytes;
1179 int width = row_info->width;
1180
1181 pixel_bytes = (row_info->pixel_depth >> 3);
1182
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001183 sptr = row + (width - 1) * pixel_bytes;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001184 dp = row + (final_width - 1) * pixel_bytes;
1185 // New code by Nirav Chhatrapati - Intel Corporation
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001186 // sign fix by GRR
1187 // NOTE: there is NO MMX code for 48-bit and 64-bit images
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001188
1189 if (mmx_supported) // use MMX routine if machine supports it
1190 {
1191 if (pixel_bytes == 3)
1192 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001193 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001194 {
1195 _asm
1196 {
1197 mov esi, sptr
1198 mov edi, dp
1199 mov ecx, width
1200 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1201loop_pass0:
1202 movd mm0, [esi] ; X X X X X v2 v1 v0
1203 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1204 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1205 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1206 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1207 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1208 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1209 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1210 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1211 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1212 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1213 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1214 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1215 movq [edi+16] , mm4
1216 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1217 movq [edi+8] , mm3
1218 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1219 sub esi, 3
1220 movq [edi], mm0
1221 sub edi, 24
1222 //sub esi, 3
1223 dec ecx
1224 jnz loop_pass0
1225 EMMS
1226 }
1227 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001228 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001229 {
1230 _asm
1231 {
1232 mov esi, sptr
1233 mov edi, dp
1234 mov ecx, width
1235 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1236loop_pass2:
1237 movd mm0, [esi] ; X X X X X v2 v1 v0
1238 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1239 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1240 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1241 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1242 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1243 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1244 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1245 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1246 movq [edi+4], mm0 ; move to memory
1247 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1248 movd [edi], mm0 ; move to memory
1249 sub esi, 3
1250 sub edi, 12
1251 dec ecx
1252 jnz loop_pass2
1253 EMMS
1254 }
1255 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001256 else if (width) /* && ((pass == 4) || (pass == 5)) */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001257 {
1258 int width_mmx = ((width >> 1) << 1) - 8;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001259 if (width_mmx < 0)
1260 width_mmx = 0;
1261 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001262 if (width_mmx)
1263 {
1264 _asm
1265 {
1266 mov esi, sptr
1267 mov edi, dp
1268 mov ecx, width_mmx
1269 sub esi, 3
1270 sub edi, 9
1271loop_pass4:
1272 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1273 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1274 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1275 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1276 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1277 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1278 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1279 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1280 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1281 movq [edi], mm0 ; move quad to memory
1282 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1283 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1284 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1285 movd [edi+8], mm6 ; move double to memory
1286 sub esi, 6
1287 sub edi, 12
1288 sub ecx, 2
1289 jnz loop_pass4
1290 EMMS
1291 }
1292 }
1293
1294 sptr -= width_mmx*3;
1295 dp -= width_mmx*6;
1296 for (i = width; i; i--)
1297 {
1298 png_byte v[8];
1299 int j;
1300
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001301 png_memcpy(v, sptr, 3);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001302 for (j = 0; j < png_pass_inc[pass]; j++)
1303 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001304 png_memcpy(dp, v, 3);
1305 dp -= 3;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001306 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001307 sptr -= 3;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001308 }
1309 }
1310 } /* end of pixel_bytes == 3 */
1311
1312 else if (pixel_bytes == 1)
1313 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001314 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001315 {
1316 int width_mmx = ((width >> 2) << 2);
1317 width -= width_mmx;
1318 if (width_mmx)
1319 {
1320 _asm
1321 {
1322 mov esi, sptr
1323 mov edi, dp
1324 mov ecx, width_mmx
1325 sub edi, 31
1326 sub esi, 3
1327loop1_pass0:
1328 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1329 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1330 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1331 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1332 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1333 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1334 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1335 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1336 movq [edi], mm0 ; move to memory v3
1337 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1338 movq [edi+8], mm3 ; move to memory v2
1339 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1340 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1341 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1342 movq [edi+16], mm2 ; move to memory v1
1343 movq [edi+24], mm4 ; move to memory v0
1344 sub esi, 4
1345 sub edi, 32
1346 sub ecx, 4
1347 jnz loop1_pass0
1348 EMMS
1349 }
1350 }
1351
1352 sptr -= width_mmx;
1353 dp -= width_mmx*8;
1354 for (i = width; i; i--)
1355 {
1356 int j;
1357
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001358 /* I simplified this part in version 1.0.4e
1359 * here and in several other instances where
1360 * pixel_bytes == 1 -- GR-P
1361 *
1362 * Original code:
1363 *
1364 * png_byte v[8];
1365 * png_memcpy(v, sptr, pixel_bytes);
1366 * for (j = 0; j < png_pass_inc[pass]; j++)
1367 * {
1368 * png_memcpy(dp, v, pixel_bytes);
1369 * dp -= pixel_bytes;
1370 * }
1371 * sptr -= pixel_bytes;
1372 *
1373 * Replacement code is in the next three lines:
1374 */
1375
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001376 for (j = 0; j < png_pass_inc[pass]; j++)
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001377 *dp-- = *sptr;
1378 sptr--;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001379 }
1380 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001381 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001382 {
1383 int width_mmx = ((width >> 2) << 2);
1384 width -= width_mmx;
1385 if (width_mmx)
1386 {
1387 _asm
1388 {
1389 mov esi, sptr
1390 mov edi, dp
1391 mov ecx, width_mmx
1392 sub edi, 15
1393 sub esi, 3
1394loop1_pass2:
1395 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1396 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1397 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1398 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1399 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1400 movq [edi], mm0 ; move to memory v2 and v3
1401 sub esi, 4
1402 movq [edi+8], mm1 ; move to memory v1 and v0
1403 sub edi, 16
1404 sub ecx, 4
1405 jnz loop1_pass2
1406 EMMS
1407 }
1408 }
1409
1410 sptr -= width_mmx;
1411 dp -= width_mmx*4;
1412 for (i = width; i; i--)
1413 {
1414 int j;
1415
1416 for (j = 0; j < png_pass_inc[pass]; j++)
1417 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001418 *dp-- = *sptr;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001419 }
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001420 sptr --;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001421 }
1422 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001423 else if (width) /* && ((pass == 4) || (pass == 5))) */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001424 {
1425 int width_mmx = ((width >> 3) << 3);
1426 width -= width_mmx;
1427 if (width_mmx)
1428 {
1429 _asm
1430 {
1431 mov esi, sptr
1432 mov edi, dp
1433 mov ecx, width_mmx
1434 sub edi, 15
1435 sub esi, 7
1436loop1_pass4:
1437 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1438 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1439 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1440 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1441 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1442 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1443 sub esi, 8
1444 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1445 //sub esi, 4
1446 sub edi, 16
1447 sub ecx, 8
1448 jnz loop1_pass4
1449 EMMS
1450 }
1451 }
1452
1453 sptr -= width_mmx;
1454 dp -= width_mmx*2;
1455 for (i = width; i; i--)
1456 {
1457 int j;
1458
1459 for (j = 0; j < png_pass_inc[pass]; j++)
1460 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001461 *dp-- = *sptr;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001462 }
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001463 sptr --;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001464 }
1465 }
1466 } /* end of pixel_bytes == 1 */
1467
1468 else if (pixel_bytes == 2)
1469 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001470 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001471 {
1472 int width_mmx = ((width >> 1) << 1);
1473 width -= width_mmx;
1474 if (width_mmx)
1475 {
1476 _asm
1477 {
1478 mov esi, sptr
1479 mov edi, dp
1480 mov ecx, width_mmx
1481 sub esi, 2
1482 sub edi, 30
1483loop2_pass0:
1484 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1485 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1486 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1487 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1488 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1489 movq [edi], mm0
1490 movq [edi + 8], mm0
1491 movq [edi + 16], mm1
1492 movq [edi + 24], mm1
1493 sub esi, 4
1494 sub edi, 32
1495 sub ecx, 2
1496 jnz loop2_pass0
1497 EMMS
1498 }
1499 }
1500
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001501 sptr -= (width_mmx*2 - 2); // sign fixed
1502 dp -= (width_mmx*16 - 2); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001503 for (i = width; i; i--)
1504 {
1505 png_byte v[8];
1506 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001507 sptr -= 2;
1508 png_memcpy(v, sptr, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001509 for (j = 0; j < png_pass_inc[pass]; j++)
1510 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001511 dp -= 2;
1512 png_memcpy(dp, v, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001513 }
1514 }
1515 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001516 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001517 {
1518 int width_mmx = ((width >> 1) << 1) ;
1519 width -= width_mmx;
1520 if (width_mmx)
1521 {
1522 _asm
1523 {
1524 mov esi, sptr
1525 mov edi, dp
1526 mov ecx, width_mmx
1527 sub esi, 2
1528 sub edi, 14
1529loop2_pass2:
1530 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1531 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1532 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1533 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1534 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1535 movq [edi], mm0
1536 sub esi, 4
1537 movq [edi + 8], mm1
1538 //sub esi, 4
1539 sub edi, 16
1540 sub ecx, 2
1541 jnz loop2_pass2
1542 EMMS
1543 }
1544 }
1545
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001546 sptr -= (width_mmx*2 - 2); // sign fixed
1547 dp -= (width_mmx*8 - 2); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001548 for (i = width; i; i--)
1549 {
1550 png_byte v[8];
1551 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001552 sptr -= 2;
1553 png_memcpy(v, sptr, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001554 for (j = 0; j < png_pass_inc[pass]; j++)
1555 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001556 dp -= 2;
1557 png_memcpy(dp, v, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001558 }
1559 }
1560 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001561 else if (width) // pass == 4 or 5
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001562 {
1563 int width_mmx = ((width >> 1) << 1) ;
1564 width -= width_mmx;
1565 if (width_mmx)
1566 {
1567 _asm
1568 {
1569 mov esi, sptr
1570 mov edi, dp
1571 mov ecx, width_mmx
1572 sub esi, 2
1573 sub edi, 6
1574loop2_pass4:
1575 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1576 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1577 sub esi, 4
1578 movq [edi], mm0
1579 sub edi, 8
1580 sub ecx, 2
1581 jnz loop2_pass4
1582 EMMS
1583 }
1584 }
1585
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001586 sptr -= (width_mmx*2 - 2); // sign fixed
1587 dp -= (width_mmx*4 - 2); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001588 for (i = width; i; i--)
1589 {
1590 png_byte v[8];
1591 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001592 sptr -= 2;
1593 png_memcpy(v, sptr, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001594 for (j = 0; j < png_pass_inc[pass]; j++)
1595 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001596 dp -= 2;
1597 png_memcpy(dp, v, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001598 }
1599 }
1600 }
1601 } /* end of pixel_bytes == 2 */
1602
1603 else if (pixel_bytes == 4)
1604 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001605 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001606 {
1607 int width_mmx = ((width >> 1) << 1) ;
1608 width -= width_mmx;
1609 if (width_mmx)
1610 {
1611 _asm
1612 {
1613 mov esi, sptr
1614 mov edi, dp
1615 mov ecx, width_mmx
1616 sub esi, 4
1617 sub edi, 60
1618loop4_pass0:
1619 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1620 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1621 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1622 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1623 movq [edi], mm0
1624 movq [edi + 8], mm0
1625 movq [edi + 16], mm0
1626 movq [edi + 24], mm0
1627 movq [edi+32], mm1
1628 movq [edi + 40], mm1
1629 movq [edi+ 48], mm1
1630 sub esi, 8
1631 movq [edi + 56], mm1
1632 sub edi, 64
1633 sub ecx, 2
1634 jnz loop4_pass0
1635 EMMS
1636 }
1637 }
1638
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001639 sptr -= (width_mmx*4 - 4); // sign fixed
1640 dp -= (width_mmx*32 - 4); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001641 for (i = width; i; i--)
1642 {
1643 png_byte v[8];
1644 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001645 sptr -= 4;
1646 png_memcpy(v, sptr, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001647 for (j = 0; j < png_pass_inc[pass]; j++)
1648 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001649 dp -= 4;
1650 png_memcpy(dp, v, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001651 }
1652 }
1653 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001654 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001655 {
1656 int width_mmx = ((width >> 1) << 1) ;
1657 width -= width_mmx;
1658 if (width_mmx)
1659 {
1660 _asm
1661 {
1662 mov esi, sptr
1663 mov edi, dp
1664 mov ecx, width_mmx
1665 sub esi, 4
1666 sub edi, 28
1667loop4_pass2:
1668 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1669 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1670 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1671 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1672 movq [edi], mm0
1673 movq [edi + 8], mm0
1674 movq [edi+16], mm1
1675 movq [edi + 24], mm1
1676 sub esi, 8
1677 sub edi, 32
1678 sub ecx, 2
1679 jnz loop4_pass2
1680 EMMS
1681 }
1682 }
1683
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001684 sptr -= (width_mmx*4 - 4); // sign fixed
1685 dp -= (width_mmx*16 - 4); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001686 for (i = width; i; i--)
1687 {
1688 png_byte v[8];
1689 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001690 sptr -= 4;
1691 png_memcpy(v, sptr, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001692 for (j = 0; j < png_pass_inc[pass]; j++)
1693 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001694 dp -= 4;
1695 png_memcpy(dp, v, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001696 }
1697 }
1698 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001699 else if (width) // pass == 4 or 5
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001700 {
1701 int width_mmx = ((width >> 1) << 1) ;
1702 width -= width_mmx;
1703 if (width_mmx)
1704 {
1705 _asm
1706 {
1707 mov esi, sptr
1708 mov edi, dp
1709 mov ecx, width_mmx
1710 sub esi, 4
1711 sub edi, 12
1712loop4_pass4:
1713 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1714 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1715 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1716 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1717 movq [edi], mm0
1718 sub esi, 8
1719 movq [edi + 8], mm1
1720 sub edi, 16
1721 sub ecx, 2
1722 jnz loop4_pass4
1723 EMMS
1724 }
1725 }
1726
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001727 sptr -= (width_mmx*4 - 4); // sign fixed
1728 dp -= (width_mmx*8 - 4); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001729 for (i = width; i; i--)
1730 {
1731 png_byte v[8];
1732 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001733 sptr -= 4;
1734 png_memcpy(v, sptr, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001735 for (j = 0; j < png_pass_inc[pass]; j++)
1736 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001737 dp -= 4;
1738 png_memcpy(dp, v, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001739 }
1740 }
1741 }
1742
1743 } /* end of pixel_bytes == 4 */
1744
1745 else if (pixel_bytes == 6)
1746 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001747 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001748 {
1749 png_byte v[8];
1750 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001751 png_memcpy(v, sptr, 6);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001752 for (j = 0; j < png_pass_inc[pass]; j++)
1753 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001754 png_memcpy(dp, v, 6);
1755 dp -= 6;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001756 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001757 sptr -= 6;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001758 }
1759 } /* end of pixel_bytes == 6 */
1760
1761 else
1762 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001763 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001764 {
1765 png_byte v[8];
1766 int j;
1767 png_memcpy(v, sptr, pixel_bytes);
1768 for (j = 0; j < png_pass_inc[pass]; j++)
1769 {
1770 png_memcpy(dp, v, pixel_bytes);
1771 dp -= pixel_bytes;
1772 }
1773 sptr-= pixel_bytes;
1774 }
1775 }
1776 } /* end of mmx_supported */
1777
1778 else /* MMX not supported: use modified C code - takes advantage
1779 * of inlining of memcpy for a constant */
1780 {
1781 if (pixel_bytes == 1)
1782 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001783 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001784 {
1785 int j;
1786 for (j = 0; j < png_pass_inc[pass]; j++)
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001787 *dp-- = *sptr;
1788 sptr--;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001789 }
1790 }
1791 else if (pixel_bytes == 3)
1792 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001793 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001794 {
1795 png_byte v[8];
1796 int j;
1797 png_memcpy(v, sptr, pixel_bytes);
1798 for (j = 0; j < png_pass_inc[pass]; j++)
1799 {
1800 png_memcpy(dp, v, pixel_bytes);
1801 dp -= pixel_bytes;
1802 }
1803 sptr -= pixel_bytes;
1804 }
1805 }
1806 else if (pixel_bytes == 2)
1807 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001808 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001809 {
1810 png_byte v[8];
1811 int j;
1812 png_memcpy(v, sptr, pixel_bytes);
1813 for (j = 0; j < png_pass_inc[pass]; j++)
1814 {
1815 png_memcpy(dp, v, pixel_bytes);
1816 dp -= pixel_bytes;
1817 }
1818 sptr -= pixel_bytes;
1819 }
1820 }
1821 else if (pixel_bytes == 4)
1822 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001823 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001824 {
1825 png_byte v[8];
1826 int j;
1827 png_memcpy(v, sptr, pixel_bytes);
1828 for (j = 0; j < png_pass_inc[pass]; j++)
1829 {
1830 png_memcpy(dp, v, pixel_bytes);
1831 dp -= pixel_bytes;
1832 }
1833 sptr -= pixel_bytes;
1834 }
1835 }
1836 else if (pixel_bytes == 6)
1837 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001838 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001839 {
1840 png_byte v[8];
1841 int j;
1842 png_memcpy(v, sptr, pixel_bytes);
1843 for (j = 0; j < png_pass_inc[pass]; j++)
1844 {
1845 png_memcpy(dp, v, pixel_bytes);
1846 dp -= pixel_bytes;
1847 }
1848 sptr -= pixel_bytes;
1849 }
1850 }
1851 else
1852 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001853 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001854 {
1855 png_byte v[8];
1856 int j;
1857 png_memcpy(v, sptr, pixel_bytes);
1858 for (j = 0; j < png_pass_inc[pass]; j++)
1859 {
1860 png_memcpy(dp, v, pixel_bytes);
1861 dp -= pixel_bytes;
1862 }
1863 sptr -= pixel_bytes;
1864 }
1865 }
1866
1867 } /* end of MMX not supported */
1868 break;
1869 }
1870 } /* end switch (row_info->pixel_depth) */
1871
1872 row_info->width = final_width;
1873 row_info->rowbytes = ((final_width *
1874 (png_uint_32)row_info->pixel_depth + 7) >> 3);
1875 }
1876
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001877#ifdef DISABLE_PNGVCRD_INTERLACE
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001878 mmx_supported = save_mmx_supported;
1879#endif
1880}
1881
1882#endif /* PNG_READ_INTERLACING_SUPPORTED */
1883
1884
1885// These variables are utilized in the functions below. They are declared
1886// globally here to ensure alignment on 8-byte boundaries.
1887
1888union uAll {
1889 __int64 use;
1890 double align;
1891} LBCarryMask = {0x0101010101010101},
1892 HBClearMask = {0x7f7f7f7f7f7f7f7f},
1893 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1894
1895
1896// Optimized code for PNG Average filter decoder
1897void
1898png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1899 , png_bytep prev_row)
1900{
1901 int bpp;
1902 png_uint_32 FullLength;
1903 png_uint_32 MMXLength;
1904 //png_uint_32 len;
1905 int diff;
1906
1907 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1908 FullLength = row_info->rowbytes; // # of bytes to filter
1909 _asm {
1910 // Init address pointers and offset
1911 mov edi, row // edi ==> Avg(x)
1912 xor ebx, ebx // ebx ==> x
1913 mov edx, edi
1914 mov esi, prev_row // esi ==> Prior(x)
1915 sub edx, bpp // edx ==> Raw(x-bpp)
1916
1917 xor eax, eax
1918 // Compute the Raw value for the first bpp bytes
1919 // Raw(x) = Avg(x) + (Prior(x)/2)
1920davgrlp:
1921 mov al, [esi + ebx] // Load al with Prior(x)
1922 inc ebx
1923 shr al, 1 // divide by 2
1924 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1925 cmp ebx, bpp
1926 mov [edi+ebx-1], al // Write back Raw(x);
1927 // mov does not affect flags; -1 to offset inc ebx
1928 jb davgrlp
1929 // get # of bytes to alignment
1930 mov diff, edi // take start of row
1931 add diff, ebx // add bpp
1932 add diff, 0xf // add 7 + 8 to incr past alignment boundary
1933 and diff, 0xfffffff8 // mask to alignment boundary
1934 sub diff, edi // subtract from start ==> value ebx at alignment
1935 jz davggo
1936 // fix alignment
1937 // Compute the Raw value for the bytes upto the alignment boundary
1938 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1939 xor ecx, ecx
1940davglp1:
1941 xor eax, eax
1942 mov cl, [esi + ebx] // load cl with Prior(x)
1943 mov al, [edx + ebx] // load al with Raw(x-bpp)
1944 add ax, cx
1945 inc ebx
1946 shr ax, 1 // divide by 2
1947 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1948 cmp ebx, diff // Check if at alignment boundary
1949 mov [edi+ebx-1], al // Write back Raw(x);
1950 // mov does not affect flags; -1 to offset inc ebx
1951 jb davglp1 // Repeat until at alignment boundary
1952davggo:
1953 mov eax, FullLength
1954 mov ecx, eax
1955 sub eax, ebx // subtract alignment fix
1956 and eax, 0x00000007 // calc bytes over mult of 8
1957 sub ecx, eax // drop over bytes from original length
1958 mov MMXLength, ecx
1959 } // end _asm block
1960 // Now do the math for the rest of the row
1961 switch ( bpp )
1962 {
1963 case 3:
1964 {
1965 ActiveMask.use = 0x0000000000ffffff;
1966 ShiftBpp.use = 24; // == 3 * 8
1967 ShiftRem.use = 40; // == 64 - 24
1968 _asm {
1969 // Re-init address pointers and offset
1970 movq mm7, ActiveMask
1971 mov ebx, diff // ebx ==> x = offset to alignment boundary
1972 movq mm5, LBCarryMask
1973 mov edi, row // edi ==> Avg(x)
1974 movq mm4, HBClearMask
1975 mov esi, prev_row // esi ==> Prior(x)
1976 // PRIME the pump (load the first Raw(x-bpp) data set
1977 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
1978 // (we correct position in loop below)
1979davg3lp:
1980 movq mm0, [edi + ebx] // Load mm0 with Avg(x)
1981 // Add (Prev_row/2) to Average
1982 movq mm3, mm5
1983 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
1984 movq mm1, [esi + ebx] // Load mm1 with Prior(x)
1985 movq mm6, mm7
1986 pand mm3, mm1 // get lsb for each prev_row byte
1987 psrlq mm1, 1 // divide prev_row bytes by 2
1988 pand mm1, mm4 // clear invalid bit 7 of each byte
1989 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
1990 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
1991 movq mm1, mm3 // now use mm1 for getting LBCarrys
1992 pand mm1, mm2 // get LBCarrys for each byte where both
1993 // lsb's were == 1 (Only valid for active group)
1994 psrlq mm2, 1 // divide raw bytes by 2
1995 pand mm2, mm4 // clear invalid bit 7 of each byte
1996 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
1997 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
1998 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
1999 // byte
2000 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2001 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2002 movq mm2, mm0 // mov updated Raws to mm2
2003 psllq mm2, ShiftBpp // shift data to position correctly
2004 movq mm1, mm3 // now use mm1 for getting LBCarrys
2005 pand mm1, mm2 // get LBCarrys for each byte where both
2006 // lsb's were == 1 (Only valid for active group)
2007 psrlq mm2, 1 // divide raw bytes by 2
2008 pand mm2, mm4 // clear invalid bit 7 of each byte
2009 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2010 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2011 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2012 // byte
2013
2014 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2015 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2016 // bytes
2017 movq mm2, mm0 // mov updated Raws to mm2
2018 psllq mm2, ShiftBpp // shift data to position correctly
2019 // Data only needs to be shifted once here to
2020 // get the correct x-bpp offset.
2021 movq mm1, mm3 // now use mm1 for getting LBCarrys
2022 pand mm1, mm2 // get LBCarrys for each byte where both
2023 // lsb's were == 1 (Only valid for active group)
2024 psrlq mm2, 1 // divide raw bytes by 2
2025 pand mm2, mm4 // clear invalid bit 7 of each byte
2026 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2027 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2028 add ebx, 8
2029 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2030 // byte
2031
2032 // Now ready to write back to memory
2033 movq [edi + ebx - 8], mm0
2034 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2035 cmp ebx, MMXLength
2036 movq mm2, mm0 // mov updated Raw(x) to mm2
2037 jb davg3lp
2038 } // end _asm block
2039 }
2040 break;
2041
2042 case 6:
2043 case 4:
2044 case 7:
2045 case 5:
2046 {
2047 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2048 // appropriate inactive bytes
2049 ShiftBpp.use = bpp << 3;
2050 ShiftRem.use = 64 - ShiftBpp.use;
2051 _asm {
2052 movq mm4, HBClearMask
2053 // Re-init address pointers and offset
2054 mov ebx, diff // ebx ==> x = offset to alignment boundary
2055 // Load ActiveMask and clear all bytes except for 1st active group
2056 movq mm7, ActiveMask
2057 mov edi, row // edi ==> Avg(x)
2058 psrlq mm7, ShiftRem
2059 mov esi, prev_row // esi ==> Prior(x)
2060 movq mm6, mm7
2061 movq mm5, LBCarryMask
2062 psllq mm6, ShiftBpp // Create mask for 2nd active group
2063 // PRIME the pump (load the first Raw(x-bpp) data set
2064 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2065 // (we correct position in loop below)
2066davg4lp:
2067 movq mm0, [edi + ebx]
2068 psrlq mm2, ShiftRem // shift data to position correctly
2069 movq mm1, [esi + ebx]
2070 // Add (Prev_row/2) to Average
2071 movq mm3, mm5
2072 pand mm3, mm1 // get lsb for each prev_row byte
2073 psrlq mm1, 1 // divide prev_row bytes by 2
2074 pand mm1, mm4 // clear invalid bit 7 of each byte
2075 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2076 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2077 movq mm1, mm3 // now use mm1 for getting LBCarrys
2078 pand mm1, mm2 // get LBCarrys for each byte where both
2079 // lsb's were == 1 (Only valid for active group)
2080 psrlq mm2, 1 // divide raw bytes by 2
2081 pand mm2, mm4 // clear invalid bit 7 of each byte
2082 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2083 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2084 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2085 // byte
2086 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2087 movq mm2, mm0 // mov updated Raws to mm2
2088 psllq mm2, ShiftBpp // shift data to position correctly
2089 add ebx, 8
2090 movq mm1, mm3 // now use mm1 for getting LBCarrys
2091 pand mm1, mm2 // get LBCarrys for each byte where both
2092 // lsb's were == 1 (Only valid for active group)
2093 psrlq mm2, 1 // divide raw bytes by 2
2094 pand mm2, mm4 // clear invalid bit 7 of each byte
2095 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2096 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2097 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2098 // byte
2099 cmp ebx, MMXLength
2100 // Now ready to write back to memory
2101 movq [edi + ebx - 8], mm0
2102 // Prep Raw(x-bpp) for next loop
2103 movq mm2, mm0 // mov updated Raws to mm2
2104 jb davg4lp
2105 } // end _asm block
2106 }
2107 break;
2108 case 2:
2109 {
2110 ActiveMask.use = 0x000000000000ffff;
2111 ShiftBpp.use = 16; // == 2 * 8
2112 ShiftRem.use = 48; // == 64 - 16
2113 _asm {
2114 // Load ActiveMask
2115 movq mm7, ActiveMask
2116 // Re-init address pointers and offset
2117 mov ebx, diff // ebx ==> x = offset to alignment boundary
2118 movq mm5, LBCarryMask
2119 mov edi, row // edi ==> Avg(x)
2120 movq mm4, HBClearMask
2121 mov esi, prev_row // esi ==> Prior(x)
2122 // PRIME the pump (load the first Raw(x-bpp) data set)
2123 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2124 // (we correct position in loop below)
2125davg2lp:
2126 movq mm0, [edi + ebx]
2127 psrlq mm2, ShiftRem // shift data to position correctly
2128 movq mm1, [esi + ebx]
2129 // Add (Prev_row/2) to Average
2130 movq mm3, mm5
2131 pand mm3, mm1 // get lsb for each prev_row byte
2132 psrlq mm1, 1 // divide prev_row bytes by 2
2133 pand mm1, mm4 // clear invalid bit 7 of each byte
2134 movq mm6, mm7
2135 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2136 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2137 movq mm1, mm3 // now use mm1 for getting LBCarrys
2138 pand mm1, mm2 // get LBCarrys for each byte where both
2139 // lsb's were == 1 (Only valid for active group)
2140 psrlq mm2, 1 // divide raw bytes by 2
2141 pand mm2, mm4 // clear invalid bit 7 of each byte
2142 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2143 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2144 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2145 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2146 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2147 movq mm2, mm0 // mov updated Raws to mm2
2148 psllq mm2, ShiftBpp // shift data to position correctly
2149 movq mm1, mm3 // now use mm1 for getting LBCarrys
2150 pand mm1, mm2 // get LBCarrys for each byte where both
2151 // lsb's were == 1 (Only valid for active group)
2152 psrlq mm2, 1 // divide raw bytes by 2
2153 pand mm2, mm4 // clear invalid bit 7 of each byte
2154 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2155 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2156 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2157
2158 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2159 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2160 movq mm2, mm0 // mov updated Raws to mm2
2161 psllq mm2, ShiftBpp // shift data to position correctly
2162 // Data only needs to be shifted once here to
2163 // get the correct x-bpp offset.
2164 movq mm1, mm3 // now use mm1 for getting LBCarrys
2165 pand mm1, mm2 // get LBCarrys for each byte where both
2166 // lsb's were == 1 (Only valid for active group)
2167 psrlq mm2, 1 // divide raw bytes by 2
2168 pand mm2, mm4 // clear invalid bit 7 of each byte
2169 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2170 pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
2171 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2172
2173 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2174 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2175 movq mm2, mm0 // mov updated Raws to mm2
2176 psllq mm2, ShiftBpp // shift data to position correctly
2177 // Data only needs to be shifted once here to
2178 // get the correct x-bpp offset.
2179 add ebx, 8
2180 movq mm1, mm3 // now use mm1 for getting LBCarrys
2181 pand mm1, mm2 // get LBCarrys for each byte where both
2182 // lsb's were == 1 (Only valid for active group)
2183 psrlq mm2, 1 // divide raw bytes by 2
2184 pand mm2, mm4 // clear invalid bit 7 of each byte
2185 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2186 pand mm2, mm6 // Leave only Active Group 4 bytes to add to Avg
2187 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2188
2189 cmp ebx, MMXLength
2190 // Now ready to write back to memory
2191 movq [edi + ebx - 8], mm0
2192 // Prep Raw(x-bpp) for next loop
2193 movq mm2, mm0 // mov updated Raws to mm2
2194 jb davg2lp
2195 } // end _asm block
2196 }
2197 break;
2198
2199 case 1: // bpp == 1
2200 {
2201 _asm {
2202 // Re-init address pointers and offset
2203 mov ebx, diff // ebx ==> x = offset to alignment boundary
2204 mov edi, row // edi ==> Avg(x)
2205 cmp ebx, FullLength // Test if offset at end of array
2206 jnb davg1end
2207 // Do Avg decode for remaining bytes
2208 mov esi, prev_row // esi ==> Prior(x)
2209 mov edx, edi
2210 xor ecx, ecx // zero ecx before using cl & cx in loop below
2211 sub edx, bpp // edx ==> Raw(x-bpp)
2212davg1lp:
2213 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2214 xor eax, eax
2215 mov cl, [esi + ebx] // load cl with Prior(x)
2216 mov al, [edx + ebx] // load al with Raw(x-bpp)
2217 add ax, cx
2218 inc ebx
2219 shr ax, 1 // divide by 2
2220 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2221 cmp ebx, FullLength // Check if at end of array
2222 mov [edi+ebx-1], al // Write back Raw(x);
2223 // mov does not affect flags; -1 to offset inc ebx
2224 jb davg1lp
2225davg1end:
2226 } // end _asm block
2227 }
2228 return;
2229
2230 case 8: // bpp == 8
2231 {
2232 _asm {
2233 // Re-init address pointers and offset
2234 mov ebx, diff // ebx ==> x = offset to alignment boundary
2235 movq mm5, LBCarryMask
2236 mov edi, row // edi ==> Avg(x)
2237 movq mm4, HBClearMask
2238 mov esi, prev_row // esi ==> Prior(x)
2239 // PRIME the pump (load the first Raw(x-bpp) data set)
2240 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2241 // (NO NEED to correct position in loop below)
2242davg8lp:
2243 movq mm0, [edi + ebx]
2244 movq mm3, mm5
2245 movq mm1, [esi + ebx]
2246 add ebx, 8
2247 pand mm3, mm1 // get lsb for each prev_row byte
2248 psrlq mm1, 1 // divide prev_row bytes by 2
2249 pand mm3, mm2 // get LBCarrys for each byte where both
2250 // lsb's were == 1
2251 psrlq mm2, 1 // divide raw bytes by 2
2252 pand mm1, mm4 // clear invalid bit 7 of each byte
2253 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2254 pand mm2, mm4 // clear invalid bit 7 of each byte
2255 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2256 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2257 cmp ebx, MMXLength
2258 movq [edi + ebx - 8], mm0
2259 movq mm2, mm0 // reuse as Raw(x-bpp)
2260 jb davg8lp
2261 } // end _asm block
2262 }
2263 break;
2264 default: // bpp greater than 8
2265 {
2266 _asm {
2267 movq mm5, LBCarryMask
2268 // Re-init address pointers and offset
2269 mov ebx, diff // ebx ==> x = offset to alignment boundary
2270 mov edi, row // edi ==> Avg(x)
2271 movq mm4, HBClearMask
2272 mov edx, edi
2273 mov esi, prev_row // esi ==> Prior(x)
2274 sub edx, bpp // edx ==> Raw(x-bpp)
2275davgAlp:
2276 movq mm0, [edi + ebx]
2277 movq mm3, mm5
2278 movq mm1, [esi + ebx]
2279 pand mm3, mm1 // get lsb for each prev_row byte
2280 movq mm2, [edx + ebx]
2281 psrlq mm1, 1 // divide prev_row bytes by 2
2282 pand mm3, mm2 // get LBCarrys for each byte where both
2283 // lsb's were == 1
2284 psrlq mm2, 1 // divide raw bytes by 2
2285 pand mm1, mm4 // clear invalid bit 7 of each byte
2286 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2287 pand mm2, mm4 // clear invalid bit 7 of each byte
2288 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2289 add ebx, 8
2290 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2291 cmp ebx, MMXLength
2292 movq [edi + ebx - 8], mm0
2293 jb davgAlp
2294 } // end _asm block
2295 }
2296 break;
2297 } // end switch ( bpp )
2298
2299 _asm {
2300 // MMX acceleration complete now do clean-up
2301 // Check if any remaining bytes left to decode
2302 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2303 mov edi, row // edi ==> Avg(x)
2304 cmp ebx, FullLength // Test if offset at end of array
2305 jnb davgend
2306 // Do Avg decode for remaining bytes
2307 mov esi, prev_row // esi ==> Prior(x)
2308 mov edx, edi
2309 xor ecx, ecx // zero ecx before using cl & cx in loop below
2310 sub edx, bpp // edx ==> Raw(x-bpp)
2311davglp2:
2312 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2313 xor eax, eax
2314 mov cl, [esi + ebx] // load cl with Prior(x)
2315 mov al, [edx + ebx] // load al with Raw(x-bpp)
2316 add ax, cx
2317 inc ebx
2318 shr ax, 1 // divide by 2
2319 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2320 cmp ebx, FullLength // Check if at end of array
2321 mov [edi+ebx-1], al // Write back Raw(x);
2322 // mov does not affect flags; -1 to offset inc ebx
2323 jb davglp2
2324davgend:
2325 emms // End MMX instructions; prep for possible FP instrs.
2326 } // end _asm block
2327}
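
// Editor's note: for reference, the routine above computes, per byte,
//    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x)) / 2)
// with Raw(x-bpp) taken as 0 for the first bpp bytes.  A minimal C
// sketch of the same math (the name is illustrative, not libpng API;
// the MMX routine above is what actually runs):
static void
png_avg_decode_sketch(png_bytep row, png_bytep prev_row,
   png_uint_32 rowbytes, int bpp)
{
   png_uint_32 x;
   for (x = 0; x < rowbytes; x++)
   {
      int left = (x < (png_uint_32)bpp) ? 0 : row[x - bpp]; // Raw(x-bpp)
      row[x] = (png_byte)((row[x] + ((left + prev_row[x]) >> 1)) & 0xff);
   }
}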
2328
2329// Optimized code for PNG Paeth filter decoder
2330void
2331png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2332 png_bytep prev_row)
2333{
2334 png_uint_32 FullLength;
2335 png_uint_32 MMXLength;
2336 //png_uint_32 len;
2337 int bpp;
2338 int diff;
2339 //int ptemp;
2340 int patemp, pbtemp, pctemp;
2341
2342 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2343 FullLength = row_info->rowbytes; // # of bytes to filter
2344 _asm
2345 {
2346 xor ebx, ebx // ebx ==> x offset
2347 mov edi, row
2348 xor edx, edx // edx ==> x-bpp offset
2349 mov esi, prev_row
2350 xor eax, eax
2351
2352 // Compute the Raw value for the first bpp bytes
2353 // Note: the formula works out to be always
2354 // Raw(x) = Paeth(x) + Prior(x) where x < bpp
2355dpthrlp:
2356 mov al, [edi + ebx]
2357 add al, [esi + ebx]
2358 inc ebx
2359 cmp ebx, bpp
2360 mov [edi + ebx - 1], al
2361 jb dpthrlp
2362 // get # of bytes to alignment
2363 mov diff, edi // take start of row
2364 add diff, ebx // add bpp
2365 xor ecx, ecx
2366 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2367 and diff, 0xfffffff8 // mask to alignment boundary
2368 sub diff, edi // subtract from start ==> value ebx at alignment
2369 jz dpthgo
2370 // fix alignment
2371dpthlp1:
2372 xor eax, eax
2373 // pav = p - a = (a + b - c) - a = b - c
2374 mov al, [esi + ebx] // load Prior(x) into al
2375 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2376 sub eax, ecx // subtract Prior(x-bpp)
2377 mov patemp, eax // Save pav for later use
2378 xor eax, eax
2379 // pbv = p - b = (a + b - c) - b = a - c
2380 mov al, [edi + edx] // load Raw(x-bpp) into al
2381 sub eax, ecx // subtract Prior(x-bpp)
2382 mov ecx, eax
2383 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2384 add eax, patemp // pcv = pav + pbv
2385 // pc = abs(pcv)
2386 test eax, 0x80000000
2387 jz dpthpca
2388 neg eax // reverse sign of neg values
2389dpthpca:
2390 mov pctemp, eax // save pc for later use
2391 // pb = abs(pbv)
2392 test ecx, 0x80000000
2393 jz dpthpba
2394 neg ecx // reverse sign of neg values
2395dpthpba:
2396 mov pbtemp, ecx // save pb for later use
2397 // pa = abs(pav)
2398 mov eax, patemp
2399 test eax, 0x80000000
2400 jz dpthpaa
2401 neg eax // reverse sign of neg values
2402dpthpaa:
2403 mov patemp, eax // save pa for later use
2404 // test if pa <= pb
2405 cmp eax, ecx
2406 jna dpthabb
2407 // pa > pb; now test if pb <= pc
2408 cmp ecx, pctemp
2409 jna dpthbbc
2410 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2411 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2412 jmp dpthpaeth
2413dpthbbc:
2414 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2415 mov cl, [esi + ebx] // load Prior(x) into cl
2416 jmp dpthpaeth
2417dpthabb:
2418 // pa <= pb; now test if pa <= pc
2419 cmp eax, pctemp
2420 jna dpthabc
2421 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2422 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2423 jmp dpthpaeth
2424dpthabc:
2425 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2426 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2427dpthpaeth:
2428 inc ebx
2429 inc edx
2430 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2431 add [edi + ebx - 1], cl
2432 cmp ebx, diff
2433 jb dpthlp1
2434dpthgo:
2435 mov ecx, FullLength
2436 mov eax, ecx
2437 sub eax, ebx // subtract alignment fix
2438 and eax, 0x00000007 // calc bytes over mult of 8
2439 sub ecx, eax // drop over bytes from original length
2440 mov MMXLength, ecx
2441 } // end _asm block
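   // Editor's note: in C terms the alignment block above computes
   //    diff = ((row + bpp + 15) & ~7) - row;           // pointer math
   //    MMXLength = FullLength - ((FullLength - diff) & 0x7);
   // so the MMX loops start at the first 8-byte boundary past row + bpp
   // and stop after a whole number of 8-byte groups; the 0-7 leftover
   // bytes fall through to the scalar clean-up loop at the end of the
   // function.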
2442 // Now do the math for the rest of the row
2443 switch ( bpp )
2444 {
2445 case 3:
2446 {
2447 ActiveMask.use = 0x0000000000ffffff;
2448 ActiveMaskEnd.use = 0xffff000000000000;
2449 ShiftBpp.use = 24; // == bpp(3) * 8
2450 ShiftRem.use = 40; // == 64 - 24
2451 _asm
2452 {
2453 mov ebx, diff
2454 mov edi, row
2455 mov esi, prev_row
2456 pxor mm0, mm0
2457 // PRIME the pump (load the first Raw(x-bpp) data set)
2458 movq mm1, [edi+ebx-8]
2459dpth3lp:
2460 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2461 movq mm2, [esi + ebx] // load b=Prior(x)
2462 punpcklbw mm1, mm0 // Unpack Low bytes of a
2463 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2464 punpcklbw mm2, mm0 // Unpack Low bytes of b
2465 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2466 // pav = p - a = (a + b - c) - a = b - c
2467 movq mm4, mm2
2468 punpcklbw mm3, mm0 // Unpack Low bytes of c
2469 // pbv = p - b = (a + b - c) - b = a - c
2470 movq mm5, mm1
2471 psubw mm4, mm3
2472 pxor mm7, mm7
2473 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2474 movq mm6, mm4
2475 psubw mm5, mm3
2476
2477 // pa = abs(p-a) = abs(pav)
2478 // pb = abs(p-b) = abs(pbv)
2479 // pc = abs(p-c) = abs(pcv)
2480 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2481 paddw mm6, mm5
2482 pand mm0, mm4 // Only pav bytes < 0 in mm0
2483 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2484 psubw mm4, mm0
2485 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2486 psubw mm4, mm0
2487 psubw mm5, mm7
2488 pxor mm0, mm0
2489 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2490 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2491 psubw mm5, mm7
2492 psubw mm6, mm0
2493 // test pa <= pb
2494 movq mm7, mm4
2495 psubw mm6, mm0
2496 pcmpgtw mm7, mm5 // pa > pb?
2497 movq mm0, mm7
2498 // use mm7 mask to merge pa & pb
2499 pand mm5, mm7
2500 // use mm0 mask copy to merge a & b
2501 pand mm2, mm0
2502 pandn mm7, mm4
2503 pandn mm0, mm1
2504 paddw mm7, mm5
2505 paddw mm0, mm2
2506 // test ((pa <= pb)? pa:pb) <= pc
2507 pcmpgtw mm7, mm6 // pab > pc?
2508 pxor mm1, mm1
2509 pand mm3, mm7
2510 pandn mm7, mm0
2511 paddw mm7, mm3
2512 pxor mm0, mm0
2513 packuswb mm7, mm1
2514 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2515 pand mm7, ActiveMask
2516 movq mm2, mm3 // load b=Prior(x) step 1
2517 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2518 punpcklbw mm3, mm0 // Unpack Low bytes of c
2519 movq [edi + ebx], mm7 // write back updated value
2520 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2521 // Now do Paeth for 2nd set of bytes (3-5)
2522 psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2523 punpcklbw mm1, mm0 // Unpack Low bytes of a
2524 pxor mm7, mm7
2525 punpcklbw mm2, mm0 // Unpack Low bytes of b
2526 // pbv = p - b = (a + b - c) - b = a - c
2527 movq mm5, mm1
2528 // pav = p - a = (a + b - c) - a = b - c
2529 movq mm4, mm2
2530 psubw mm5, mm3
2531 psubw mm4, mm3
2532 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2533 // pav + pbv = pbv + pav
2534 movq mm6, mm5
2535 paddw mm6, mm4
2536
2537 // pa = abs(p-a) = abs(pav)
2538 // pb = abs(p-b) = abs(pbv)
2539 // pc = abs(p-c) = abs(pcv)
2540 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2541 pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2542 pand mm0, mm5 // Only pbv bytes < 0 in mm0
2543 pand mm7, mm4 // Only pav bytes < 0 in mm7
2544 psubw mm5, mm0
2545 psubw mm4, mm7
2546 psubw mm5, mm0
2547 psubw mm4, mm7
2548 pxor mm0, mm0
2549 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2550 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2551 psubw mm6, mm0
2552 // test pa <= pb
2553 movq mm7, mm4
2554 psubw mm6, mm0
2555 pcmpgtw mm7, mm5 // pa > pb?
2556 movq mm0, mm7
2557 // use mm7 mask to merge pa & pb
2558 pand mm5, mm7
2559 // use mm0 mask copy to merge a & b
2560 pand mm2, mm0
2561 pandn mm7, mm4
2562 pandn mm0, mm1
2563 paddw mm7, mm5
2564 paddw mm0, mm2
2565 // test ((pa <= pb)? pa:pb) <= pc
2566 pcmpgtw mm7, mm6 // pab > pc?
2567 movq mm2, [esi + ebx] // load b=Prior(x)
2568 pand mm3, mm7
2569 pandn mm7, mm0
2570 pxor mm1, mm1
2571 paddw mm7, mm3
2572 pxor mm0, mm0
2573 packuswb mm7, mm1
2574 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2575 pand mm7, ActiveMask
2576 punpckhbw mm2, mm0 // Unpack High bytes of b
2577 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2578 // pav = p - a = (a + b - c) - a = b - c
2579 movq mm4, mm2
2580 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2581 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2582 movq [edi + ebx], mm7 // write back updated value
2583 movq mm1, mm7
2584 punpckhbw mm3, mm0 // Unpack High bytes of c
2585 psllq mm1, ShiftBpp // Shift bytes
2586 // Now mm1 will be used as Raw(x-bpp)
2587 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2588 pxor mm7, mm7
2589 punpckhbw mm1, mm0 // Unpack High bytes of a
2590 psubw mm4, mm3
2591 // pbv = p - b = (a + b - c) - b = a - c
2592 movq mm5, mm1
2593 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2594 movq mm6, mm4
2595 psubw mm5, mm3
2596 pxor mm0, mm0
2597 paddw mm6, mm5
2598
2599 // pa = abs(p-a) = abs(pav)
2600 // pb = abs(p-b) = abs(pbv)
2601 // pc = abs(p-c) = abs(pcv)
2602 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2603 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2604 pand mm0, mm4 // Only pav bytes < 0 in mm0
2605 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2606 psubw mm4, mm0
2607 psubw mm5, mm7
2608 psubw mm4, mm0
2609 psubw mm5, mm7
2610 pxor mm0, mm0
2611 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2612 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2613 psubw mm6, mm0
2614 // test pa <= pb
2615 movq mm7, mm4
2616 psubw mm6, mm0
2617 pcmpgtw mm7, mm5 // pa > pb?
2618 movq mm0, mm7
2619 // use mm0 mask copy to merge a & b
2620 pand mm2, mm0
2621 // use mm7 mask to merge pa & pb
2622 pand mm5, mm7
2623 pandn mm0, mm1
2624 pandn mm7, mm4
2625 paddw mm0, mm2
2626 paddw mm7, mm5
2627 // test ((pa <= pb)? pa:pb) <= pc
2628 pcmpgtw mm7, mm6 // pab > pc?
2629 pand mm3, mm7
2630 pandn mm7, mm0
2631 paddw mm7, mm3
2632 pxor mm1, mm1
2633 packuswb mm1, mm7
2634 // Step ebx to next set of 8 bytes and repeat loop til done
2635 add ebx, 8
2636 pand mm1, ActiveMaskEnd
2637 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2638
2639 cmp ebx, MMXLength
2640 pxor mm0, mm0 // pxor does not affect flags
2641 movq [edi + ebx - 8], mm1 // write back updated value
2642 // mm1 will be used as Raw(x-bpp) next loop
2643 // mm3 ready to be used as Prior(x-bpp) next loop
2644 jb dpth3lp
2645 } // end _asm block
2646 }
2647 break;
2648
2649 case 6:
2650 case 7:
2651 case 5:
2652 {
2653 ActiveMask.use = 0x00000000ffffffff;
2654 ActiveMask2.use = 0xffffffff00000000;
2655 ShiftBpp.use = bpp << 3; // == bpp * 8
2656 ShiftRem.use = 64 - ShiftBpp.use;
2657 _asm
2658 {
2659 mov ebx, diff
2660 mov edi, row
2661 mov esi, prev_row
2662 // PRIME the pump (load the first Raw(x-bpp) data set)
2663 movq mm1, [edi+ebx-8]
2664 pxor mm0, mm0
2665dpth6lp:
2666 // Must shift to position Raw(x-bpp) data
2667 psrlq mm1, ShiftRem
2668 // Do first set of 4 bytes
2669 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2670 punpcklbw mm1, mm0 // Unpack Low bytes of a
2671 movq mm2, [esi + ebx] // load b=Prior(x)
2672 punpcklbw mm2, mm0 // Unpack Low bytes of b
2673 // Must shift to position Prior(x-bpp) data
2674 psrlq mm3, ShiftRem
2675 // pav = p - a = (a + b - c) - a = b - c
2676 movq mm4, mm2
2677 punpcklbw mm3, mm0 // Unpack Low bytes of c
2678 // pbv = p - b = (a + b - c) - b = a - c
2679 movq mm5, mm1
2680 psubw mm4, mm3
2681 pxor mm7, mm7
2682 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2683 movq mm6, mm4
2684 psubw mm5, mm3
2685 // pa = abs(p-a) = abs(pav)
2686 // pb = abs(p-b) = abs(pbv)
2687 // pc = abs(p-c) = abs(pcv)
2688 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2689 paddw mm6, mm5
2690 pand mm0, mm4 // Only pav bytes < 0 in mm0
2691 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2692 psubw mm4, mm0
2693 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2694 psubw mm4, mm0
2695 psubw mm5, mm7
2696 pxor mm0, mm0
2697 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2698 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2699 psubw mm5, mm7
2700 psubw mm6, mm0
2701 // test pa <= pb
2702 movq mm7, mm4
2703 psubw mm6, mm0
2704 pcmpgtw mm7, mm5 // pa > pb?
2705 movq mm0, mm7
2706 // use mm7 mask to merge pa & pb
2707 pand mm5, mm7
2708 // use mm0 mask copy to merge a & b
2709 pand mm2, mm0
2710 pandn mm7, mm4
2711 pandn mm0, mm1
2712 paddw mm7, mm5
2713 paddw mm0, mm2
2714 // test ((pa <= pb)? pa:pb) <= pc
2715 pcmpgtw mm7, mm6 // pab > pc?
2716 pxor mm1, mm1
2717 pand mm3, mm7
2718 pandn mm7, mm0
2719 paddw mm7, mm3
2720 pxor mm0, mm0
2721 packuswb mm7, mm1
2722 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2723 pand mm7, ActiveMask
2724 psrlq mm3, ShiftRem
2725 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2726 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2727 movq mm6, mm2
2728 movq [edi + ebx], mm7 // write back updated value
2729 movq mm1, [edi+ebx-8]
2730 psllq mm6, ShiftBpp
2731 movq mm5, mm7
2732 psrlq mm1, ShiftRem
2733 por mm3, mm6
2734 psllq mm5, ShiftBpp
2735 punpckhbw mm3, mm0 // Unpack High bytes of c
2736 por mm1, mm5
2737 // Do second set of 4 bytes
2738 punpckhbw mm2, mm0 // Unpack High bytes of b
2739 punpckhbw mm1, mm0 // Unpack High bytes of a
2740 // pav = p - a = (a + b - c) - a = b - c
2741 movq mm4, mm2
2742 // pbv = p - b = (a + b - c) - b = a - c
2743 movq mm5, mm1
2744 psubw mm4, mm3
2745 pxor mm7, mm7
2746 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2747 movq mm6, mm4
2748 psubw mm5, mm3
2749 // pa = abs(p-a) = abs(pav)
2750 // pb = abs(p-b) = abs(pbv)
2751 // pc = abs(p-c) = abs(pcv)
2752 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2753 paddw mm6, mm5
2754 pand mm0, mm4 // Only pav bytes < 0 in mm0
2755 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2756 psubw mm4, mm0
2757 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2758 psubw mm4, mm0
2759 psubw mm5, mm7
2760 pxor mm0, mm0
2761 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2762 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2763 psubw mm5, mm7
2764 psubw mm6, mm0
2765 // test pa <= pb
2766 movq mm7, mm4
2767 psubw mm6, mm0
2768 pcmpgtw mm7, mm5 // pa > pb?
2769 movq mm0, mm7
2770 // use mm7 mask to merge pa & pb
2771 pand mm5, mm7
2772 // use mm0 mask copy to merge a & b
2773 pand mm2, mm0
2774 pandn mm7, mm4
2775 pandn mm0, mm1
2776 paddw mm7, mm5
2777 paddw mm0, mm2
2778 // test ((pa <= pb)? pa:pb) <= pc
2779 pcmpgtw mm7, mm6 // pab > pc?
2780 pxor mm1, mm1
2781 pand mm3, mm7
2782 pandn mm7, mm0
2783 pxor mm1, mm1
2784 paddw mm7, mm3
2785 pxor mm0, mm0
2786 // Step ebx to next set of 8 bytes and repeat loop til done
2787 add ebx, 8
2788 packuswb mm1, mm7
2789 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2790 cmp ebx, MMXLength
2791 movq [edi + ebx - 8], mm1 // write back updated value
2792 // mm1 will be used as Raw(x-bpp) next loop
2793 jb dpth6lp
2794 } // end _asm block
2795 }
2796 break;
2797
2798 case 4:
2799 {
2800 ActiveMask.use = 0x00000000ffffffff;
2801 _asm {
2802 mov ebx, diff
2803 mov edi, row
2804 mov esi, prev_row
2805 pxor mm0, mm0
2806 // PRIME the pump (load the first Raw(x-bpp) data set)
2807 movq mm1, [edi+ebx-8] // the only time we should need to read
2808 // a=Raw(x-bpp) bytes
2809dpth4lp:
2810 // Do first set of 4 bytes
2811 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2812 punpckhbw mm1, mm0 // Unpack High bytes of a
2813 movq mm2, [esi + ebx] // load b=Prior(x)
2814 punpcklbw mm2, mm0 // Unpack Low bytes of b
2815 // pav = p - a = (a + b - c) - a = b - c
2816 movq mm4, mm2
2817 punpckhbw mm3, mm0 // Unpack High bytes of c
2818 // pbv = p - b = (a + b - c) - b = a - c
2819 movq mm5, mm1
2820 psubw mm4, mm3
2821 pxor mm7, mm7
2822 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2823 movq mm6, mm4
2824 psubw mm5, mm3
2825 // pa = abs(p-a) = abs(pav)
2826 // pb = abs(p-b) = abs(pbv)
2827 // pc = abs(p-c) = abs(pcv)
2828 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2829 paddw mm6, mm5
2830 pand mm0, mm4 // Only pav bytes < 0 in mm0
2831 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2832 psubw mm4, mm0
2833 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2834 psubw mm4, mm0
2835 psubw mm5, mm7
2836 pxor mm0, mm0
2837 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2838 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2839 psubw mm5, mm7
2840 psubw mm6, mm0
2841 // test pa <= pb
2842 movq mm7, mm4
2843 psubw mm6, mm0
2844 pcmpgtw mm7, mm5 // pa > pb?
2845 movq mm0, mm7
2846 // use mm7 mask to merge pa & pb
2847 pand mm5, mm7
2848 // use mm0 mask copy to merge a & b
2849 pand mm2, mm0
2850 pandn mm7, mm4
2851 pandn mm0, mm1
2852 paddw mm7, mm5
2853 paddw mm0, mm2
2854 // test ((pa <= pb)? pa:pb) <= pc
2855 pcmpgtw mm7, mm6 // pab > pc?
2856 pxor mm1, mm1
2857 pand mm3, mm7
2858 pandn mm7, mm0
2859 paddw mm7, mm3
2860 pxor mm0, mm0
2861 packuswb mm7, mm1
2862 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2863 pand mm7, ActiveMask
2864 movq mm2, mm3 // load b=Prior(x) step 1
2865 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2866 punpcklbw mm3, mm0 // Unpack Low bytes of c
2867 movq [edi + ebx], mm7 // write back updated value
2868 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2869 // Do second set of 4 bytes
2870 punpckhbw mm2, mm0 // Unpack High bytes of b
2871 punpcklbw mm1, mm0 // Unpack Low bytes of a
2872 // pav = p - a = (a + b - c) - a = b - c
2873 movq mm4, mm2
2874 // pbv = p - b = (a + b - c) - b = a - c
2875 movq mm5, mm1
2876 psubw mm4, mm3
2877 pxor mm7, mm7
2878 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2879 movq mm6, mm4
2880 psubw mm5, mm3
2881 // pa = abs(p-a) = abs(pav)
2882 // pb = abs(p-b) = abs(pbv)
2883 // pc = abs(p-c) = abs(pcv)
2884 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2885 paddw mm6, mm5
2886 pand mm0, mm4 // Only pav bytes < 0 in mm0
2887 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2888 psubw mm4, mm0
2889 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2890 psubw mm4, mm0
2891 psubw mm5, mm7
2892 pxor mm0, mm0
2893 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2894 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2895 psubw mm5, mm7
2896 psubw mm6, mm0
2897 // test pa <= pb
2898 movq mm7, mm4
2899 psubw mm6, mm0
2900 pcmpgtw mm7, mm5 // pa > pb?
2901 movq mm0, mm7
2902 // use mm7 mask to merge pa & pb
2903 pand mm5, mm7
2904 // use mm0 mask copy to merge a & b
2905 pand mm2, mm0
2906 pandn mm7, mm4
2907 pandn mm0, mm1
2908 paddw mm7, mm5
2909 paddw mm0, mm2
2910 // test ((pa <= pb)? pa:pb) <= pc
2911 pcmpgtw mm7, mm6 // pab > pc?
2912 pxor mm1, mm1
2913 pand mm3, mm7
2914 pandn mm7, mm0
2915 pxor mm1, mm1
2916 paddw mm7, mm3
2917 pxor mm0, mm0
2918 // Step ebx to next set of 8 bytes and repeat loop til done
2919 add ebx, 8
2920 packuswb mm1, mm7
2921 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2922 cmp ebx, MMXLength
2923 movq [edi + ebx - 8], mm1 // write back updated value
2924 // mm1 will be used as Raw(x-bpp) next loop
2925 jb dpth4lp
2926 } // end _asm block
2927 }
2928 break;
2929 case 8: // bpp == 8
2930 {
2931 ActiveMask.use = 0x00000000ffffffff;
2932 _asm {
2933 mov ebx, diff
2934 mov edi, row
2935 mov esi, prev_row
2936 pxor mm0, mm0
2937 // PRIME the pump (load the first Raw(x-bpp) data set)
2938 movq mm1, [edi+ebx-8] // the only time we should need to read
2939 // a=Raw(x-bpp) bytes
2940dpth8lp:
2941 // Do first set of 4 bytes
2942 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2943 punpcklbw mm1, mm0 // Unpack Low bytes of a
2944 movq mm2, [esi + ebx] // load b=Prior(x)
2945 punpcklbw mm2, mm0 // Unpack Low bytes of b
2946 // pav = p - a = (a + b - c) - a = b - c
2947 movq mm4, mm2
2948 punpcklbw mm3, mm0 // Unpack Low bytes of c
2949 // pbv = p - b = (a + b - c) - b = a - c
2950 movq mm5, mm1
2951 psubw mm4, mm3
2952 pxor mm7, mm7
2953 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2954 movq mm6, mm4
2955 psubw mm5, mm3
2956 // pa = abs(p-a) = abs(pav)
2957 // pb = abs(p-b) = abs(pbv)
2958 // pc = abs(p-c) = abs(pcv)
2959 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2960 paddw mm6, mm5
2961 pand mm0, mm4 // Only pav bytes < 0 in mm0
2962 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2963 psubw mm4, mm0
2964 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2965 psubw mm4, mm0
2966 psubw mm5, mm7
2967 pxor mm0, mm0
2968 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2969 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2970 psubw mm5, mm7
2971 psubw mm6, mm0
2972 // test pa <= pb
2973 movq mm7, mm4
2974 psubw mm6, mm0
2975 pcmpgtw mm7, mm5 // pa > pb?
2976 movq mm0, mm7
2977 // use mm7 mask to merge pa & pb
2978 pand mm5, mm7
2979 // use mm0 mask copy to merge a & b
2980 pand mm2, mm0
2981 pandn mm7, mm4
2982 pandn mm0, mm1
2983 paddw mm7, mm5
2984 paddw mm0, mm2
2985 // test ((pa <= pb)? pa:pb) <= pc
2986 pcmpgtw mm7, mm6 // pab > pc?
2987 pxor mm1, mm1
2988 pand mm3, mm7
2989 pandn mm7, mm0
2990 paddw mm7, mm3
2991 pxor mm0, mm0
2992 packuswb mm7, mm1
2993 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2994 pand mm7, ActiveMask
2995 movq mm2, [esi + ebx] // load b=Prior(x)
2996 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2997 punpckhbw mm3, mm0 // Unpack High bytes of c
2998 movq [edi + ebx], mm7 // write back updated value
2999 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
3000
3001 // Do second set of 4 bytes
3002 punpckhbw mm2, mm0 // Unpack High bytes of b
3003 punpckhbw mm1, mm0 // Unpack High bytes of a
3004 // pav = p - a = (a + b - c) - a = b - c
3005 movq mm4, mm2
3006 // pbv = p - b = (a + b - c) - b = a - c
3007 movq mm5, mm1
3008 psubw mm4, mm3
3009 pxor mm7, mm7
3010 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3011 movq mm6, mm4
3012 psubw mm5, mm3
3013 // pa = abs(p-a) = abs(pav)
3014 // pb = abs(p-b) = abs(pbv)
3015 // pc = abs(p-c) = abs(pcv)
3016 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3017 paddw mm6, mm5
3018 pand mm0, mm4 // Only pav bytes < 0 in mm0
3019 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3020 psubw mm4, mm0
3021 pand mm7, mm5 // Only pbv bytes < 0 in mm7
3022 psubw mm4, mm0
3023 psubw mm5, mm7
3024 pxor mm0, mm0
3025 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3026 pand mm0, mm6 // Only pcv bytes < 0 in mm0
3027 psubw mm5, mm7
3028 psubw mm6, mm0
3029 // test pa <= pb
3030 movq mm7, mm4
3031 psubw mm6, mm0
3032 pcmpgtw mm7, mm5 // pa > pb?
3033 movq mm0, mm7
3034 // use mm7 mask to merge pa & pb
3035 pand mm5, mm7
3036 // use mm0 mask copy to merge a & b
3037 pand mm2, mm0
3038 pandn mm7, mm4
3039 pandn mm0, mm1
3040 paddw mm7, mm5
3041 paddw mm0, mm2
3042 // test ((pa <= pb)? pa:pb) <= pc
3043 pcmpgtw mm7, mm6 // pab > pc?
3044 pxor mm1, mm1
3045 pand mm3, mm7
3046 pandn mm7, mm0
3047 pxor mm1, mm1
3048 paddw mm7, mm3
3049 pxor mm0, mm0
3050 // Step ebx to next set of 8 bytes and repeat loop til done
3051 add ebx, 8
3052 packuswb mm1, mm7
3053 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3054 cmp ebx, MMXLength
3055 movq [edi + ebx - 8], mm1 // write back updated value
3056 // mm1 will be used as Raw(x-bpp) next loop
3057 jb dpth8lp
3058 } // end _asm block
3059 }
3060 break;
3061
3062 case 1: // bpp = 1
3063 case 2: // bpp = 2
3064 default: // bpp > 8
3065 {
3066 _asm {
3067 mov ebx, diff
3068 cmp ebx, FullLength
3069 jnb dpthdend
3070 mov edi, row
3071 mov esi, prev_row
3072 // Do Paeth decode for remaining bytes
3073 mov edx, ebx
3074 xor ecx, ecx // zero ecx before using cl & cx in loop below
3075 sub edx, bpp // Set edx = ebx - bpp
3076dpthdlp:
3077 xor eax, eax
3078 // pav = p - a = (a + b - c) - a = b - c
3079 mov al, [esi + ebx] // load Prior(x) into al
3080 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3081 sub eax, ecx // subtract Prior(x-bpp)
3082 mov patemp, eax // Save pav for later use
3083 xor eax, eax
3084 // pbv = p - b = (a + b - c) - b = a - c
3085 mov al, [edi + edx] // load Raw(x-bpp) into al
3086 sub eax, ecx // subtract Prior(x-bpp)
3087 mov ecx, eax
3088 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3089 add eax, patemp // pcv = pav + pbv
3090 // pc = abs(pcv)
3091 test eax, 0x80000000
3092 jz dpthdpca
3093 neg eax // reverse sign of neg values
3094dpthdpca:
3095 mov pctemp, eax // save pc for later use
3096 // pb = abs(pbv)
3097 test ecx, 0x80000000
3098 jz dpthdpba
3099 neg ecx // reverse sign of neg values
3100dpthdpba:
3101 mov pbtemp, ecx // save pb for later use
3102 // pa = abs(pav)
3103 mov eax, patemp
3104 test eax, 0x80000000
3105 jz dpthdpaa
3106 neg eax // reverse sign of neg values
3107dpthdpaa:
3108 mov patemp, eax // save pa for later use
3109 // test if pa <= pb
3110 cmp eax, ecx
3111 jna dpthdabb
3112 // pa > pb; now test if pb <= pc
3113 cmp ecx, pctemp
3114 jna dpthdbbc
3115 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3116 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3117 jmp dpthdpaeth
3118dpthdbbc:
3119 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3120 mov cl, [esi + ebx] // load Prior(x) into cl
3121 jmp dpthdpaeth
3122dpthdabb:
3123 // pa <= pb; now test if pa <= pc
3124 cmp eax, pctemp
3125 jna dpthdabc
3126 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3127 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3128 jmp dpthdpaeth
3129dpthdabc:
3130 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3131 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3132dpthdpaeth:
3133 inc ebx
3134 inc edx
3135 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3136 add [edi + ebx - 1], cl
3137 cmp ebx, FullLength
3138 jb dpthdlp
3139dpthdend:
3140 } // end _asm block
3141 }
3142 return; // No need to go further with this one
3143 } // end switch ( bpp )
3144 _asm
3145 {
3146 // MMX acceleration complete now do clean-up
3147 // Check if any remaining bytes left to decode
3148 mov ebx, MMXLength
3149 cmp ebx, FullLength
3150 jnb dpthend
3151 mov edi, row
3152 mov esi, prev_row
3153 // Do Paeth decode for remaining bytes
3154 mov edx, ebx
3155 xor ecx, ecx // zero ecx before using cl & cx in loop below
3156 sub edx, bpp // Set edx = ebx - bpp
3157dpthlp2:
3158 xor eax, eax
3159 // pav = p - a = (a + b - c) - a = b - c
3160 mov al, [esi + ebx] // load Prior(x) into al
3161 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3162 sub eax, ecx // subtract Prior(x-bpp)
3163 mov patemp, eax // Save pav for later use
3164 xor eax, eax
3165 // pbv = p - b = (a + b - c) - b = a - c
3166 mov al, [edi + edx] // load Raw(x-bpp) into al
3167 sub eax, ecx // subtract Prior(x-bpp)
3168 mov ecx, eax
3169 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3170 add eax, patemp // pcv = pav + pbv
3171 // pc = abs(pcv)
3172 test eax, 0x80000000
3173 jz dpthpca2
3174 neg eax // reverse sign of neg values
3175dpthpca2:
3176 mov pctemp, eax // save pc for later use
3177 // pb = abs(pbv)
3178 test ecx, 0x80000000
3179 jz dpthpba2
3180 neg ecx // reverse sign of neg values
3181dpthpba2:
3182 mov pbtemp, ecx // save pb for later use
3183 // pa = abs(pav)
3184 mov eax, patemp
3185 test eax, 0x80000000
3186 jz dpthpaa2
3187 neg eax // reverse sign of neg values
3188dpthpaa2:
3189 mov patemp, eax // save pa for later use
3190 // test if pa <= pb
3191 cmp eax, ecx
3192 jna dpthabb2
3193 // pa > pb; now test if pb <= pc
3194 cmp ecx, pctemp
3195 jna dpthbbc2
3196 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3197 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3198 jmp dpthpaeth2
3199dpthbbc2:
3200 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3201 mov cl, [esi + ebx] // load Prior(x) into cl
3202 jmp dpthpaeth2
3203dpthabb2:
3204 // pa <= pb; now test if pa <= pc
3205 cmp eax, pctemp
3206 jna dpthabc2
3207 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3208 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3209 jmp dpthpaeth2
3210dpthabc2:
3211 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3212 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3213dpthpaeth2:
3214 inc ebx
3215 inc edx
3216 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3217 add [edi + ebx - 1], cl
3218 cmp ebx, FullLength
3219 jb dpthlp2
3220dpthend:
3221 emms // End MMX instructions; prep for possible FP instrs.
3222 } // end _asm block
3223}
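
// Editor's note: each pcmpgtw/pand/pandn/paddw cluster above is a
// branchless select (the two masked halves are disjoint, so paddw
// acts as OR).  A one-lane C sketch of the selection as the MMX code
// performs it, including the two-subtract absolute value (the name is
// illustrative, not libpng API):
static int
png_paeth_select_sketch(int a, int b, int c)
{
   int pav, pbv, pcv, pa, pb, pc, pab, ab, mask;

   pav = b - c;     // p - a
   pbv = a - c;     // p - b
   pcv = pav + pbv; // p - c

   // abs() the way the MMX code does it: mask is all ones where the
   // value is negative (pcmpgtw against zero), and
   // v - (v & mask) - (v & mask) == abs(v)
   mask = -(pav < 0); pa = pav - (pav & mask) - (pav & mask);
   mask = -(pbv < 0); pb = pbv - (pbv & mask) - (pbv & mask);
   mask = -(pcv < 0); pc = pcv - (pcv & mask) - (pcv & mask);

   mask = -(pa > pb);                 // pcmpgtw: all ones where pa > pb
   pab = (pb & mask) | (pa & ~mask);  // pand/pandn merge == min(pa, pb)
   ab  = (b & mask) | (a & ~mask);    // merge the matching predictors

   mask = -(pab > pc);                // final compare against pc
   return (c & mask) | (ab & ~mask);  // Paeth predictor for this lane
}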
3224
3225// Optimized code for PNG Sub filter decoder
3226void
3227png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3228{
3229 //int test;
3230 int bpp;
3231 png_uint_32 FullLength;
3232 png_uint_32 MMXLength;
3233 int diff;
3234
3235 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3236 FullLength = row_info->rowbytes - bpp; // # of bytes to filter
3237 _asm {
3238 mov edi, row
3239 mov esi, edi // lp = row
3240 add edi, bpp // rp = row + bpp
3241 xor eax, eax
3242 // get # of bytes to alignment
3243 mov diff, edi // take start of row
3244 add diff, 0xf // add 7 + 8 to incr past
3245 // alignment boundary
3246 xor ebx, ebx
3247 and diff, 0xfffffff8 // mask to alignment boundary
3248 sub diff, edi // subtract from start ==> value
3249 // ebx at alignment
3250 jz dsubgo
3251 // fix alignment
3252dsublp1:
3253 mov al, [esi+ebx]
3254 add [edi+ebx], al
3255 inc ebx
3256 cmp ebx, diff
3257 jb dsublp1
3258dsubgo:
3259 mov ecx, FullLength
3260 mov edx, ecx
3261 sub edx, ebx // subtract alignment fix
3262 and edx, 0x00000007 // calc bytes over mult of 8
3263 sub ecx, edx // drop over bytes from length
3264 mov MMXLength, ecx
3265 } // end _asm block
3266
3267 // Now do the math for the rest of the row
3268 switch ( bpp )
3269 {
3270 case 3:
3271 {
3272 ActiveMask.use = 0x0000ffffff000000;
3273 ShiftBpp.use = 24; // == 3 * 8
3274 ShiftRem.use = 40; // == 64 - 24
3275 _asm {
3276 mov edi, row
3277 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3278 mov esi, edi // lp = row
3279 add edi, bpp // rp = row + bpp
3280 movq mm6, mm7
3281 mov ebx, diff
3282 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3283 // byte group
3284 // PRIME the pump (load the first Raw(x-bpp) data set)
3285 movq mm1, [edi+ebx-8]
3286dsub3lp:
3287 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3288 // no need for mask; shift clears inactive bytes
3289 // Add 1st active group
3290 movq mm0, [edi+ebx]
3291 paddb mm0, mm1
3292 // Add 2nd active group
3293 movq mm1, mm0 // mov updated Raws to mm1
3294 psllq mm1, ShiftBpp // shift data to position correctly
3295 pand mm1, mm7 // mask to use only 2nd active group
3296 paddb mm0, mm1
3297 // Add 3rd active group
3298 movq mm1, mm0 // mov updated Raws to mm1
3299 psllq mm1, ShiftBpp // shift data to position correctly
3300 pand mm1, mm6 // mask to use only 3rd active group
3301 add ebx, 8
3302 paddb mm0, mm1
3303 cmp ebx, MMXLength
3304 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3305 // Prep for doing 1st add at top of loop
3306 movq mm1, mm0
3307 jb dsub3lp
3308 } // end _asm block
3309 }
3310 break;
3311
3312 case 1:
3313 {
3314 // Kept for reference: this duplicates the non-MMX code for the
3315 // SUB filter in png_read_filter_row below
3316 //
3317 // png_bytep rp;
3318 // png_bytep lp;
3319 // png_uint_32 i;
3320 // bpp = (row_info->pixel_depth + 7) >> 3;
3321 // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3322 // i < row_info->rowbytes; i++, rp++, lp++)
3323 // {
3324 // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3325 // }
3326 _asm {
3327 mov ebx, diff
3328 mov edi, row
3329 cmp ebx, FullLength
3330 jnb dsub1end
3331 mov esi, edi // lp = row
3332 xor eax, eax
3333 add edi, bpp // rp = row + bpp
3334dsub1lp:
3335 mov al, [esi+ebx]
3336 add [edi+ebx], al
3337 inc ebx
3338 cmp ebx, FullLength
3339 jb dsub1lp
3340dsub1end:
3341 } // end _asm block
3342 }
3343 return;
3344
3345 case 6:
3346 case 7:
3347 case 4:
3348 case 5:
3349 {
3350 ShiftBpp.use = bpp << 3;
3351 ShiftRem.use = 64 - ShiftBpp.use;
3352 _asm {
3353 mov edi, row
3354 mov ebx, diff
3355 mov esi, edi // lp = row
3356 add edi, bpp // rp = row + bpp
3357 // PRIME the pump (load the first Raw(x-bpp) data set)
3358 movq mm1, [edi+ebx-8]
3359dsub4lp:
3360 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3361 // no need for mask; shift clears inactive bytes
3362 movq mm0, [edi+ebx]
3363 paddb mm0, mm1
3364 // Add 2nd active group
3365 movq mm1, mm0 // mov updated Raws to mm1
3366 psllq mm1, ShiftBpp // shift data to position correctly
3367 // there is no need for any mask
3368 // since shift clears inactive bits/bytes
3369 add ebx, 8
3370 paddb mm0, mm1
3371 cmp ebx, MMXLength
3372 movq [edi+ebx-8], mm0
3373 movq mm1, mm0 // Prep for doing 1st add at top of loop
3374 jb dsub4lp
3375 } // end _asm block
3376 }
3377 break;
3378
3379 case 2:
3380 {
3381 ActiveMask.use = 0x00000000ffff0000;
3382 ShiftBpp.use = 16; // == 2 * 8
3383 ShiftRem.use = 48; // == 64 - 16
3384 _asm {
3385 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3386 mov ebx, diff
3387 movq mm6, mm7
3388 mov edi, row
3389 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3390 // byte group
3391 mov esi, edi // lp = row
3392 movq mm5, mm6
3393 add edi, bpp // rp = row + bpp
3394 psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
3395 // byte group
3396 // PRIME the pump (load the first Raw(x-bpp) data set)
3397 movq mm1, [edi+ebx-8]
3398dsub2lp:
3399 // Add 1st active group
3400 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3401 // no need for mask; shift clears inactive
3402 // bytes
3403 movq mm0, [edi+ebx]
3404 paddb mm0, mm1
3405 // Add 2nd active group
3406 movq mm1, mm0 // mov updated Raws to mm1
3407 psllq mm1, ShiftBpp // shift data to position correctly
3408 pand mm1, mm7 // mask to use only 2nd active group
3409 paddb mm0, mm1
3410 // Add 3rd active group
3411 movq mm1, mm0 // mov updated Raws to mm1
3412 psllq mm1, ShiftBpp // shift data to position correctly
3413 pand mm1, mm6 // mask to use only 3rd active group
3414 paddb mm0, mm1
3415 // Add 4th active group
3416 movq mm1, mm0 // mov updated Raws to mm1
3417 psllq mm1, ShiftBpp // shift data to position correctly
3418 pand mm1, mm5 // mask to use only 4th active group
3419 add ebx, 8
3420 paddb mm0, mm1
3421 cmp ebx, MMXLength
3422 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3423 movq mm1, mm0 // Prep for doing 1st add at top of loop
3424 jb dsub2lp
3425 } // end _asm block
3426 }
3427 break;
3428 case 8:
3429 {
3430 _asm {
3431 mov edi, row
3432 mov ebx, diff
3433 mov esi, edi // lp = row
3434 add edi, bpp // rp = row + bpp
3435 mov ecx, MMXLength
3436 movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3437 // Raw(x-bpp) data set)
3438 and ecx, 0x0000003f // calc bytes over mult of 64
3439dsub8lp:
3440 movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3441 paddb mm0, mm7
3442 movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3443 movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3444 // Now mm0 will be used as Raw(x-bpp) for
3445 // the 2nd group of 8 bytes. This will be
3446 // repeated for each group of 8 bytes with
3447 // the 8th group being used as the Raw(x-bpp)
3448 // for the 1st group of the next loop.
3449 paddb mm1, mm0
3450 movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3451 movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3452 paddb mm2, mm1
3453 movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3454 movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3455 paddb mm3, mm2
3456 movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3457 movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3458 paddb mm4, mm3
3459 movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3460 movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3461 paddb mm5, mm4
3462 movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3463 movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3464 paddb mm6, mm5
3465 movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3466 movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3467 add ebx, 64
3468 paddb mm7, mm6
3469 cmp ebx, ecx
3470 movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3471 jb dsub8lp
3472 cmp ebx, MMXLength
3473 jnb dsub8lt8
3474dsub8lpA:
3475 movq mm0, [edi+ebx]
3476 add ebx, 8
3477 paddb mm0, mm7
3478 cmp ebx, MMXLength
3479 movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
3480 movq mm7, mm0 // Move calculated Raw(x) data to mm1 to
3481 // be the new Raw(x-bpp) for the next loop
3482 jb dsub8lpA
3483dsub8lt8:
3484 } // end _asm block
3485 }
3486 break;
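      // Editor's note: with bpp == 8 the left neighbor is exactly the
      // previous 8-byte group, so each freshly computed group feeds the
      // next paddb directly, and the 8-way unroll above keeps that
      // serial chain in registers instead of reloading it each pass:
      //    mm0 = load(x)     + mm7;  store(x)     // 1st group
      //    mm1 = load(x + 8) + mm0;  store(x + 8) // 2nd group, etc.,
      // with the 8th result becoming mm7 for the next pass.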
3487
3488 default: // bpp greater than 8 bytes
3489 {
3490 _asm {
3491 mov ebx, diff
3492 mov edi, row
3493 mov esi, edi // lp = row
3494 add edi, bpp // rp = row + bpp
3495dsubAlp:
3496 movq mm0, [edi+ebx]
3497 movq mm1, [esi+ebx]
3498 add ebx, 8
3499 paddb mm0, mm1
3500 cmp ebx, MMXLength
3501 movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
3502 // add ebx
3503 jb dsubAlp
3504 } // end _asm block
3505 }
3506 break;
3507
3508 } // end switch ( bpp )
3509
3510 _asm {
3511 mov ebx, MMXLength
3512 mov edi, row
3513 cmp ebx, FullLength
3514 jnb dsubend
3515 mov esi, edi // lp = row
3516 xor eax, eax
3517 add edi, bpp // rp = row + bpp
3518dsublp2:
3519 mov al, [esi+ebx]
3520 add [edi+ebx], al
3521 inc ebx
3522 cmp ebx, FullLength
3523 jb dsublp2
3524dsubend:
3525 emms // End MMX instructions; prep for possible FP instrs.
3526 } // end _asm block
3527}
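
// Editor's note: per byte the Sub decode above is simply
//    Raw(x) = Sub(x) + Raw(x-bpp)
// a serial recurrence with period bpp; the MMX cases resolve it by
// adding the just-updated chunk, shifted left by bpp bytes, once per
// bpp-byte group within each aligned 8-byte chunk.  A minimal C
// sketch (the name is illustrative, not libpng API):
static void
png_sub_decode_sketch(png_bytep row, png_uint_32 rowbytes, int bpp)
{
   png_uint_32 x;
   // the first bpp bytes have no left neighbor and are already Raw
   for (x = (png_uint_32)bpp; x < rowbytes; x++)
      row[x] = (png_byte)((row[x] + row[x - bpp]) & 0xff);
}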
3528
3529// Optimized code for PNG Up filter decoder
3530void
3531png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3532 png_bytep prev_row)
3533{
3534 png_uint_32 len;
3535 len = row_info->rowbytes; // # of bytes to filter
3536 _asm {
3537 mov edi, row
3538 // get # of bytes to alignment
3539 mov ecx, edi
3540 xor ebx, ebx
3541 add ecx, 0x7
3542 xor eax, eax
3543 and ecx, 0xfffffff8
3544 mov esi, prev_row
3545 sub ecx, edi
3546 jz dupgo
3547 // fix alignment
3548duplp1:
3549 mov al, [edi+ebx]
3550 add al, [esi+ebx]
3551 inc ebx
3552 cmp ebx, ecx
3553 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3554 jb duplp1
3555dupgo:
3556 mov ecx, len
3557 mov edx, ecx
3558 sub edx, ebx // subtract alignment fix
3559 and edx, 0x0000003f // calc bytes over mult of 64
3560 sub ecx, edx // drop over bytes from length
3561 // Unrolled loop - use all MMX registers and interleave to reduce
3562 // number of branch instructions (loops) and reduce partial stalls
3563duploop:
3564 movq mm1, [esi+ebx]
3565 movq mm0, [edi+ebx]
3566 movq mm3, [esi+ebx+8]
3567 paddb mm0, mm1
3568 movq mm2, [edi+ebx+8]
3569 movq [edi+ebx], mm0
3570 paddb mm2, mm3
3571 movq mm5, [esi+ebx+16]
3572 movq [edi+ebx+8], mm2
3573 movq mm4, [edi+ebx+16]
3574 movq mm7, [esi+ebx+24]
3575 paddb mm4, mm5
3576 movq mm6, [edi+ebx+24]
3577 movq [edi+ebx+16], mm4
3578 paddb mm6, mm7
3579 movq mm1, [esi+ebx+32]
3580 movq [edi+ebx+24], mm6
3581 movq mm0, [edi+ebx+32]
3582 movq mm3, [esi+ebx+40]
3583 paddb mm0, mm1
3584 movq mm2, [edi+ebx+40]
3585 movq [edi+ebx+32], mm0
3586 paddb mm2, mm3
3587 movq mm5, [esi+ebx+48]
3588 movq [edi+ebx+40], mm2
3589 movq mm4, [edi+ebx+48]
3590 movq mm7, [esi+ebx+56]
3591 paddb mm4, mm5
3592 movq mm6, [edi+ebx+56]
3593 movq [edi+ebx+48], mm4
3594 add ebx, 64
3595 paddb mm6, mm7
3596 cmp ebx, ecx
3597 movq [edi+ebx-8], mm6 // (the +56 store) movq does not affect
3598 // flags; -8 to offset add ebx
3599 jb duploop
3600
3601 cmp edx, 0 // Test for bytes over mult of 64
3602 jz dupend
3603
3604
3605 // 2 lines added by lcreeve@netins.net
3606 // (mail 11 Jul 98 in png-implement list)
3607 cmp edx, 8 //test for less than 8 bytes
3608 jb duplt8
3609
3610
3611 add ecx, edx
3612 and edx, 0x00000007 // calc bytes over mult of 8
3613 sub ecx, edx // drop over bytes from length
3614 jz duplt8
3615 // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3616duplpA:
3617 movq mm1, [esi+ebx]
3618 movq mm0, [edi+ebx]
3619 add ebx, 8
3620 paddb mm0, mm1
3621 cmp ebx, ecx
3622 movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3623 jb duplpA
3624 cmp edx, 0 // Test for bytes over mult of 8
3625 jz dupend
3626duplt8:
3627 xor eax, eax
3628 add ecx, edx // move over byte count into counter
3629 // Loop using x86 registers to update remaining bytes
3630duplp2:
3631 mov al, [edi + ebx]
3632 add al, [esi + ebx]
3633 inc ebx
3634 cmp ebx, ecx
3635 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3636 jb duplp2
3637dupend:
3638 // Conversion of filtered row completed
3639 emms // End MMX instructions; prep for possible FP instrs.
3640 } // end _asm block
3641}
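
// Editor's note: Up has no intra-row dependence at all,
//    Raw(x) = Up(x) + Prior(x)
// which is what permits the 64-byte unrolled, interleaved loop above.
// A minimal C sketch (the name is illustrative, not libpng API):
static void
png_up_decode_sketch(png_bytep row, png_bytep prev_row, png_uint_32 rowbytes)
{
   png_uint_32 x;
   for (x = 0; x < rowbytes; x++)
      row[x] = (png_byte)((row[x] + prev_row[x]) & 0xff);
}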
3642
3643
3644// Optimized png_read_filter_row routines
3645void
3646png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3647 row, png_bytep prev_row, int filter)
3648{
3649#ifdef PNG_DEBUG
3650 char filnm[6];
3651#endif
3652 #define UseMMX 1
3653
3654 if (mmx_supported == 2)
3655 mmx_supported = mmxsupport();
3656
3657 if (!mmx_supported)
3658 {
3659 png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter);
3660 return;
3661 }
3662
3663#ifdef PNG_DEBUG
3664 png_debug(1, "in png_read_filter_row\n");
3665#if (UseMMX == 1)
3666 png_debug1(0,"%s, ", "MMX");
3667#else
3668 png_debug1(0,"%s, ", "x86");
3669#endif
3670 switch (filter)
3671 {
3672 case 0: sprintf(filnm, "None ");
3673 break;
3674 case 1: sprintf(filnm, "Sub ");
3675 break;
3676 case 2: sprintf(filnm, "Up ");
3677 break;
3678 case 3: sprintf(filnm, "Avg ");
3679 break;
3680 case 4: sprintf(filnm, "Paeth");
3681 break;
3682 default: sprintf(filnm, "Unknw");
3683 break;
3684 }
3685 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3686 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3687 (int)((row_info->pixel_depth + 7) >> 3));
3688 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3689#endif
3690
3691 switch (filter)
3692 {
3693 case PNG_FILTER_VALUE_NONE:
3694 break;
3695 case PNG_FILTER_VALUE_SUB:
3696 {
3697#if (UseMMX == 1)
3698 if ((row_info->pixel_depth > 8) &&
3699 (row_info->rowbytes >= 128) )
3700 {
3701 png_read_filter_row_mmx_sub(row_info, row);
3702 }
3703 else
3704#endif
3705 {
3706 png_uint_32 i;
3707 png_uint_32 istop = row_info->rowbytes;
3708 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3709 png_bytep rp = row + bpp;
3710 png_bytep lp = row;
3711
3712 for (i = bpp; i < istop; i++)
3713 {
3714 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3715 rp++;
3716 }
3717 } //end !UseMMX
3718 break;
3719 }
3720 case PNG_FILTER_VALUE_UP:
3721 {
3722#if (UseMMX == 1)
3723 if ((row_info->pixel_depth > 8) &&
3724 (row_info->rowbytes >= 128) )
3725 {
3726 png_read_filter_row_mmx_up(row_info, row, prev_row);
3727 } //end if UseMMX
3728 else
3729#endif
3730 {
3731 png_bytep rp;
3732 png_bytep pp;
3733 png_uint_32 i;
3734 for (i = 0, rp = row, pp = prev_row;
3735 i < row_info->rowbytes; i++, rp++, pp++)
3736 {
3737 *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
3738 }
3739 } //end !UseMMX
3740 break;
3741 }
3742 case PNG_FILTER_VALUE_AVG:
3743 {
3744#if (UseMMX == 1)
3745 if ((row_info->pixel_depth > 8) &&
3746 (row_info->rowbytes >= 128) )
3747 {
3748 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3749 } //end if UseMMX
3750 else
3751#endif
3752 {
3753 png_uint_32 i;
3754 png_bytep rp = row;
3755 png_bytep pp = prev_row;
3756 png_bytep lp = row;
3757 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3758 png_uint_32 istop = row_info->rowbytes - bpp;
3759
3760 for (i = 0; i < bpp; i++)
3761 {
3762 *rp = (png_byte)(((int)(*rp) +
3763 ((int)(*pp++) >> 1)) & 0xff);
3764 rp++;
3765 }
3766
3767 for (i = 0; i < istop; i++)
3768 {
3769 *rp = (png_byte)(((int)(*rp) +
3770 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3771 rp++;
3772 }
3773 } //end !UseMMX
3774 break;
3775 }
3776 case PNG_FILTER_VALUE_PAETH:
3777 {
3778#if (UseMMX == 1)
3779 if ((row_info->pixel_depth > 8) &&
3780 (row_info->rowbytes >= 128) )
3781 {
3782 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3783 } //end if UseMMX
3784 else
3785#endif
3786 {
3787 png_uint_32 i;
3788 png_bytep rp = row;
3789 png_bytep pp = prev_row;
3790 png_bytep lp = row;
3791 png_bytep cp = prev_row;
3792 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3793 png_uint_32 istop=row_info->rowbytes - bpp;
3794
3795 for (i = 0; i < bpp; i++)
3796 {
3797 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3798 rp++;
3799 }
3800
3801 for (i = 0; i < istop; i++) // use leftover rp,pp
3802 {
3803 int a, b, c, pa, pb, pc, p;
3804
3805 a = *lp++;
3806 b = *pp++;
3807 c = *cp++;
3808
3809 p = b - c;
3810 pc = a - c;
3811
3812#ifdef PNG_USE_ABS
3813 pa = abs(p);
3814 pb = abs(pc);
3815 pc = abs(p + pc);
3816#else
3817 pa = p < 0 ? -p : p;
3818 pb = pc < 0 ? -pc : pc;
3819 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3820#endif
3821
3822 /*
3823 if (pa <= pb && pa <= pc)
3824 p = a;
3825 else if (pb <= pc)
3826 p = b;
3827 else
3828 p = c;
3829 */
3830
3831 p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
3832
3833 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3834 rp++;
3835 }
3836 } //end !UseMMX
3837 break;
3838 }
3839 default:
3840 png_error(png_ptr, "Bad adaptive filter type");
3841 break;
3842 }
3843}
3844#endif