/* pngvcrd.c - assembler version of utilities to read a PNG file
 *
 * For Intel CPU and Microsoft Visual C++ compiler
 *
 * libpng 1.0.4 - September 18, 1999
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998, Intel Corporation
 * Copyright (c) 1998, 1999 Glenn Randers-Pehrson
 *
 * Contributed by Nirav Chhatrapati, INTEL Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 */

#define PNG_INTERNAL
#include "png.h"

#ifdef PNG_ASSEMBLER_CODE_SUPPORTED

static int mmx_supported=2;
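/* 2 means "not yet checked"; png_do_read_interlace() calls mmxsupport()
 * once and caches the result here as 0 (no MMX) or 1 (MMX present).
 */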

void
png_read_filter_row_c(png_structp png_ptr, png_row_infop row_info,
   png_bytep row, png_bytep prev_row, int filter);

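/* Runtime MMX detection: the ID bit (bit 21) of EFLAGS can be toggled only
 * on processors that implement CPUID; when CPUID exists, function 1 returns
 * the feature flags in edx, where bit 23 indicates MMX support.
 */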
static int mmxsupport()
{
   int mmx_supported_local = 0;

   _asm {
      pushfd               //Save Eflag to stack
      pop eax              //Get Eflag from stack into eax
      mov ecx, eax         //Make another copy of Eflag in ecx
      xor eax, 0x200000    //Toggle ID bit in Eflag [i.e. bit(21)]
      push eax             //Save modified Eflag back to stack

      popfd                //Restore modified value back to Eflag reg
      pushfd               //Save Eflag to stack
      pop eax              //Get Eflag from stack
      xor eax, ecx         //Compare the new Eflag with the original Eflag
      jz NOT_SUPPORTED     //If the same, CPUID instruction is not supported;
                           //skip the following instructions and jump to the
                           //NOT_SUPPORTED label

      xor eax, eax         //Set eax to zero

      _asm _emit 0x0f      //CPUID instruction (two bytes opcode)
      _asm _emit 0xa2

      cmp eax, 1           //make sure eax returned a non-zero value
      jl NOT_SUPPORTED     //If eax is zero, mmx not supported

      xor eax, eax         //set eax to zero
      inc eax              //Now increment eax to 1.  This instruction is
                           //faster than the instruction "mov eax, 1"

      _asm _emit 0x0f      //CPUID instruction
      _asm _emit 0xa2

      and edx, 0x00800000  //mask out all bits but the MMX bit [bit(23)]
      cmp edx, 0           //0 = mmx not supported
      jz NOT_SUPPORTED     //non-zero = yes, mmx IS supported

      mov mmx_supported_local, 1    //set return value to 1

NOT_SUPPORTED:
      mov eax, mmx_supported_local  //move return value to eax

   }

   //mmx_supported_local=0;  // test code: force the no-MMX path
   //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);

   return mmx_supported_local;
}

/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined,
   a zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask. */
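/* For example, mask 0xaa (binary 10101010) combines pixels 0, 2, 4, and 6 of
   each 8-pixel group, because the test below starts at the 0x80 bit and
   shifts right by one bit per pixel. */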

/* Use this routine for X86 platform - uses faster MMX routine if machine
   supports MMX */

void
png_combine_row(png_structp png_ptr, png_bytep row,
   int mask)
{
   //int mmx_supported=0;  // another test: force the C (no-MMX) path here
   png_debug(1,"in png_combine_row_asm\n");
   //if (mmx_supported==2)
   //   mmx_supported=mmxsupport();

   if (mask == 0xff)
   {
      png_memcpy(row, png_ptr->row_buf + 1,
         (png_size_t)((png_ptr->width *
         png_ptr->row_info.pixel_depth + 7) >> 3));
   }
   else
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 1:
         {
            png_bytep sp;
            png_bytep dp;
            int s_inc, s_start, s_end;
            int m;
            int shift;
            png_uint_32 i;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }
            else
#endif
            {
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;

                  value = (*sp >> shift) & 0x1;
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
         case 2:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
         case 4:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
         case 8:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int m;
            int diff, unmask;

            __int64 mask0=0x0102040810204080;

            if (mmx_supported)
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               m = 0x80;
               unmask = ~mask;
               len  = png_ptr->width &~7;   //reduce to multiple of 8
               diff = png_ptr->width & 7;   //amount lost
               _asm {
                  movd       mm7, unmask    //load bit pattern
                  psubb      mm6,mm6        //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7        //fill register with 8 masks

                  movq       mm0,mask0

                  pand       mm0,mm7        //nonzero if keep byte
                  pcmpeqb    mm0,mm6        //zeros->1s, v versa
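                  // mm0 now holds 0xff in each byte whose pixel is to be
                  // combined and 0x00 elsewhere, so the loop below blends
                  // 8 pixels at once:  dst = (src & mm0) | (dst & ~mm0)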
302
303 mov ecx,len //load length of line
304 mov esi,srcptr //load source
305 mov ebx,dstptr //load dest
306 cmp ecx,0 //lcr
307 je mainloop8end
308
309mainloop8:
310 movq mm4,[esi]
311 pand mm4,mm0
312 movq mm6,mm0
313 pandn mm6,[ebx]
314 por mm4,mm6
315 movq [ebx],mm4
316
317 add esi,8 //inc by 8 bytes processed
318 add ebx,8
319 sub ecx,8 //dec by 8 pixels processed
320
321 ja mainloop8
322mainloop8end:
323
324 mov ecx,diff
325 cmp ecx,0
326 jz end8
327
328 mov edx,mask
329 sal edx,24 //make low byte the high byte
330
331secondloop8:
332 sal edx,1 //move high bit to CF
333 jnc skip8 //if CF = 0
334 mov al,[esi]
335 mov [ebx],al
336skip8:
337 inc esi
338 inc ebx
339
340 dec ecx
341 jnz secondloop8
342end8:
343 emms
344 }
345 }
346 else /* mmx _not supported - Use modified C routine*/
347 {
348 register unsigned int incr1, initial_val, final_val;
349 png_size_t pixel_bytes;
350 png_uint_32 i;
351 //if ((mask != 0x0f) && (mask != 0x33))
352 register int disp = png_pass_inc[png_ptr->pass];
353 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
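               // Adam7 bookkeeping: png_pass_inc[] holds the column spacing
               // of each interlace pass and offset_table[] its starting
               // column, so this fallback copies only the pixels that the
               // current pass supplies.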
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  //end 8bpp

         case 16:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask1=0x0101020204040808,
                    mask0=0x1010202040408080;

            if (mmx_supported)
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm {
                  movd       mm7, unmask    //load bit pattern
                  psubb      mm6,mm6        //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7        //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1

                  pand mm0,mm7
                  pand mm1,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6

                  mov ecx,len              //load length of line
                  mov esi,srcptr           //load source
                  mov ebx,dstptr           //load dest
                  cmp ecx,0                //lcr
                  jz mainloop16end

mainloop16:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  add esi,16               //inc by 16 bytes processed
                  add ebx,16
                  sub ecx,8                //dec by 8 pixels processed

                  ja mainloop16
mainloop16end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end16

                  mov edx,mask
                  sal edx,24               //make low byte the high byte

secondloop16:
                  sal edx,1                //move high bit to CF
                  jnc skip16               //if CF = 0
                  mov ax,[esi]
                  mov [ebx],ax
skip16:
                  add esi,2
                  add ebx,2

                  dec ecx
                  jnz secondloop16

end16:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }
         case 24:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask2=0x0101010202020404,  //24bpp
                    mask1=0x0408080810101020,
                    mask0=0x2020404040808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

            if (mmx_supported)
            {
               _asm {
                  movd       mm7, unmask    //load bit pattern
                  psubb      mm6,mm6        //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7        //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6

                  mov ecx,len              //load length of line
                  mov esi,srcptr           //load source
                  mov ebx,dstptr           //load dest
                  cmp ecx,0
                  jz mainloop24end

mainloop24:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm4,mm2
                  movq mm7,[ebx+16]
                  pandn mm4,mm7
                  por mm6,mm4
                  movq [ebx+16],mm6

                  add esi,24               //inc by 24 bytes processed
                  add ebx,24
                  sub ecx,8                //dec by 8 pixels processed

                  ja mainloop24
mainloop24end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end24

                  mov edx,mask
                  sal edx,24               //make low byte the high byte

secondloop24:
                  sal edx,1                //move high bit to CF
                  jnc skip24               //if CF = 0
                  mov ax,[esi]
                  mov [ebx],ax
                  xor eax,eax
                  mov al,[esi+2]
                  mov [ebx+2],al
skip24:
                  add esi,3
                  add ebx,3

                  dec ecx
                  jnz secondloop24

end24:
                  emms

               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  //end 24bpp
         case 32:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask3=0x0101010102020202,  //32bpp
                    mask2=0x0404040408080808,
                    mask1=0x1010101020202020,
                    mask0=0x4040404080808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

            if (mmx_supported)
            {
               _asm {
                  movd       mm7, unmask    //load bit pattern
                  psubb      mm6,mm6        //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7        //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2
                  movq mm3,mask3

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7
                  pand mm3,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6
                  pcmpeqb mm3,mm6

                  mov ecx,len              //load length of line
                  mov esi,srcptr           //load source
                  mov ebx,dstptr           //load dest

                  cmp ecx,0                //lcr
                  jz mainloop32end

mainloop32:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm4,mm2
                  movq mm7,[ebx+16]
                  pandn mm4,mm7
                  por mm6,mm4
                  movq [ebx+16],mm6

                  movq mm7,[esi+24]
                  pand mm7,mm3
                  movq mm5,mm3
                  movq mm4,[ebx+24]
                  pandn mm5,mm4
                  por mm7,mm5
                  movq [ebx+24],mm7

                  add esi,32               //inc by 32 bytes processed
                  add ebx,32
                  sub ecx,8                //dec by 8 pixels processed

                  ja mainloop32
mainloop32end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end32

                  mov edx,mask
                  sal edx,24               //make low byte the high byte

secondloop32:
                  sal edx,1                //move high bit to CF
                  jnc skip32               //if CF = 0
                  mov eax,[esi]
                  mov [ebx],eax
skip32:
                  add esi,4
                  add ebx,4

                  dec ecx
                  jnz secondloop32

end32:
                  emms

               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  //end 32bpp

         case 48:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask5=0x0101010101010202,
                    mask4=0x0202020204040404,
                    mask3=0x0404080808080808,
                    mask2=0x1010101010102020,
                    mask1=0x2020202040404040,
                    mask0=0x4040808080808080;

            if (mmx_supported)
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm {
                  movd       mm7, unmask    //load bit pattern
                  psubb      mm6,mm6        //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7        //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2
                  movq mm3,mask3
                  movq mm4,mask4
                  movq mm5,mask5

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7
                  pand mm3,mm7
                  pand mm4,mm7
                  pand mm5,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6
                  pcmpeqb mm3,mm6
                  pcmpeqb mm4,mm6
                  pcmpeqb mm5,mm6

                  mov ecx,len              //load length of line
                  mov esi,srcptr           //load source
                  mov ebx,dstptr           //load dest

                  cmp ecx,0
                  jz mainloop48end

mainloop48:
                  movq mm7,[esi]
                  pand mm7,mm0
                  movq mm6,mm0
                  pandn mm6,[ebx]
                  por mm7,mm6
                  movq [ebx],mm7

                  movq mm6,[esi+8]
                  pand mm6,mm1
                  movq mm7,mm1
                  pandn mm7,[ebx+8]
                  por mm6,mm7
                  movq [ebx+8],mm6

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm7,mm2
                  pandn mm7,[ebx+16]
                  por mm6,mm7
                  movq [ebx+16],mm6

                  movq mm7,[esi+24]
                  pand mm7,mm3
                  movq mm6,mm3
                  pandn mm6,[ebx+24]
                  por mm7,mm6
                  movq [ebx+24],mm7

                  movq mm6,[esi+32]
                  pand mm6,mm4
                  movq mm7,mm4
                  pandn mm7,[ebx+32]
                  por mm6,mm7
                  movq [ebx+32],mm6

                  movq mm7,[esi+40]
                  pand mm7,mm5
                  movq mm6,mm5
                  pandn mm6,[ebx+40]
                  por mm7,mm6
                  movq [ebx+40],mm7

                  add esi,48               //inc by 48 bytes processed
                  add ebx,48
                  sub ecx,8                //dec by 8 pixels processed

                  ja mainloop48
mainloop48end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end48

                  mov edx,mask
                  sal edx,24               //make low byte the high byte

secondloop48:
                  sal edx,1                //move high bit to CF
                  jnc skip48               //if CF = 0
                  mov eax,[esi]            //copy all 6 bytes of the pixel
                  mov [ebx],eax
                  mov ax,[esi+4]
                  mov [ebx+4],ax
skip48:
                  add esi,6
                  add ebx,6

                  dec ecx
                  jnz secondloop48

end48:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */
            break;  // end 48 bpp
         }
         default:
         {
            png_bytep sptr;
            png_bytep dp;
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
            unsigned int i;
            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
            register unsigned int incr1, initial_val, final_val;
            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
            {
               png_memcpy(dp, sptr, pixel_bytes);
               sptr += incr1;
               dp += incr1;
            }

            break;
         }
      }
   }
}


#if defined(PNG_READ_INTERLACING_SUPPORTED)

void
png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass,
   png_uint_32 transformations)
{

   png_debug(1,"in png_do_read_interlace\n");
   if (mmx_supported==2)
      mmx_supported=mmxsupport();

   if (row != NULL && row_info != NULL)
   {
      png_uint_32 final_width;

      final_width = row_info->width * png_pass_inc[pass];
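      /* Each pixel of this pass will be replicated png_pass_inc[pass] times
         across the expanded row (the "rectangle" method of progressive
         display), so the row grows to final_width pixels. */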

      switch (row_info->pixel_depth)
      {
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
         default:  // This is the place where the routine is modified
         {
            __int64 const4 = 0x0000000000FFFFFF;
            __int64 const5 = 0x000000FFFFFF0000;
            __int64 const6 = 0x00000000000000FF;
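            // const4 masks off all but the low three bytes of a qword and
            // const6 all but the low byte; the loops below use them to
            // isolate one 3-byte pixel while replicating it.  (const5 names
            // the middle 3-byte field but is unused in this version.)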
            //int mmx_supported = 1;

            png_bytep sptr, dp;
            png_uint_32 i;
            png_size_t pixel_bytes;

            int width = row_info->width;

            pixel_bytes = (row_info->pixel_depth >> 3);

            sptr = row + (row_info->width - 1) * pixel_bytes;
            dp = row + (final_width - 1) * pixel_bytes;
            // New code by Nirav Chhatrapati - Intel Corporation

            if (mmx_supported)  // If machine supports MMX technology use MMX routine
            {
               if (pixel_bytes == 3)
               {
                  if ((pass == 0) || (pass == 1))
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 21          // (png_pass_inc[pass] - 1)*pixel_bytes

loop_pass0:
                        movd mm0, [esi]      ; X X X X X val2 val1 val0
                        pand mm0, const4     ; 0 0 0 0 0 val2 val1 val0
                        movq mm1, mm0        ; 0 0 0 0 0 val2 val1 val0
                        psllq mm0, 16        ; 0 0 0 val2 val1 val0 0 0
                        movq mm2, mm0        ; 0 0 0 val2 val1 val0 0 0
                        psllq mm0, 24        ; val2 val1 val0 0 0 0 0 0
                        psrlq mm1, 8         ; 0 0 0 0 0 0 val2 val1
                        por mm0, mm2         ; val2 val1 val0 val2 val1 val0 0 0
                        por mm0, mm1         ; val2 val1 val0 val2 val1 val0 val2 val1
                        movq mm3, mm0        ; val2 val1 val0 val2 val1 val0 val2 val1
                        psllq mm0, 16        ; val0 val2 val1 val0 val2 val1 0 0
                        movq mm4, mm3        ; val2 val1 val0 val2 val1 val0 val2 val1
                        punpckhdq mm3, mm0   ; val0 val2 val1 val0 val2 val1 val0 val2
                        movq [edi+16], mm4
                        psrlq mm0, 32        ; 0 0 0 0 val0 val2 val1 val0
                        movq [edi+8], mm3
                        punpckldq mm0, mm4   ; val1 val0 val2 val1 val0 val2 val1 val0
                        sub esi, 3
                        movq [edi], mm0
                        sub edi, 24
                        //sub esi, 3
                        dec ecx
                        jnz loop_pass0

                        EMMS
                     }
                  }
                  else if ((pass == 2) || (pass == 3))
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 9           // (png_pass_inc[pass] - 1)*pixel_bytes

loop_pass2:
                        movd mm0, [esi]      ; X X X X X val2 val1 val0
                        pand mm0, const4     ; 0 0 0 0 0 val2 val1 val0
                        movq mm1, mm0        ; 0 0 0 0 0 val2 val1 val0
                        psllq mm0, 16        ; 0 0 0 val2 val1 val0 0 0
                        movq mm2, mm0        ; 0 0 0 val2 val1 val0 0 0
                        psllq mm0, 24        ; val2 val1 val0 0 0 0 0 0
                        psrlq mm1, 8         ; 0 0 0 0 0 0 val2 val1
                        por mm0, mm2         ; val2 val1 val0 val2 val1 val0 0 0
                        por mm0, mm1         ; val2 val1 val0 val2 val1 val0 val2 val1
                        movq [edi+4], mm0    ; move to memory
                        psrlq mm0, 16        ; 0 0 val2 val1 val0 val2 val1 val0
                        movd [edi], mm0      ; move to memory
                        sub esi, 3
                        sub edi, 12
                        dec ecx
                        jnz loop_pass2

                        EMMS
                     }
                  }
                  else /* if ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 1) << 1) - 8;
                     width -= width_mmx;
                     if (width_mmx)
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width_mmx
                        sub esi, 3
                        sub edi, 9

loop_pass4:
                        movq mm0, [esi]      ; X X v2 v1 v0 v5 v4 v3
                        movq mm7, mm0        ; X X v2 v1 v0 v5 v4 v3
                        movq mm6, mm0        ; X X v2 v1 v0 v5 v4 v3
                        psllq mm0, 24        ; v1 v0 v5 v4 v3 0 0 0
                        pand mm7, const4     ; 0 0 0 0 0 v5 v4 v3
                        psrlq mm6, 24        ; 0 0 0 X X v2 v1 v0
                        por mm0, mm7         ; v1 v0 v5 v4 v3 v5 v4 v3
                        movq mm5, mm6        ; 0 0 0 X X v2 v1 v0
                        psllq mm6, 8         ; 0 0 X X v2 v1 v0 0
                        movq [edi], mm0      ; move quad to memory
                        psrlq mm5, 16        ; 0 0 0 0 0 X X v2
                        pand mm5, const6     ; 0 0 0 0 0 0 0 v2
                        por mm6, mm5         ; 0 0 X X v2 v1 v0 v2
                        movd [edi+8], mm6    ; move double to memory
                        sub esi, 6
                        sub edi, 12
                        sub ecx, 2
                        jnz loop_pass4

                        EMMS
                     }

                     sptr -= width_mmx*3;
                     dp -= width_mmx*6;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, pixel_bytes);
                           dp -= pixel_bytes;
                        }
                        sptr -= pixel_bytes;
                     }
                  }
               } /* end of pixel_bytes == 3 */

               else if (pixel_bytes == 1)
               {
                  if ((pass == 0) || (pass == 1))
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width_mmx
                        sub edi, 31
                        sub esi, 3

loop1_pass0:
                        movd mm0, [esi]      ; X X X X v0 v1 v2 v3
                        movq mm1, mm0        ; X X X X v0 v1 v2 v3
                        punpcklbw mm0, mm0   ; v0 v0 v1 v1 v2 v2 v3 v3
                        movq mm2, mm0        ; v0 v0 v1 v1 v2 v2 v3 v3
                        punpcklwd mm0, mm0   ; v2 v2 v2 v2 v3 v3 v3 v3
                        movq mm3, mm0        ; v2 v2 v2 v2 v3 v3 v3 v3
                        punpckldq mm0, mm0   ; v3 v3 v3 v3 v3 v3 v3 v3
                        punpckhdq mm3, mm3   ; v2 v2 v2 v2 v2 v2 v2 v2
                        movq [edi], mm0      ; move to memory v3
                        punpckhwd mm2, mm2   ; v0 v0 v0 v0 v1 v1 v1 v1
                        movq [edi+8], mm3    ; move to memory v2
                        movq mm4, mm2        ; v0 v0 v0 v0 v1 v1 v1 v1
                        punpckldq mm2, mm2   ; v1 v1 v1 v1 v1 v1 v1 v1
                        punpckhdq mm4, mm4   ; v0 v0 v0 v0 v0 v0 v0 v0
                        movq [edi+16], mm2   ; move to memory v1
                        movq [edi+24], mm4   ; move to memory v0
                        sub esi, 4
                        sub edi, 32
                        sub ecx, 4
                        jnz loop1_pass0

                        EMMS
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*8;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, pixel_bytes);
                           dp -= pixel_bytes;
                        }
                        sptr -= pixel_bytes;
                     }
                  }

                  else if ((pass == 2) || (pass == 3))
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width_mmx
                        sub edi, 15
                        sub esi, 3

loop1_pass2:
                        movd mm0, [esi]      ; X X X X v0 v1 v2 v3
                        punpcklbw mm0, mm0   ; v0 v0 v1 v1 v2 v2 v3 v3
                        movq mm1, mm0        ; v0 v0 v1 v1 v2 v2 v3 v3
                        punpcklwd mm0, mm0   ; v2 v2 v2 v2 v3 v3 v3 v3
                        punpckhwd mm1, mm1   ; v0 v0 v0 v0 v1 v1 v1 v1
                        movq [edi], mm0      ; move to memory v2 and v3
                        sub esi, 4
                        movq [edi+8], mm1    ; move to memory v1 and v0
                        sub edi, 16
                        sub ecx, 4
                        jnz loop1_pass2

                        EMMS
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*4;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, pixel_bytes);
                           dp -= pixel_bytes;
                        }
                        sptr -= pixel_bytes;
                     }
                  }

                  else  //if ((pass == 4) || (pass == 5))
                  {
                     int width_mmx = ((width >> 3) << 3);
                     width -= width_mmx;
                     if (width_mmx)
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width_mmx
                        sub edi, 15
                        sub esi, 7

loop1_pass4:
                        movq mm0, [esi]      ; v0 v1 v2 v3 v4 v5 v6 v7
                        movq mm1, mm0        ; v0 v1 v2 v3 v4 v5 v6 v7
                        punpcklbw mm0, mm0   ; v4 v4 v5 v5 v6 v6 v7 v7
                        //movq mm1, mm0      ; v0 v0 v1 v1 v2 v2 v3 v3
                        punpckhbw mm1, mm1   ; v0 v0 v1 v1 v2 v2 v3 v3
                        movq [edi+8], mm1    ; move to memory v0 v1 v2 and v3
                        sub esi, 8
                        movq [edi], mm0      ; move to memory v4 v5 v6 and v7
                        //sub esi, 4
                        sub edi, 16
                        sub ecx, 8
                        jnz loop1_pass4

                        EMMS
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*2;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, pixel_bytes);
                           dp -= pixel_bytes;
                        }
                        sptr -= pixel_bytes;
                     }
                  }
               } /* end of pixel_bytes == 1 */

               else if (pixel_bytes == 2)
               {
                  if ((pass == 0) || (pass == 1))
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width_mmx
                        sub esi, 2
                        sub edi, 30

loop2_pass0:
                        movd mm0, [esi]      ; X X X X v1 v0 v3 v2
                        punpcklwd mm0, mm0   ; v1 v0 v1 v0 v3 v2 v3 v2
                        movq mm1, mm0        ; v1 v0 v1 v0 v3 v2 v3 v2
                        punpckldq mm0, mm0   ; v3 v2 v3 v2 v3 v2 v3 v2
                        punpckhdq mm1, mm1   ; v1 v0 v1 v0 v1 v0 v1 v0
                        movq [edi], mm0
                        movq [edi + 8], mm0
                        movq [edi + 16], mm1
                        movq [edi + 24], mm1
                        sub esi, 4
                        sub edi, 32
                        sub ecx, 2
                        jnz loop2_pass0

                        EMMS
                     }

                     sptr -= (width_mmx*2 + 2);
                     dp -= (width_mmx*16 + 2);

                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= pixel_bytes;
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= pixel_bytes;
                           png_memcpy(dp, v, pixel_bytes);
                           //dp -= pixel_bytes;
                        }
                        //sptr -= pixel_bytes;
                     }
                  }

                  else if ((pass == 2) || (pass == 3))
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width_mmx
                        sub esi, 2
                        sub edi, 14

loop2_pass2:
                        movd mm0, [esi]      ; X X X X v1 v0 v3 v2
                        punpcklwd mm0, mm0   ; v1 v0 v1 v0 v3 v2 v3 v2
                        movq mm1, mm0        ; v1 v0 v1 v0 v3 v2 v3 v2
                        punpckldq mm0, mm0   ; v3 v2 v3 v2 v3 v2 v3 v2
                        punpckhdq mm1, mm1   ; v1 v0 v1 v0 v1 v0 v1 v0
                        movq [edi], mm0
                        sub esi, 4
                        movq [edi + 8], mm1
                        //sub esi, 4
                        sub edi, 16
                        sub ecx, 2
                        jnz loop2_pass2

                        EMMS
                     }

                     sptr -= (width_mmx*2 + 2);
                     dp -= (width_mmx*8 + 2);

                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= pixel_bytes;
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= pixel_bytes;
                           png_memcpy(dp, v, pixel_bytes);
                           //dp -= pixel_bytes;
                        }
                        //sptr -= pixel_bytes;
                     }
                  }

                  else  // pass == 4 or 5
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width_mmx
                        sub esi, 2
                        sub edi, 6

loop2_pass4:
                        movd mm0, [esi]      ; X X X X v1 v0 v3 v2
                        punpcklwd mm0, mm0   ; v1 v0 v1 v0 v3 v2 v3 v2
                        sub esi, 4
                        movq [edi], mm0
                        sub edi, 8
                        sub ecx, 2
                        jnz loop2_pass4

                        EMMS
                     }

                     sptr -= (width_mmx*2 + 2);
                     dp -= (width_mmx*4 + 2);

                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= pixel_bytes;
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= pixel_bytes;
                           png_memcpy(dp, v, pixel_bytes);
                           //dp -= pixel_bytes;
                        }
                        //sptr -= pixel_bytes;
                     }
                  }
               } /* end of pixel_bytes == 2 */

               else if (pixel_bytes == 4)
               {
                  if ((pass == 0) || (pass == 1))
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width_mmx
                        sub esi, 4
                        sub edi, 60

loop4_pass0:
                        movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
                        movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
                        punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
                        punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
                        movq [edi], mm0
                        movq [edi + 8], mm0
                        movq [edi + 16], mm0
                        movq [edi + 24], mm0
                        movq [edi + 32], mm1
                        movq [edi + 40], mm1
                        movq [edi + 48], mm1
                        sub esi, 8
                        movq [edi + 56], mm1
                        sub edi, 64
                        sub ecx, 2
                        jnz loop4_pass0

                        EMMS
                     }

                     sptr -= (width_mmx*4 + 4);
                     dp -= (width_mmx*32 + 4);

                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= pixel_bytes;
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= pixel_bytes;
                           png_memcpy(dp, v, pixel_bytes);
                           //dp -= pixel_bytes;
                        }
                        //sptr -= pixel_bytes;
                     }
                  }

                  else if ((pass == 2) || (pass == 3))
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width_mmx
                        sub esi, 4
                        sub edi, 28

loop4_pass2:
                        movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
                        movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
                        punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
                        punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
                        movq [edi], mm0
                        movq [edi + 8], mm0
                        movq [edi + 16], mm1
                        movq [edi + 24], mm1
                        sub esi, 8
                        sub edi, 32
                        sub ecx, 2
                        jnz loop4_pass2

                        EMMS
                     }

                     sptr -= (width_mmx*4 + 4);
                     dp -= (width_mmx*16 + 4);

                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= pixel_bytes;
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= pixel_bytes;
                           png_memcpy(dp, v, pixel_bytes);
                           //dp -= pixel_bytes;
                        }
                        //sptr -= pixel_bytes;
                     }
                  }

                  else  // pass == 4 or 5
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width_mmx
                        sub esi, 4
                        sub edi, 12

loop4_pass4:
                        movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
                        movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
                        punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
                        punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
                        movq [edi], mm0
                        sub esi, 8
                        movq [edi + 8], mm1
                        sub edi, 16
                        sub ecx, 2
                        jnz loop4_pass4

                        EMMS
                     }

                     sptr -= (width_mmx*4 + 4);
                     dp -= (width_mmx*8 + 4);

                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= pixel_bytes;
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= pixel_bytes;
                           png_memcpy(dp, v, pixel_bytes);
                           //dp -= pixel_bytes;
                        }
                        //sptr -= pixel_bytes;
                     }
                  }
               } /* end of pixel_bytes == 4 */

               else if (pixel_bytes == 6)
               {
                  for (i = row_info->width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               } /* end of pixel_bytes == 6 */

               else
               {
                  for (i = row_info->width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
            } /* end of mmx_supported */

            else /* MMX not supported */
            /* use modified C code - takes advantage of inlining of memcpy for
               a constant */
            {
               if (pixel_bytes == 1)
               {
                  for (i = row_info->width; i; i--)
                  {
                     png_byte v[8];
                     int j;

                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 3)
               {
                  for (i = row_info->width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 2)
               {
                  for (i = row_info->width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 4)
               {
                  for (i = row_info->width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 6)
               {
                  for (i = row_info->width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else
               {
                  for (i = row_info->width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }

            } /* end of MMX not supported */
            break;
         }
      }
      row_info->width = final_width;
      row_info->rowbytes = ((final_width *
         (png_uint_32)row_info->pixel_depth + 7) >> 3);
   }
}

#endif



// These variables are utilized in the functions below.  They are declared
// globally here to ensure alignment on 8-byte boundaries.
union uAll {
   __int64 use;
   double  align;
} LBCarryMask = {0x0101010101010101}, HBClearMask = {0x7f7f7f7f7f7f7f7f},
  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
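// ActiveMask, ShiftBpp, and ShiftRem are loaded with bpp-specific values
// before each filter loop below; keeping the __int64 in a union with a
// double gives it 8-byte alignment, so the movq references to these
// variables never straddle an 8-byte boundary.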

// Optimized code for PNG Average filter decoder
void
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
   , png_bytep prev_row)
{
   int bpp;
   png_uint_32 FullLength;
   png_uint_32 MMXLength;
   //png_uint_32 len;
   int diff;
   bpp = (row_info->pixel_depth + 7) >> 3;  // Get # bytes per pixel
   FullLength = row_info->rowbytes;         // # of bytes to filter
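   // Average defilter: Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2).
   // The MMX loops below halve 8 bytes at once (psrlq by 1 plus HBClearMask
   // to drop the bit shifted in from the neighboring byte) and use
   // LBCarryMask to add back a carry wherever both low bits were 1, keeping
   // the per-byte average exact.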
   _asm {
      // Init address pointers and offset
      mov edi, row          // edi ==> Avg(x)
      xor ebx, ebx          // ebx ==> x
      mov edx, edi
      mov esi, prev_row     // esi ==> Prior(x)
      sub edx, bpp          // edx ==> Raw(x-bpp)

      xor eax, eax
      // Compute the Raw value for the first bpp bytes
      // Raw(x) = Avg(x) + (Prior(x)/2)
davgrlp:
      mov al, [esi + ebx]   // Load al with Prior(x)
      inc ebx
      shr al, 1             // divide by 2
      add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
      cmp ebx, bpp
      mov [edi+ebx-1], al   // Write back Raw(x);
                            // mov does not affect flags; -1 to offset inc ebx
      jb davgrlp
      // get # of bytes to alignment
      mov diff, edi         // take start of row
      add diff, ebx         // add bpp
      add diff, 0xf         // add 7 + 8 to incr past alignment boundary
      and diff, 0xfffffff8  // mask to alignment boundary
      sub diff, edi         // subtract from start ==> value ebx at alignment
      jz davggo
      // fix alignment
      // Compute the Raw value for the bytes up to the alignment boundary
      // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
      xor ecx, ecx
davglp1:
      xor eax, eax
      mov cl, [esi + ebx]   // load cl with Prior(x)
      mov al, [edx + ebx]   // load al with Raw(x-bpp)
      add ax, cx
      inc ebx
      shr ax, 1             // divide by 2
      add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
      cmp ebx, diff         // Check if at alignment boundary
      mov [edi+ebx-1], al   // Write back Raw(x);
                            // mov does not affect flags; -1 to offset inc ebx
      jb davglp1            // Repeat until at alignment boundary
davggo:
      mov eax, FullLength
      mov ecx, eax
      sub eax, ebx          // subtract alignment fix
      and eax, 0x00000007   // calc bytes over mult of 8
      sub ecx, eax          // drop over bytes from original length
      mov MMXLength, ecx
   } // end _asm block
   // Now do the math for the rest of the row
   switch ( bpp )
   {
      case 3:
      {
         ActiveMask.use  = 0x0000000000ffffff;
         ShiftBpp.use = 24;    // == 3 * 8
         ShiftRem.use = 40;    // == 64 - 24
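         // With bpp == 3 a qword spans 2 2/3 pixels, so the 8 bytes are
         // updated in three groups (bytes 0-2, 3-5, 6-7); each group can be
         // added only after the previous group's Raw() bytes are final.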
         _asm {
            // Re-init address pointers and offset
            movq mm7, ActiveMask
            mov ebx, diff          // ebx ==> x = offset to alignment boundary
            movq mm5, LBCarryMask
            mov edi, row           // edi ==> Avg(x)
            movq mm4, HBClearMask
            mov esi, prev_row      // esi ==> Prior(x)
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                       // (we correct position in loop below)
davg3lp:
            movq mm0, [edi + ebx]  // Load mm0 with Avg(x)
            // Add (Prev_row/2) to Average
            movq mm3, mm5
            psrlq mm2, ShiftRem    // Correct position Raw(x-bpp) data
            movq mm1, [esi + ebx]  // Load mm1 with Prior(x)
            movq mm6, mm7
            pand mm3, mm1          // get lsb for each prev_row byte
            psrlq mm1, 1           // divide prev_row bytes by 2
            pand mm1, mm4          // clear invalid bit 7 of each byte
            paddb mm0, mm1         // add (Prev_row/2) to Avg for each byte
            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
            movq mm1, mm3          // now use mm1 for getting LBCarrys
            pand mm1, mm2          // get LBCarrys for each byte where both
                                   // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1           // divide raw bytes by 2
            pand mm2, mm4          // clear invalid bit 7 of each byte
            paddb mm2, mm1         // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6          // Leave only Active Group 1 bytes to add to Avg
            paddb mm0, mm2         // add (Raw/2) + LBCarrys to Avg for each Active byte
            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
            psllq mm6, ShiftBpp    // shift the mm6 mask to cover bytes 3-5
            movq mm2, mm0          // mov updated Raws to mm2
            psllq mm2, ShiftBpp    // shift data to position correctly
            movq mm1, mm3          // now use mm1 for getting LBCarrys
            pand mm1, mm2          // get LBCarrys for each byte where both
                                   // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1           // divide raw bytes by 2
            pand mm2, mm4          // clear invalid bit 7 of each byte
            paddb mm2, mm1         // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6          // Leave only Active Group 2 bytes to add to Avg
            paddb mm0, mm2         // add (Raw/2) + LBCarrys to Avg for each Active byte

            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
            psllq mm6, ShiftBpp    // shift the mm6 mask to cover the last two bytes
            movq mm2, mm0          // mov updated Raws to mm2
            psllq mm2, ShiftBpp    // shift data to position correctly
                                   // Data only needs to be shifted once here to
                                   // get the correct x-bpp offset.
            movq mm1, mm3          // now use mm1 for getting LBCarrys
            pand mm1, mm2          // get LBCarrys for each byte where both
                                   // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1           // divide raw bytes by 2
            pand mm2, mm4          // clear invalid bit 7 of each byte
            paddb mm2, mm1         // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6          // Leave only Active Group 2 bytes to add to Avg
            add ebx, 8
            paddb mm0, mm2         // add (Raw/2) + LBCarrys to Avg for each Active byte

            // Now ready to write back to memory
            movq [edi + ebx - 8], mm0
            // Move updated Raw(x) to use as Raw(x-bpp) for next loop
            cmp ebx, MMXLength
            movq mm2, mm0          // mov updated Raw(x) to mm2
            jb davg3lp
         } // end _asm block
      }
      break;
      case 6:
      case 4:
      case 7:
      case 5:
      {
         ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
                                                // appropriate inactive bytes
         ShiftBpp.use = bpp << 3;
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm {
            movq mm4, HBClearMask
            // Re-init address pointers and offset
            mov ebx, diff          // ebx ==> x = offset to alignment boundary
            // Load ActiveMask and clear all bytes except for 1st active group
            movq mm7, ActiveMask
            mov edi, row           // edi ==> Avg(x)
            psrlq mm7, ShiftRem
            mov esi, prev_row      // esi ==> Prior(x)
            movq mm6, mm7
            movq mm5, LBCarryMask
            psllq mm6, ShiftBpp    // Create mask for 2nd active group
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                       // (we correct position in loop below)
davg4lp:
            movq mm0, [edi + ebx]
            psrlq mm2, ShiftRem    // shift data to position correctly
            movq mm1, [esi + ebx]
            // Add (Prev_row/2) to Average
            movq mm3, mm5
            pand mm3, mm1          // get lsb for each prev_row byte
            psrlq mm1, 1           // divide prev_row bytes by 2
            pand mm1, mm4          // clear invalid bit 7 of each byte
            paddb mm0, mm1         // add (Prev_row/2) to Avg for each byte
            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
            movq mm1, mm3          // now use mm1 for getting LBCarrys
            pand mm1, mm2          // get LBCarrys for each byte where both
                                   // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1           // divide raw bytes by 2
            pand mm2, mm4          // clear invalid bit 7 of each byte
            paddb mm2, mm1         // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm7          // Leave only Active Group 1 bytes to add to Avg
            paddb mm0, mm2         // add (Raw/2) + LBCarrys to Avg for each Active byte
            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
            movq mm2, mm0          // mov updated Raws to mm2
            psllq mm2, ShiftBpp    // shift data to position correctly
            add ebx, 8
            movq mm1, mm3          // now use mm1 for getting LBCarrys
            pand mm1, mm2          // get LBCarrys for each byte where both
                                   // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1           // divide raw bytes by 2
            pand mm2, mm4          // clear invalid bit 7 of each byte
            paddb mm2, mm1         // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6          // Leave only Active Group 2 bytes to add to Avg
            paddb mm0, mm2         // add (Raw/2) + LBCarrys to Avg for each Active byte
            cmp ebx, MMXLength
            // Now ready to write back to memory
            movq [edi + ebx - 8], mm0
            // Prep Raw(x-bpp) for next loop
            movq mm2, mm0          // mov updated Raws to mm2
            jb davg4lp
         } // end _asm block
      }
      break;
      case 2:
      {
         ActiveMask.use  = 0x000000000000ffff;
         ShiftBpp.use = 16;    // == 2 * 8
         ShiftRem.use = 48;    // == 64 - 16
         _asm {
            // Load ActiveMask
            movq mm7, ActiveMask
            // Re-init address pointers and offset
            mov ebx, diff          // ebx ==> x = offset to alignment boundary
            movq mm5, LBCarryMask
            mov edi, row           // edi ==> Avg(x)
            movq mm4, HBClearMask
            mov esi, prev_row      // esi ==> Prior(x)
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                       // (we correct position in loop below)
davg2lp:
            movq mm0, [edi + ebx]
            psrlq mm2, ShiftRem    // shift data to position correctly
2229 movq mm1, [esi + ebx]
2230 // Add (Prev_row/2) to Average
2231 movq mm3, mm5
2232 pand mm3, mm1 // get lsb for each prev_row byte
2233 psrlq mm1, 1 // divide prev_row bytes by 2
2234 pand mm1, mm4 // clear invalid bit 7 of each byte
2235 movq mm6, mm7
2236 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2237 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2238 movq mm1, mm3 // now use mm1 for getting LBCarrys
2239 pand mm1, mm2 // get LBCarrys for each byte where both
2240 // lsb's were == 1 (Only valid for active group)
2241 psrlq mm2, 1 // divide raw bytes by 2
2242 pand mm2, mm4 // clear invalid bit 7 of each byte
2243 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2244 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2245 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2246 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2247 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2248 movq mm2, mm0 // mov updated Raws to mm2
2249 psllq mm2, ShiftBpp // shift data to position correctly
2250 movq mm1, mm3 // now use mm1 for getting LBCarrys
2251 pand mm1, mm2 // get LBCarrys for each byte where both
2252 // lsb's were == 1 (Only valid for active group)
2253 psrlq mm2, 1 // divide raw bytes by 2
2254 pand mm2, mm4 // clear invalid bit 7 of each byte
2255 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2256 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2257 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2258
2259 // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry
2260 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2261 movq mm2, mm0 // mov updated Raws to mm2
2262 psllq mm2, ShiftBpp // shift data to position correctly
2263 // Data only needs to be shifted once here to
2264 // get the correct x-bpp offset.
2265 movq mm1, mm3 // now use mm1 for getting LBCarrys
2266 pand mm1, mm2 // get LBCarrys for each byte where both
2267 // lsb's were == 1 (Only valid for active group)
2268 psrlq mm2, 1 // divide raw bytes by 2
2269 pand mm2, mm4 // clear invalid bit 7 of each byte
2270 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2271 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2272 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2273
2274 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2275 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2276 movq mm2, mm0 // mov updated Raws to mm2
2277 psllq mm2, ShiftBpp // shift data to position correctly
2278 // Data only needs to be shifted once here to
2279 // get the correct x-bpp offset.
2280 add ebx, 8
2281 movq mm1, mm3 // now use mm1 for getting LBCarrys
2282 pand mm1, mm2 // get LBCarrys for each byte where both
2283 // lsb's were == 1 (Only valid for active group)
2284 psrlq mm2, 1 // divide raw bytes by 2
2285 pand mm2, mm4 // clear invalid bit 7 of each byte
2286 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2287 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2288 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2289
2290 cmp ebx, MMXLength
2291 // Now ready to write back to memory
2292 movq [edi + ebx - 8], mm0
2293 // Prep Raw(x-bpp) for next loop
2294 movq mm2, mm0 // mov updated Raws to mm2
2295 jb davg2lp
2296 } // end _asm block
2297 }
2298 break;
2299 case 1: // bpp == 1
2300 {
2301 _asm {
2302 // Re-init address pointers and offset
2303 mov ebx, diff // ebx ==> x = offset to alignment boundary
2304 mov edi, row // edi ==> Avg(x)
2305 cmp ebx, FullLength // Test if offset at end of array
2306 jnb davg1end
2307 // Do Paeth decode for remaining bytes
2308 mov esi, prev_row // esi ==> Prior(x)
2309 mov edx, edi
2310 xor ecx, ecx // zero ecx before using cl & cx in loop below
2311 sub edx, bpp // edx ==> Raw(x-bpp)
2312davg1lp:
2313 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2314 xor eax, eax
2315 mov cl, [esi + ebx] // load cl with Prior(x)
2316 mov al, [edx + ebx] // load al with Raw(x-bpp)
2317 add ax, cx
2318 inc ebx
2319 shr ax, 1 // divide by 2
2320 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2321 cmp ebx, FullLength // Check if at end of array
2322 mov [edi+ebx-1], al // Write back Raw(x);
2323 // mov does not affect flags; -1 to offset inc ebx
2324 jb davg1lp
2325davg1end:
2326 } // end _asm block
2327 }
2328 return;
2329
2330 case 8: // bpp == 8
2331 {
2332 _asm {
2333 // Re-init address pointers and offset
2334 mov ebx, diff // ebx ==> x = offset to alignment boundary
2335 movq mm5, LBCarryMask
2336 mov edi, row // edi ==> Avg(x)
2337 movq mm4, HBClearMask
2338 mov esi, prev_row // esi ==> Prior(x)
2339 // PRIME the pump (load the first Raw(x-bpp) data set
2340 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2341 // (NO NEED to correct position in loop below)
2342davg8lp:
2343 movq mm0, [edi + ebx]
2344 movq mm3, mm5
2345 movq mm1, [esi + ebx]
2346 add ebx, 8
2347 pand mm3, mm1 // get lsb for each prev_row byte
2348 psrlq mm1, 1 // divide prev_row bytes by 2
2349 pand mm3, mm2 // get LBCarrys for each byte where both
2350 // lsb's were == 1
2351 psrlq mm2, 1 // divide raw bytes by 2
2352 pand mm1, mm4 // clear invalid bit 7 of each byte
2353 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2354 pand mm2, mm4 // clear invalid bit 7 of each byte
2355 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2356 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2357 cmp ebx, MMXLength
2358 movq [edi + ebx - 8], mm0
2359 movq mm2, mm0 // reuse as Raw(x-bpp)
2360 jb davg8lp
2361 } // end _asm block
2362 }
2363 break;
2364 default: // bpp greater than 8
2365 {
2366 _asm {
2367 movq mm5, LBCarryMask
2368 // Re-init address pointers and offset
2369 mov ebx, diff // ebx ==> x = offset to alignment boundary
2370 mov edi, row // edi ==> Avg(x)
2371 movq mm4, HBClearMask
2372 mov edx, edi
2373 mov esi, prev_row // esi ==> Prior(x)
2374 sub edx, bpp // edx ==> Raw(x-bpp)
2375davgAlp:
2376 movq mm0, [edi + ebx]
2377 movq mm3, mm5
2378 movq mm1, [esi + ebx]
2379 pand mm3, mm1 // get lsb for each prev_row byte
2380 movq mm2, [edx + ebx]
2381 psrlq mm1, 1 // divide prev_row bytes by 2
2382 pand mm3, mm2 // get LBCarrys for each byte where both
2383 // lsb's were == 1
2384 psrlq mm2, 1 // divide raw bytes by 2
2385 pand mm1, mm4 // clear invalid bit 7 of each byte
2386 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2387 pand mm2, mm4 // clear invalid bit 7 of each byte
2388 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2389 add ebx, 8
2390 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2391 cmp ebx, MMXLength
2392 movq [edi + ebx - 8], mm0
2393 jb davgAlp
2394 } // end _asm block
2395 }
2396 break;
2397 } // end switch ( bpp )
2398
2399 _asm {
2400 // MMX acceleration complete now do clean-up
2401 // Check if any remaining bytes left to decode
2402 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2403 mov edi, row // edi ==> Avg(x)
2404 cmp ebx, FullLength // Test if offset at end of array
2405 jnb davgend
2406         // Do Avg decode for remaining bytes
2407 mov esi, prev_row // esi ==> Prior(x)
2408 mov edx, edi
2409 xor ecx, ecx // zero ecx before using cl & cx in loop below
2410 sub edx, bpp // edx ==> Raw(x-bpp)
2411davglp2:
2412 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2413 xor eax, eax
2414 mov cl, [esi + ebx] // load cl with Prior(x)
2415 mov al, [edx + ebx] // load al with Raw(x-bpp)
2416 add ax, cx
2417 inc ebx
2418 shr ax, 1 // divide by 2
2419 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2420 cmp ebx, FullLength // Check if at end of array
2421 mov [edi+ebx-1], al // Write back Raw(x);
2422 // mov does not affect flags; -1 to offset inc ebx
2423 jb davglp2
2424davgend:
2425 emms // End MMX instructions; prep for possible FP instrs.
2426 } // end _asm block
2427}
2428
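/* For reference, a rough scalar C sketch of the Avg decode the routine
   above performs (mirrors its x86 cleanup loops; a sketch only, not part
   of the build):

// png_uint_32 i;
// int bpp = (row_info->pixel_depth + 7) >> 3;
// for (i = 0; i < row_info->rowbytes; i++)
// {
//    int left = (i >= (png_uint_32)bpp) ? row[i - bpp] : 0;
//    row[i] = (png_byte)((row[i] + ((left + prev_row[i]) / 2)) & 0xff);
// }
*/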
2429// Optimized code for PNG Paeth filter decoder
2430void
2431png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2432   png_bytep prev_row)
2433{
2434 png_uint_32 FullLength;
2435 png_uint_32 MMXLength;
2436 //png_uint_32 len;
2437 int bpp;
2438 int diff;
2439 //int ptemp;
2440 int patemp, pbtemp, pctemp;
2441 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2442 FullLength = row_info->rowbytes; // # of bytes to filter
2443 _asm {
2444 xor ebx, ebx // ebx ==> x offset
2445 mov edi, row
2446 xor edx, edx // edx ==> x-bpp offset
2447 mov esi, prev_row
2448 xor eax, eax
2449
2450 // Compute the Raw value for the first bpp bytes
2451      // Note: for x < bpp, a and c are zero, so the predictor reduces to b
2452      // and the decode is simply Raw(x) = Paeth(x) + Prior(x)
2453dpthrlp:
2454 mov al, [edi + ebx]
2455 add al, [esi + ebx]
2456 inc ebx
2457 cmp ebx, bpp
2458 mov [edi + ebx - 1], al
2459 jb dpthrlp
2460 // get # of bytes to alignment
2461 mov diff, edi // take start of row
2462 add diff, ebx // add bpp
2463 xor ecx, ecx
2464 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2465 and diff, 0xfffffff8 // mask to alignment boundary
2466 sub diff, edi // subtract from start ==> value ebx at alignment
2467 jz dpthgo
2468 // fix alignment
2469dpthlp1:
2470 xor eax, eax
2471 // pav = p - a = (a + b - c) - a = b - c
2472 mov al, [esi + ebx] // load Prior(x) into al
2473 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2474 sub eax, ecx // subtract Prior(x-bpp)
2475 mov patemp, eax // Save pav for later use
2476 xor eax, eax
2477 // pbv = p - b = (a + b - c) - b = a - c
2478 mov al, [edi + edx] // load Raw(x-bpp) into al
2479 sub eax, ecx // subtract Prior(x-bpp)
2480 mov ecx, eax
2481 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2482 add eax, patemp // pcv = pav + pbv
2483 // pc = abs(pcv)
2484 test eax, 0x80000000
2485 jz dpthpca
2486 neg eax // reverse sign of neg values
2487dpthpca:
2488 mov pctemp, eax // save pc for later use
2489 // pb = abs(pbv)
2490 test ecx, 0x80000000
2491 jz dpthpba
2492 neg ecx // reverse sign of neg values
2493dpthpba:
2494 mov pbtemp, ecx // save pb for later use
2495 // pa = abs(pav)
2496 mov eax, patemp
2497 test eax, 0x80000000
2498 jz dpthpaa
2499 neg eax // reverse sign of neg values
2500dpthpaa:
2501 mov patemp, eax // save pa for later use
2502 // test if pa <= pb
2503 cmp eax, ecx
2504 jna dpthabb
2505 // pa > pb; now test if pb <= pc
2506 cmp ecx, pctemp
2507 jna dpthbbc
2508 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2509 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2510 jmp dpthpaeth
2511dpthbbc:
2512 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2513 mov cl, [esi + ebx] // load Prior(x) into cl
2514 jmp dpthpaeth
2515dpthabb:
2516 // pa <= pb; now test if pa <= pc
2517 cmp eax, pctemp
2518 jna dpthabc
2519 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2520 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2521 jmp dpthpaeth
2522dpthabc:
2523 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2524 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2525dpthpaeth:
2526 inc ebx
2527 inc edx
2528 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2529 add [edi + ebx - 1], cl
2530 cmp ebx, diff
2531 jb dpthlp1
2532dpthgo:
2533 mov ecx, FullLength
2534 mov eax, ecx
2535 sub eax, ebx // subtract alignment fix
2536 and eax, 0x00000007 // calc bytes over mult of 8
2537 sub ecx, eax // drop over bytes from original length
2538 mov MMXLength, ecx
2539 } // end _asm block
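   // The block above computes, in effect (a C restatement for reference
   // only; diff and MMXLength are the real variables, the rest is sketch):
   //   diff      = (int)(((row + bpp) + 0xf) & 0xfffffff8) - (int)row;
   //   MMXLength = FullLength - ((FullLength - diff) & 7);
   // e.g. with row ending in ...8 and bpp == 3, diff comes out 16: the
   // scalar loop above fixes up bytes bpp..diff-1, and the MMX loops below
   // run from diff to MMXLength, a multiple of 8 bytes past diff.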
2540 // Now do the math for the rest of the row
2541 switch ( bpp )
2542 {
2543 case 3:
2544 {
2545 ActiveMask.use = 0x0000000000ffffff;
2546 ActiveMaskEnd.use = 0xffff000000000000;
2547 ShiftBpp.use = 24; // == bpp(3) * 8
2548 ShiftRem.use = 40; // == 64 - 24
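      // Note on the abs() idiom used throughout the MMX blocks below: for
      // each 16-bit lane v, pcmpgtw builds a mask m = (v < 0) ? 0xffff : 0,
      // pand keeps t = v & m (v when negative, else 0), and the two psubw's
      // compute v - t - t, i.e. v - 2v = -v for negative lanes and v
      // unchanged otherwise -- a branchless abs(v).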
2549 _asm {
2550 mov ebx, diff
2551 mov edi, row
2552 mov esi, prev_row
2553 pxor mm0, mm0
2554         // PRIME the pump (load the first Raw(x-bpp) data set)
2555 movq mm1, [edi+ebx-8]
2556dpth3lp:
2557 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2558 movq mm2, [esi + ebx] // load b=Prior(x)
2559         punpcklbw mm1, mm0       // Unpack Low bytes of a
2560         movq mm3, [esi+ebx-8]    // Prep c=Prior(x-bpp) bytes
2561         punpcklbw mm2, mm0       // Unpack Low bytes of b
2562         psrlq mm3, ShiftRem      // shift last 3 bytes to 1st 3 bytes
2563         // pav = p - a = (a + b - c) - a = b - c
2564         movq mm4, mm2
2565         punpcklbw mm3, mm0       // Unpack Low bytes of c
2566 // pbv = p - b = (a + b - c) - b = a - c
2567 movq mm5, mm1
2568 psubw mm4, mm3
2569 pxor mm7, mm7
2570 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2571 movq mm6, mm4
2572 psubw mm5, mm3
2573
2574 // pa = abs(p-a) = abs(pav)
2575 // pb = abs(p-b) = abs(pbv)
2576 // pc = abs(p-c) = abs(pcv)
2577 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2578 paddw mm6, mm5
2579         pand mm0, mm4       // Only pav bytes < 0 in mm0
2580         pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2581         psubw mm4, mm0
2582         pand mm7, mm5       // Only pbv bytes < 0 in mm7
2583         psubw mm4, mm0
2584         psubw mm5, mm7
2585         pxor mm0, mm0
2586         pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2587         pand mm0, mm6       // Only pcv bytes < 0 in mm0
2588 psubw mm5, mm7
2589 psubw mm6, mm0
2590 // test pa <= pb
2591 movq mm7, mm4
2592 psubw mm6, mm0
2593 pcmpgtw mm7, mm5 // pa > pb?
2594 movq mm0, mm7
2595 // use mm7 mask to merge pa & pb
2596 pand mm5, mm7
2597 // use mm0 mask copy to merge a & b
2598 pand mm2, mm0
2599 pandn mm7, mm4
2600 pandn mm0, mm1
2601 paddw mm7, mm5
2602 paddw mm0, mm2
2603 // test ((pa <= pb)? pa:pb) <= pc
2604 pcmpgtw mm7, mm6 // pab > pc?
2605 pxor mm1, mm1
2606 pand mm3, mm7
2607 pandn mm7, mm0
2608 paddw mm7, mm3
2609 pxor mm0, mm0
2610 packuswb mm7, mm1
2611 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2612 pand mm7, ActiveMask
2613 movq mm2, mm3 // load b=Prior(x) step 1
2614 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2615         punpcklbw mm3, mm0       // Unpack Low bytes of c
2616         movq [edi + ebx], mm7    // write back updated value
2617         movq mm1, mm7            // Now mm1 will be used as Raw(x-bpp)
2618         // Now do Paeth for 2nd set of bytes (3-5)
2619         psrlq mm2, ShiftBpp      // load b=Prior(x) step 2
2620         punpcklbw mm1, mm0       // Unpack Low bytes of a
2621         pxor mm7, mm7
2622         punpcklbw mm2, mm0       // Unpack Low bytes of b
2623 // pbv = p - b = (a + b - c) - b = a - c
2624 movq mm5, mm1
2625 // pav = p - a = (a + b - c) - a = b - c
2626 movq mm4, mm2
2627 psubw mm5, mm3
2628 psubw mm4, mm3
2629 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2630 // pav + pbv = pbv + pav
2631 movq mm6, mm5
2632 paddw mm6, mm4
2633
2634 // pa = abs(p-a) = abs(pav)
2635 // pb = abs(p-b) = abs(pbv)
2636 // pc = abs(p-c) = abs(pcv)
2637 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2638 pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2639 pand mm0, mm5 // Only pbv bytes < 0 in mm0
2640 pand mm7, mm4 // Only pav bytes < 0 in mm7
2641 psubw mm5, mm0
2642 psubw mm4, mm7
2643 psubw mm5, mm0
2644 psubw mm4, mm7
2645 pxor mm0, mm0
2646 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2647         pand mm0, mm6       // Only pcv bytes < 0 in mm0
2648 psubw mm6, mm0
2649 // test pa <= pb
2650 movq mm7, mm4
2651 psubw mm6, mm0
2652 pcmpgtw mm7, mm5 // pa > pb?
2653 movq mm0, mm7
2654 // use mm7 mask to merge pa & pb
2655 pand mm5, mm7
2656 // use mm0 mask copy to merge a & b
2657 pand mm2, mm0
2658 pandn mm7, mm4
2659 pandn mm0, mm1
2660 paddw mm7, mm5
2661 paddw mm0, mm2
2662 // test ((pa <= pb)? pa:pb) <= pc
2663 pcmpgtw mm7, mm6 // pab > pc?
2664 movq mm2, [esi + ebx] // load b=Prior(x)
2665 pand mm3, mm7
2666 pandn mm7, mm0
2667 pxor mm1, mm1
2668 paddw mm7, mm3
2669 pxor mm0, mm0
2670 packuswb mm7, mm1
2671 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2672 pand mm7, ActiveMask
2673 punpckhbw mm2, mm0 // Unpack High bytes of b
2674 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2675 // pav = p - a = (a + b - c) - a = b - c
2676 movq mm4, mm2
2677 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2678 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2679 movq [edi + ebx], mm7 // write back updated value
2680 movq mm1, mm7
2681 punpckhbw mm3, mm0 // Unpack High bytes of c
2682 psllq mm1, ShiftBpp // Shift bytes
2683 // Now mm1 will be used as Raw(x-bpp)
2684 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2685 pxor mm7, mm7
2686 punpckhbw mm1, mm0 // Unpack High bytes of a
2687 psubw mm4, mm3
2688 // pbv = p - b = (a + b - c) - b = a - c
2689 movq mm5, mm1
2690 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2691 movq mm6, mm4
2692 psubw mm5, mm3
2693 pxor mm0, mm0
2694 paddw mm6, mm5
2695
2696 // pa = abs(p-a) = abs(pav)
2697 // pb = abs(p-b) = abs(pbv)
2698 // pc = abs(p-c) = abs(pcv)
2699 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2700 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2701         pand mm0, mm4       // Only pav bytes < 0 in mm0
2702         pand mm7, mm5       // Only pbv bytes < 0 in mm7
2703         psubw mm4, mm0
2704         psubw mm5, mm7
2705         psubw mm4, mm0
2706         psubw mm5, mm7
2707         pxor mm0, mm0
2708         pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2709         pand mm0, mm6       // Only pcv bytes < 0 in mm0
2710 psubw mm6, mm0
2711 // test pa <= pb
2712 movq mm7, mm4
2713 psubw mm6, mm0
2714 pcmpgtw mm7, mm5 // pa > pb?
2715 movq mm0, mm7
2716 // use mm0 mask copy to merge a & b
2717 pand mm2, mm0
2718 // use mm7 mask to merge pa & pb
2719 pand mm5, mm7
2720 pandn mm0, mm1
2721 pandn mm7, mm4
2722 paddw mm0, mm2
2723 paddw mm7, mm5
2724 // test ((pa <= pb)? pa:pb) <= pc
2725 pcmpgtw mm7, mm6 // pab > pc?
2726 pand mm3, mm7
2727 pandn mm7, mm0
2728 paddw mm7, mm3
2729 pxor mm1, mm1
2730 packuswb mm1, mm7
2731 // Step ebx to next set of 8 bytes and repeat loop til done
2732 add ebx, 8
2733 pand mm1, ActiveMaskEnd
2734 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2735
2736 cmp ebx, MMXLength
2737 pxor mm0, mm0 // pxor does not affect flags
2738 movq [edi + ebx - 8], mm1 // write back updated value
2739 // mm1 will be used as Raw(x-bpp) next loop
2740 // mm3 ready to be used as Prior(x-bpp) next loop
2741 jb dpth3lp
2742 } // end _asm block
2743 }
2744 break;
2745 case 6:
2746 case 7:
2747 case 5:
2748 {
2749 ActiveMask.use = 0x00000000ffffffff;
2750 ActiveMask2.use = 0xffffffff00000000;
2751 ShiftBpp.use = bpp << 3; // == bpp * 8
2752 ShiftRem.use = 64 - ShiftBpp.use;
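      // e.g. bpp == 6 gives ShiftBpp = 48 and ShiftRem = 16: psrlq by 16
      // moves the last 6 bytes of the previous qword into the low 6 byte
      // positions, where they line up as Raw(x-bpp) / Prior(x-bpp) for the
      // first 6 bytes of the current qword (bpp == 5 and 7 work the same
      // way with 40/24 and 56/8).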
2753 _asm {
2754 mov ebx, diff
2755         mov edi, row
2756 mov esi, prev_row
2757         // PRIME the pump (load the first Raw(x-bpp) data set)
2758 movq mm1, [edi+ebx-8]
2759 pxor mm0, mm0
2760dpth6lp:
2761 // Must shift to position Raw(x-bpp) data
2762 psrlq mm1, ShiftRem
2763 // Do first set of 4 bytes
2764 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2765 punpcklbw mm1, mm0 // Unpack Low bytes of a
2766 movq mm2, [esi + ebx] // load b=Prior(x)
2767 punpcklbw mm2, mm0 // Unpack Low bytes of b
2768 // Must shift to position Prior(x-bpp) data
2769 psrlq mm3, ShiftRem
2770 // pav = p - a = (a + b - c) - a = b - c
2771 movq mm4, mm2
2772 punpcklbw mm3, mm0 // Unpack Low bytes of c
2773 // pbv = p - b = (a + b - c) - b = a - c
2774 movq mm5, mm1
2775 psubw mm4, mm3
2776 pxor mm7, mm7
2777 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2778 movq mm6, mm4
2779 psubw mm5, mm3
2780 // pa = abs(p-a) = abs(pav)
2781 // pb = abs(p-b) = abs(pbv)
2782 // pc = abs(p-c) = abs(pcv)
2783 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2784 paddw mm6, mm5
2785         pand mm0, mm4       // Only pav bytes < 0 in mm0
2786         pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2787         psubw mm4, mm0
2788         pand mm7, mm5       // Only pbv bytes < 0 in mm7
2789         psubw mm4, mm0
2790         psubw mm5, mm7
2791         pxor mm0, mm0
2792         pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2793         pand mm0, mm6       // Only pcv bytes < 0 in mm0
2794 psubw mm5, mm7
2795 psubw mm6, mm0
2796 // test pa <= pb
2797 movq mm7, mm4
2798 psubw mm6, mm0
2799 pcmpgtw mm7, mm5 // pa > pb?
2800 movq mm0, mm7
2801 // use mm7 mask to merge pa & pb
2802 pand mm5, mm7
2803 // use mm0 mask copy to merge a & b
2804 pand mm2, mm0
2805 pandn mm7, mm4
2806 pandn mm0, mm1
2807 paddw mm7, mm5
2808 paddw mm0, mm2
2809 // test ((pa <= pb)? pa:pb) <= pc
2810 pcmpgtw mm7, mm6 // pab > pc?
2811 pxor mm1, mm1
2812 pand mm3, mm7
2813 pandn mm7, mm0
2814 paddw mm7, mm3
2815 pxor mm0, mm0
2816 packuswb mm7, mm1
2817 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2818 pand mm7, ActiveMask
2819 psrlq mm3, ShiftRem
2820 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2821 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2822 movq mm6, mm2
2823 movq [edi + ebx], mm7 // write back updated value
2824 movq mm1, [edi+ebx-8]
2825 psllq mm6, ShiftBpp
2826 movq mm5, mm7
2827 psrlq mm1, ShiftRem
2828 por mm3, mm6
2829 psllq mm5, ShiftBpp
2830 punpckhbw mm3, mm0 // Unpack High bytes of c
2831 por mm1, mm5
2832 // Do second set of 4 bytes
2833 punpckhbw mm2, mm0 // Unpack High bytes of b
2834 punpckhbw mm1, mm0 // Unpack High bytes of a
2835 // pav = p - a = (a + b - c) - a = b - c
2836 movq mm4, mm2
2837 // pbv = p - b = (a + b - c) - b = a - c
2838 movq mm5, mm1
2839 psubw mm4, mm3
2840 pxor mm7, mm7
2841 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2842 movq mm6, mm4
2843 psubw mm5, mm3
2844 // pa = abs(p-a) = abs(pav)
2845 // pb = abs(p-b) = abs(pbv)
2846 // pc = abs(p-c) = abs(pcv)
2847 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2848 paddw mm6, mm5
2849         pand mm0, mm4       // Only pav bytes < 0 in mm0
2850         pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2851         psubw mm4, mm0
2852         pand mm7, mm5       // Only pbv bytes < 0 in mm7
2853         psubw mm4, mm0
2854         psubw mm5, mm7
2855         pxor mm0, mm0
2856         pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2857         pand mm0, mm6       // Only pcv bytes < 0 in mm0
2858 psubw mm5, mm7
2859 psubw mm6, mm0
2860 // test pa <= pb
2861 movq mm7, mm4
2862 psubw mm6, mm0
2863 pcmpgtw mm7, mm5 // pa > pb?
2864 movq mm0, mm7
2865 // use mm7 mask to merge pa & pb
2866 pand mm5, mm7
2867 // use mm0 mask copy to merge a & b
2868 pand mm2, mm0
2869 pandn mm7, mm4
2870 pandn mm0, mm1
2871 paddw mm7, mm5
2872 paddw mm0, mm2
2873 // test ((pa <= pb)? pa:pb) <= pc
2874 pcmpgtw mm7, mm6 // pab > pc?
2875 pxor mm1, mm1
2876 pand mm3, mm7
2877 pandn mm7, mm0
2878 pxor mm1, mm1
2879 paddw mm7, mm3
2880 pxor mm0, mm0
2881         // Step ebx to next set of 8 bytes and repeat loop til done
2882 add ebx, 8
2883 packuswb mm1, mm7
2884 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2885 cmp ebx, MMXLength
2886 movq [edi + ebx - 8], mm1 // write back updated value
2887 // mm1 will be used as Raw(x-bpp) next loop
2888 jb dpth6lp
2889 } // end _asm block
2890 }
2891 break;
2892 case 4:
2893 {
2894 ActiveMask.use = 0x00000000ffffffff;
2895 _asm {
2896 mov ebx, diff
2897         mov edi, row
2898 mov esi, prev_row
2899 pxor mm0, mm0
2900         // PRIME the pump (load the first Raw(x-bpp) data set)
2901         movq mm1, [edi+ebx-8]    // the only time we should need to read
                                      // a=Raw(x-bpp) bytes from memory
2902dpth4lp:
2903 // Do first set of 4 bytes
2904 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2905         punpckhbw mm1, mm0       // Unpack High bytes of a
2906         movq mm2, [esi + ebx]    // load b=Prior(x)
2907         punpcklbw mm2, mm0       // Unpack Low bytes of b
2908 // pav = p - a = (a + b - c) - a = b - c
2909 movq mm4, mm2
2910 punpckhbw mm3, mm0 // Unpack High bytes of c
2911 // pbv = p - b = (a + b - c) - b = a - c
2912 movq mm5, mm1
2913 psubw mm4, mm3
2914 pxor mm7, mm7
2915 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2916 movq mm6, mm4
2917 psubw mm5, mm3
2918 // pa = abs(p-a) = abs(pav)
2919 // pb = abs(p-b) = abs(pbv)
2920 // pc = abs(p-c) = abs(pcv)
2921 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2922 paddw mm6, mm5
2923         pand mm0, mm4       // Only pav bytes < 0 in mm0
2924         pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2925         psubw mm4, mm0
2926         pand mm7, mm5       // Only pbv bytes < 0 in mm7
2927         psubw mm4, mm0
2928         psubw mm5, mm7
2929         pxor mm0, mm0
2930         pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2931         pand mm0, mm6       // Only pcv bytes < 0 in mm0
2932 psubw mm5, mm7
2933 psubw mm6, mm0
2934 // test pa <= pb
2935 movq mm7, mm4
2936 psubw mm6, mm0
2937 pcmpgtw mm7, mm5 // pa > pb?
2938 movq mm0, mm7
2939 // use mm7 mask to merge pa & pb
2940 pand mm5, mm7
2941 // use mm0 mask copy to merge a & b
2942 pand mm2, mm0
2943 pandn mm7, mm4
2944 pandn mm0, mm1
2945 paddw mm7, mm5
2946 paddw mm0, mm2
2947 // test ((pa <= pb)? pa:pb) <= pc
2948 pcmpgtw mm7, mm6 // pab > pc?
2949 pxor mm1, mm1
2950 pand mm3, mm7
2951 pandn mm7, mm0
2952 paddw mm7, mm3
2953 pxor mm0, mm0
2954 packuswb mm7, mm1
2955 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2956 pand mm7, ActiveMask
2957 movq mm2, mm3 // load b=Prior(x) step 1
2958 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2959         punpcklbw mm3, mm0       // Unpack Low bytes of c
2960         movq [edi + ebx], mm7    // write back updated value
2961         movq mm1, mm7            // Now mm1 will be used as Raw(x-bpp)
2962         // Do second set of 4 bytes
2963         punpckhbw mm2, mm0       // Unpack High bytes of b
2964 punpcklbw mm1, mm0 // Unpack Low bytes of a
2965 // pav = p - a = (a + b - c) - a = b - c
2966 movq mm4, mm2
2967 // pbv = p - b = (a + b - c) - b = a - c
2968 movq mm5, mm1
2969 psubw mm4, mm3
2970 pxor mm7, mm7
2971 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2972 movq mm6, mm4
2973 psubw mm5, mm3
2974 // pa = abs(p-a) = abs(pav)
2975 // pb = abs(p-b) = abs(pbv)
2976 // pc = abs(p-c) = abs(pcv)
2977 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2978 paddw mm6, mm5
2979         pand mm0, mm4       // Only pav bytes < 0 in mm0
2980         pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2981         psubw mm4, mm0
2982         pand mm7, mm5       // Only pbv bytes < 0 in mm7
2983         psubw mm4, mm0
2984         psubw mm5, mm7
2985         pxor mm0, mm0
2986         pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2987         pand mm0, mm6       // Only pcv bytes < 0 in mm0
2988 psubw mm5, mm7
2989 psubw mm6, mm0
2990 // test pa <= pb
2991 movq mm7, mm4
2992 psubw mm6, mm0
2993 pcmpgtw mm7, mm5 // pa > pb?
2994 movq mm0, mm7
2995 // use mm7 mask to merge pa & pb
2996 pand mm5, mm7
2997 // use mm0 mask copy to merge a & b
2998 pand mm2, mm0
2999 pandn mm7, mm4
3000 pandn mm0, mm1
3001 paddw mm7, mm5
3002 paddw mm0, mm2
3003 // test ((pa <= pb)? pa:pb) <= pc
3004 pcmpgtw mm7, mm6 // pab > pc?
3005 pxor mm1, mm1
3006 pand mm3, mm7
3007 pandn mm7, mm0
3008 pxor mm1, mm1
3009 paddw mm7, mm3
3010 pxor mm0, mm0
3011         // Step ebx to next set of 8 bytes and repeat loop til done
3012 add ebx, 8
3013 packuswb mm1, mm7
3014 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3015 cmp ebx, MMXLength
3016 movq [edi + ebx - 8], mm1 // write back updated value
3017 // mm1 will be used as Raw(x-bpp) next loop
3018 jb dpth4lp
3019 } // end _asm block
3020 }
3021 break;
3022 case 8: // bpp == 8
3023 {
3024 ActiveMask.use = 0x00000000ffffffff;
3025 _asm {
3026 mov ebx, diff
3027         mov edi, row
3028 mov esi, prev_row
3029 pxor mm0, mm0
3030         // PRIME the pump (load the first Raw(x-bpp) data set)
3031         movq mm1, [edi+ebx-8]    // the only time we should need to read
                                      // a=Raw(x-bpp) bytes from memory
3032dpth8lp:
3033 // Do first set of 4 bytes
3034 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
3035 punpcklbw mm1, mm0 // Unpack Low bytes of a
3036 movq mm2, [esi + ebx] // load b=Prior(x)
3037 punpcklbw mm2, mm0 // Unpack Low bytes of b
3038 // pav = p - a = (a + b - c) - a = b - c
3039 movq mm4, mm2
3040 punpcklbw mm3, mm0 // Unpack Low bytes of c
3041 // pbv = p - b = (a + b - c) - b = a - c
3042 movq mm5, mm1
3043 psubw mm4, mm3
3044 pxor mm7, mm7
3045 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3046 movq mm6, mm4
3047 psubw mm5, mm3
3048 // pa = abs(p-a) = abs(pav)
3049 // pb = abs(p-b) = abs(pbv)
3050 // pc = abs(p-c) = abs(pcv)
3051 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3052 paddw mm6, mm5
3053         pand mm0, mm4       // Only pav bytes < 0 in mm0
3054         pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
3055         psubw mm4, mm0
3056         pand mm7, mm5       // Only pbv bytes < 0 in mm7
3057         psubw mm4, mm0
3058         psubw mm5, mm7
3059         pxor mm0, mm0
3060         pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
3061         pand mm0, mm6       // Only pcv bytes < 0 in mm0
3062 psubw mm5, mm7
3063 psubw mm6, mm0
3064 // test pa <= pb
3065 movq mm7, mm4
3066 psubw mm6, mm0
3067 pcmpgtw mm7, mm5 // pa > pb?
3068 movq mm0, mm7
3069 // use mm7 mask to merge pa & pb
3070 pand mm5, mm7
3071 // use mm0 mask copy to merge a & b
3072 pand mm2, mm0
3073 pandn mm7, mm4
3074 pandn mm0, mm1
3075 paddw mm7, mm5
3076 paddw mm0, mm2
3077 // test ((pa <= pb)? pa:pb) <= pc
3078 pcmpgtw mm7, mm6 // pab > pc?
3079 pxor mm1, mm1
3080 pand mm3, mm7
3081 pandn mm7, mm0
3082 paddw mm7, mm3
3083 pxor mm0, mm0
3084 packuswb mm7, mm1
3085 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
3086 pand mm7, ActiveMask
3087 movq mm2, [esi + ebx] // load b=Prior(x)
3088 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
3089 punpckhbw mm3, mm0 // Unpack High bytes of c
3090 movq [edi + ebx], mm7 // write back updated value
3091 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
3092
3093 // Do second set of 4 bytes
3094 punpckhbw mm2, mm0 // Unpack High bytes of b
3095 punpckhbw mm1, mm0 // Unpack High bytes of a
3096 // pav = p - a = (a + b - c) - a = b - c
3097 movq mm4, mm2
3098 // pbv = p - b = (a + b - c) - b = a - c
3099 movq mm5, mm1
3100 psubw mm4, mm3
3101 pxor mm7, mm7
3102 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3103 movq mm6, mm4
3104 psubw mm5, mm3
3105 // pa = abs(p-a) = abs(pav)
3106 // pb = abs(p-b) = abs(pbv)
3107 // pc = abs(p-c) = abs(pcv)
3108 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3109 paddw mm6, mm5
3110         pand mm0, mm4       // Only pav bytes < 0 in mm0
3111         pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
3112         psubw mm4, mm0
3113         pand mm7, mm5       // Only pbv bytes < 0 in mm7
3114         psubw mm4, mm0
3115         psubw mm5, mm7
3116         pxor mm0, mm0
3117         pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
3118         pand mm0, mm6       // Only pcv bytes < 0 in mm0
3119 psubw mm5, mm7
3120 psubw mm6, mm0
3121 // test pa <= pb
3122 movq mm7, mm4
3123 psubw mm6, mm0
3124 pcmpgtw mm7, mm5 // pa > pb?
3125 movq mm0, mm7
3126 // use mm7 mask to merge pa & pb
3127 pand mm5, mm7
3128 // use mm0 mask copy to merge a & b
3129 pand mm2, mm0
3130 pandn mm7, mm4
3131 pandn mm0, mm1
3132 paddw mm7, mm5
3133 paddw mm0, mm2
3134 // test ((pa <= pb)? pa:pb) <= pc
3135 pcmpgtw mm7, mm6 // pab > pc?
3136 pxor mm1, mm1
3137 pand mm3, mm7
3138 pandn mm7, mm0
3139 pxor mm1, mm1
3140 paddw mm7, mm3
3141 pxor mm0, mm0
3142         // Step ebx to next set of 8 bytes and repeat loop til done
3143 add ebx, 8
3144 packuswb mm1, mm7
3145 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3146 cmp ebx, MMXLength
3147 movq [edi + ebx - 8], mm1 // write back updated value
3148 // mm1 will be used as Raw(x-bpp) next loop
3149 jb dpth8lp
3150 } // end _asm block
3151 }
3152 break;
3153 case 1: // bpp = 1
3154 case 2: // bpp = 2
3155 default: // bpp > 8
3156 {
3157 _asm {
3158 mov ebx, diff
3159 cmp ebx, FullLength
3160 jnb dpthdend
3161         mov edi, row
3162 mov esi, prev_row
3163 // Do Paeth decode for remaining bytes
3164 mov edx, ebx
3165 xor ecx, ecx // zero ecx before using cl & cx in loop below
3166 sub edx, bpp // Set edx = ebx - bpp
3167dpthdlp:
3168 xor eax, eax
3169 // pav = p - a = (a + b - c) - a = b - c
3170 mov al, [esi + ebx] // load Prior(x) into al
3171 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3172 sub eax, ecx // subtract Prior(x-bpp)
3173 mov patemp, eax // Save pav for later use
3174 xor eax, eax
3175 // pbv = p - b = (a + b - c) - b = a - c
3176 mov al, [edi + edx] // load Raw(x-bpp) into al
3177 sub eax, ecx // subtract Prior(x-bpp)
3178 mov ecx, eax
3179 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3180 add eax, patemp // pcv = pav + pbv
3181 // pc = abs(pcv)
3182 test eax, 0x80000000
3183 jz dpthdpca
3184 neg eax // reverse sign of neg values
3185dpthdpca:
3186 mov pctemp, eax // save pc for later use
3187 // pb = abs(pbv)
3188 test ecx, 0x80000000
3189 jz dpthdpba
3190 neg ecx // reverse sign of neg values
3191dpthdpba:
3192 mov pbtemp, ecx // save pb for later use
3193 // pa = abs(pav)
3194 mov eax, patemp
3195 test eax, 0x80000000
3196 jz dpthdpaa
3197 neg eax // reverse sign of neg values
3198dpthdpaa:
3199 mov patemp, eax // save pa for later use
3200 // test if pa <= pb
3201 cmp eax, ecx
3202 jna dpthdabb
3203 // pa > pb; now test if pb <= pc
3204 cmp ecx, pctemp
3205 jna dpthdbbc
3206 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3207 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3208 jmp dpthdpaeth
3209dpthdbbc:
3210 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3211 mov cl, [esi + ebx] // load Prior(x) into cl
3212 jmp dpthdpaeth
3213dpthdabb:
3214 // pa <= pb; now test if pa <= pc
3215 cmp eax, pctemp
3216 jna dpthdabc
3217 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3218 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3219 jmp dpthdpaeth
3220dpthdabc:
3221 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3222 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3223dpthdpaeth:
3224 inc ebx
3225 inc edx
3226 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3227 add [edi + ebx - 1], cl
3228 cmp ebx, FullLength
3229 jb dpthdlp
3230dpthdend:
3231 } // end _asm block
3232 }
3233 return; // No need to go further with this one
3234 } // end switch ( bpp )
3235 _asm {
3236 // MMX acceleration complete now do clean-up
3237 // Check if any remaining bytes left to decode
3238 mov ebx, MMXLength
3239 cmp ebx, FullLength
3240 jnb dpthend
3241 mov edi, row
3242 mov esi, prev_row
3243 // Do Paeth decode for remaining bytes
3244 mov edx, ebx
3245 xor ecx, ecx // zero ecx before using cl & cx in loop below
3246 sub edx, bpp // Set edx = ebx - bpp
3247dpthlp2:
3248 xor eax, eax
3249 // pav = p - a = (a + b - c) - a = b - c
3250 mov al, [esi + ebx] // load Prior(x) into al
3251 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3252 sub eax, ecx // subtract Prior(x-bpp)
3253 mov patemp, eax // Save pav for later use
3254 xor eax, eax
3255 // pbv = p - b = (a + b - c) - b = a - c
3256 mov al, [edi + edx] // load Raw(x-bpp) into al
3257 sub eax, ecx // subtract Prior(x-bpp)
3258 mov ecx, eax
3259 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3260 add eax, patemp // pcv = pav + pbv
3261 // pc = abs(pcv)
3262 test eax, 0x80000000
3263 jz dpthpca2
3264 neg eax // reverse sign of neg values
3265dpthpca2:
3266 mov pctemp, eax // save pc for later use
3267 // pb = abs(pbv)
3268 test ecx, 0x80000000
3269 jz dpthpba2
3270 neg ecx // reverse sign of neg values
3271dpthpba2:
3272 mov pbtemp, ecx // save pb for later use
3273 // pa = abs(pav)
3274 mov eax, patemp
3275 test eax, 0x80000000
3276 jz dpthpaa2
3277 neg eax // reverse sign of neg values
3278dpthpaa2:
3279 mov patemp, eax // save pa for later use
3280 // test if pa <= pb
3281 cmp eax, ecx
3282 jna dpthabb2
3283 // pa > pb; now test if pb <= pc
3284 cmp ecx, pctemp
3285 jna dpthbbc2
3286 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3287 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3288 jmp dpthpaeth2
3289dpthbbc2:
3290 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3291 mov cl, [esi + ebx] // load Prior(x) into cl
3292 jmp dpthpaeth2
3293dpthabb2:
3294 // pa <= pb; now test if pa <= pc
3295 cmp eax, pctemp
3296 jna dpthabc2
3297 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3298 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3299 jmp dpthpaeth2
3300dpthabc2:
3301 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3302 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3303dpthpaeth2:
3304 inc ebx
3305 inc edx
3306 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3307 add [edi + ebx - 1], cl
3308 cmp ebx, FullLength
3309 jb dpthlp2
3310dpthend:
3311 emms // End MMX instructions; prep for possible FP instrs.
3312 } // end _asm block
3313}
3314
3315// Optimized code for PNG Sub filter decoder
3316void
3317png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3318{
3319 //int test;
3320 int bpp;
3321 png_uint_32 FullLength;
3322 png_uint_32 MMXLength;
3323 int diff;
3324 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3325 FullLength = row_info->rowbytes - bpp; // # of bytes to filter
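   // The first bpp bytes have no Raw(x-bpp), so for them Raw(x) == Sub(x)
   // and they are already correct as read; filtering starts at row + bpp,
   // which is why bpp is subtracted from rowbytes here.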
3326 _asm {
3327 mov edi, row
3328 mov esi, edi // lp = row
3329 add edi, bpp // rp = row + bpp
3330 xor eax, eax
3331 // get # of bytes to alignment
3332 mov diff, edi // take start of row
3333 add diff, 0xf // add 7 + 8 to incr past
3334 // alignment boundary
3335 xor ebx, ebx
3336 and diff, 0xfffffff8 // mask to alignment boundary
3337 sub diff, edi // subtract from start ==> value
3338 // ebx at alignment
3339 jz dsubgo
3340 // fix alignment
3341dsublp1:
3342 mov al, [esi+ebx]
3343 add [edi+ebx], al
3344 inc ebx
3345 cmp ebx, diff
3346 jb dsublp1
3347dsubgo:
3348 mov ecx, FullLength
3349 mov edx, ecx
3350 sub edx, ebx // subtract alignment fix
3351 and edx, 0x00000007 // calc bytes over mult of 8
3352 sub ecx, edx // drop over bytes from length
3353 mov MMXLength, ecx
3354 } // end _asm block
3355 // Now do the math for the rest of the row
3356 switch ( bpp )
3357 {
3358 case 3:
3359 {
3360 ActiveMask.use = 0x0000ffffff000000;
3361 ShiftBpp.use = 24; // == 3 * 8
3362 ShiftRem.use = 40; // == 64 - 24
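      // With bpp == 3 a qword splits into three "active groups": bytes 0-2
      // (positioned by the psrlq/ShiftRem below, no mask needed), bytes 3-5
      // (ActiveMask, 0x0000ffffff000000), and bytes 6-7 (ActiveMask shifted
      // left by ShiftBpp into mm6, i.e. 0xffff000000000000).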
3363 _asm {
3364 mov edi, row
3365 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3366 mov esi, edi // lp = row
3367 add edi, bpp // rp = row + bpp
3368 movq mm6, mm7
3369 mov ebx, diff
3370 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3371 // byte group
3372         // PRIME the pump (load the first Raw(x-bpp) data set)
3373 movq mm1, [edi+ebx-8]
3374dsub3lp:
3375 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3376 // no need for mask; shift clears inactive bytes
3377 // Add 1st active group
3378 movq mm0, [edi+ebx]
3379 paddb mm0, mm1
3380 // Add 2nd active group
3381 movq mm1, mm0 // mov updated Raws to mm1
3382 psllq mm1, ShiftBpp // shift data to position correctly
3383 pand mm1, mm7 // mask to use only 2nd active group
3384 paddb mm0, mm1
3385 // Add 3rd active group
3386 movq mm1, mm0 // mov updated Raws to mm1
3387 psllq mm1, ShiftBpp // shift data to position correctly
3388 pand mm1, mm6 // mask to use only 3rd active group
3389 add ebx, 8
3390 paddb mm0, mm1
3391 cmp ebx, MMXLength
3392 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3393 // Prep for doing 1st add at top of loop
3394 movq mm1, mm0
3395 jb dsub3lp
3396 } // end _asm block
3397 }
3398 break;
3399 case 1:
3400 {
3401 /* Placed here just in case this is a duplicate of the
3402 non-MMX code for the SUB filter in png_read_filter_row
3403 above
3404 */
3405// png_bytep rp;
3406// png_bytep lp;
3407// png_uint_32 i;
3408// bpp = (row_info->pixel_depth + 7) >> 3;
3409// for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3410// i < row_info->rowbytes; i++, rp++, lp++)
3411// {
3412// *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3413// }
3414 _asm {
3415 mov ebx, diff
3416 mov edi, row
3417 cmp ebx, FullLength
3418 jnb dsub1end
3419 mov esi, edi // lp = row
3420 xor eax, eax
3421 add edi, bpp // rp = row + bpp
3422dsub1lp:
3423 mov al, [esi+ebx]
3424 add [edi+ebx], al
3425 inc ebx
3426 cmp ebx, FullLength
3427 jb dsub1lp
3428dsub1end:
3429 } // end _asm block
3430 }
3431 return;
3432 case 6:
3433 case 7:
3434 case 4:
3435 case 5:
3436 {
3437 ShiftBpp.use = bpp << 3;
3438 ShiftRem.use = 64 - ShiftBpp.use;
3439 _asm {
3440 mov edi, row
3441 mov ebx, diff
3442 mov esi, edi // lp = row
3443 add edi, bpp // rp = row + bpp
3444         // PRIME the pump (load the first Raw(x-bpp) data set)
3445 movq mm1, [edi+ebx-8]
3446dsub4lp:
3447 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3448 // no need for mask; shift clears inactive bytes
3449 movq mm0, [edi+ebx]
3450 paddb mm0, mm1
3451 // Add 2nd active group
3452 movq mm1, mm0 // mov updated Raws to mm1
3453 psllq mm1, ShiftBpp // shift data to position correctly
3454 // there is no need for any mask
3455 // since shift clears inactive bits/bytes
3456 add ebx, 8
3457 paddb mm0, mm1
3458 cmp ebx, MMXLength
3459 movq [edi+ebx-8], mm0
3460 movq mm1, mm0 // Prep for doing 1st add at top of loop
3461 jb dsub4lp
3462 } // end _asm block
3463 }
3464 break;
3465 case 2:
3466 {
3467 ActiveMask.use = 0x00000000ffff0000;
3468 ShiftBpp.use = 16; // == 2 * 8
3469 ShiftRem.use = 48; // == 64 - 16
3470 _asm {
3471 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3472 mov ebx, diff
3473 movq mm6, mm7
3474 mov edi, row
3475 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active byte group
3476 mov esi, edi // lp = row
3477 movq mm5, mm6
3478 add edi, bpp // rp = row + bpp
3479 psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active byte group
3480         // PRIME the pump (load the first Raw(x-bpp) data set)
3481 movq mm1, [edi+ebx-8]
3482dsub2lp:
3483 // Add 1st active group
3484 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3485 // no need for mask; shift clears inactive bytes
3486 movq mm0, [edi+ebx]
3487 paddb mm0, mm1
3488 // Add 2nd active group
3489 movq mm1, mm0 // mov updated Raws to mm1
3490 psllq mm1, ShiftBpp // shift data to position correctly
3491 pand mm1, mm7 // mask to use only 2nd active group
3492 paddb mm0, mm1
3493 // Add 3rd active group
3494 movq mm1, mm0 // mov updated Raws to mm1
3495 psllq mm1, ShiftBpp // shift data to position correctly
3496 pand mm1, mm6 // mask to use only 3rd active group
3497 paddb mm0, mm1
3498 // Add 4th active group
3499 movq mm1, mm0 // mov updated Raws to mm1
3500 psllq mm1, ShiftBpp // shift data to position correctly
3501 pand mm1, mm5 // mask to use only 4th active group
3502 add ebx, 8
3503 paddb mm0, mm1
3504 cmp ebx, MMXLength
3505 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3506 movq mm1, mm0 // Prep for doing 1st add at top of loop
3507 jb dsub2lp
3508 } // end _asm block
3509 }
3510 break;
3511 case 8:
3512 {
3513 _asm {
3514 mov edi, row
3515 mov ebx, diff
3516 mov esi, edi // lp = row
3517 add edi, bpp // rp = row + bpp
3518 mov ecx, MMXLength
3519 movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3520                                     // Raw(x-bpp) data set)
3521 and ecx, 0x0000003f // calc bytes over mult of 64
3522dsub8lp:
3523 movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3524 paddb mm0, mm7
3525 movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3526 movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3527 // Now mm0 will be used as Raw(x-bpp) for
3528 // the 2nd group of 8 bytes. This will be
3529 // repeated for each group of 8 bytes with
3530 // the 8th group being used as the Raw(x-bpp)
3531 // for the 1st group of the next loop.
3532 paddb mm1, mm0
3533 movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3534 movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3535 paddb mm2, mm1
3536 movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3537 movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3538 paddb mm3, mm2
3539 movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3540 movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3541 paddb mm4, mm3
3542 movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3543 movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3544 paddb mm5, mm4
3545 movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3546 movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3547 paddb mm6, mm5
3548 movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3549 movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3550 add ebx, 64
3551 paddb mm7, mm6
3552 cmp ebx, ecx
3553 movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3554 jb dsub8lp
3555 cmp ebx, MMXLength
3556 jnb dsub8lt8
3557dsub8lpA:
3558 movq mm0, [edi+ebx]
3559 add ebx, 8
3560 paddb mm0, mm7
3561 cmp ebx, MMXLength
3562 movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
3563         movq mm7, mm0       // Move calculated Raw(x) data to mm7 to
3564 // be the new Raw(x-bpp) for the next loop
3565 jb dsub8lpA
3566dsub8lt8:
3567 } // end _asm block
3568 }
3569 break;
3570 default: // bpp greater than 8 bytes
3571 {
3572 _asm {
3573 mov ebx, diff
3574 mov edi, row
3575 mov esi, edi // lp = row
3576 add edi, bpp // rp = row + bpp
3577dsubAlp:
3578 movq mm0, [edi+ebx]
3579 movq mm1, [esi+ebx]
3580 add ebx, 8
3581 paddb mm0, mm1
3582 cmp ebx, MMXLength
3583         movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3584 jb dsubAlp
3585 } // end _asm block
3586 }
3587 break;
3588 } // end switch ( bpp )
3589
3590 _asm {
3591 mov ebx, MMXLength
3592 mov edi, row
3593 cmp ebx, FullLength
3594 jnb dsubend
3595 mov esi, edi // lp = row
3596 xor eax, eax
3597 add edi, bpp // rp = row + bpp
3598dsublp2:
3599 mov al, [esi+ebx]
3600 add [edi+ebx], al
3601 inc ebx
3602 cmp ebx, FullLength
3603 jb dsublp2
3604dsubend:
3605 emms // End MMX instructions; prep for possible FP instrs.
3606 } // end _asm block
3607}
3608
3609// Optimized code for PNG Up filter decoder
3610void
3611png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3612 png_bytep prev_row)
3613{
3614 png_uint_32 len;
3615 len = row_info->rowbytes; // # of bytes to filter
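   // For reference, the whole Up decode in scalar C (same as the non-MMX
   // fallback in png_read_filter_row below; shown here only as a sketch):
   // png_uint_32 i;
   // for (i = 0; i < len; i++)
   //    row[i] = (png_byte)((row[i] + prev_row[i]) & 0xff);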
3616 _asm {
3617 mov edi, row
3618 // get # of bytes to alignment
3619 mov ecx, edi
3620 xor ebx, ebx
3621 add ecx, 0x7
3622 xor eax, eax
3623 and ecx, 0xfffffff8
3624 mov esi, prev_row
3625 sub ecx, edi
3626 jz dupgo
3627 // fix alignment
3628duplp1:
3629 mov al, [edi+ebx]
3630 add al, [esi+ebx]
3631 inc ebx
3632 cmp ebx, ecx
3633 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3634 jb duplp1
3635dupgo:
3636 mov ecx, len
3637 mov edx, ecx
3638 sub edx, ebx // subtract alignment fix
3639 and edx, 0x0000003f // calc bytes over mult of 64
3640 sub ecx, edx // drop over bytes from length
3641 // Unrolled loop - use all MMX registers and interleave to reduce
3642 // number of branch instructions (loops) and reduce partial stalls
3643duploop:
3644 movq mm1, [esi+ebx]
3645 movq mm0, [edi+ebx]
3646 movq mm3, [esi+ebx+8]
3647 paddb mm0, mm1
3648 movq mm2, [edi+ebx+8]
3649 movq [edi+ebx], mm0
3650 paddb mm2, mm3
3651 movq mm5, [esi+ebx+16]
3652 movq [edi+ebx+8], mm2
3653 movq mm4, [edi+ebx+16]
3654 movq mm7, [esi+ebx+24]
3655 paddb mm4, mm5
3656 movq mm6, [edi+ebx+24]
3657 movq [edi+ebx+16], mm4
3658 paddb mm6, mm7
3659 movq mm1, [esi+ebx+32]
3660 movq [edi+ebx+24], mm6
3661 movq mm0, [edi+ebx+32]
3662 movq mm3, [esi+ebx+40]
3663 paddb mm0, mm1
3664 movq mm2, [edi+ebx+40]
3665 movq [edi+ebx+32], mm0
3666 paddb mm2, mm3
3667 movq mm5, [esi+ebx+48]
3668 movq [edi+ebx+40], mm2
3669 movq mm4, [edi+ebx+48]
3670 movq mm7, [esi+ebx+56]
3671 paddb mm4, mm5
3672 movq mm6, [edi+ebx+56]
3673 movq [edi+ebx+48], mm4
3674 add ebx, 64
3675 paddb mm6, mm7
3676 cmp ebx, ecx
3677         movq [edi+ebx-8], mm6 // (the +56 store) movq does not affect flags;
3678 // -8 to offset add ebx
3679 jb duploop
3680
3681 cmp edx, 0 // Test for bytes over mult of 64
3682 jz dupend
3683
3684
3685 // 2 lines added by lcreeve@netins.net
3686 // (mail 11 Jul 98 in png-implement list)
3687 cmp edx, 8 //test for less than 8 bytes
3688 jb duplt8
3689
3690
3691 add ecx, edx
3692 and edx, 0x00000007 // calc bytes over mult of 8
3693 sub ecx, edx // drop over bytes from length
3694 jz duplt8
3695 // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3696duplpA:
3697 movq mm1, [esi+ebx]
3698 movq mm0, [edi+ebx]
3699 add ebx, 8
3700 paddb mm0, mm1
3701 cmp ebx, ecx
3702 movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3703 jb duplpA
3704 cmp edx, 0 // Test for bytes over mult of 8
3705 jz dupend
3706duplt8:
3707 xor eax, eax
3708 add ecx, edx // move over byte count into counter
3709 // Loop using x86 registers to update remaining bytes
3710duplp2:
3711 mov al, [edi + ebx]
3712 add al, [esi + ebx]
3713 inc ebx
3714 cmp ebx, ecx
3715 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3716 jb duplp2
3717dupend:
3718 // Conversion of filtered row completed
3719 emms // End MMX instructions; prep for possible FP instrs.
3720 } // end _asm block
3721}
3722
3723
3724
3725// Optimized png_read_filter_row routines
3726void
3727png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3728 row, png_bytep prev_row, int filter)
3729{
3730   char filnm[6];
3731   #define UseMMX (1)
3732
3733
3734   if (mmx_supported == 2)
3735       mmx_supported = mmxsupport();
3736   if (!mmx_supported)
3737   {
3738      png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter);
3739      return;
3740   }
3741
3742
3743 png_debug(1, "in png_read_filter_row\n");
3744 png_debug1(0,"%s, ", (UseMMX?"MMX":"x86"));
3745 switch (filter)
3746 {
3747 case 0: sprintf(filnm, "None ");
3748 break;
3749 case 1: sprintf(filnm, "Sub ");
3750 break;
3751 case 2: sprintf(filnm, "Up ");
3752 break;
3753 case 3: sprintf(filnm, "Avg ");
3754 break;
3755 case 4: sprintf(filnm, "Paeth");
3756 break;
3757 default: sprintf(filnm, "Unknw");
3758 break;
3759 }
3760 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3761 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3762 (int)((row_info->pixel_depth + 7) >> 3));
3763 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3764
3765 switch (filter)
3766 {
3767 case PNG_FILTER_VALUE_NONE:
3768 break;
3769 case PNG_FILTER_VALUE_SUB:
3770 {
3771 if ( UseMMX && (row_info->pixel_depth > 8) &&
3772 (row_info->rowbytes >= 128) )
3773 {
3774 png_read_filter_row_mmx_sub(row_info, row);
3775 } //end if UseMMX
3776 else
3777 {
3778 int bpp;
3779 png_bytep rp;
3780 png_bytep lp;
3781 png_uint_32 i;
3782 bpp = (row_info->pixel_depth + 7) >> 3;
3783 for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3784 i < row_info->rowbytes; i++, rp++, lp++)
3785 {
3786 *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3787 }
3788 } //end !UseMMX
3789 break;
3790 }
3791 case PNG_FILTER_VALUE_UP:
3792 {
3793 if ( UseMMX && (row_info->pixel_depth > 8) &&
3794 (row_info->rowbytes >= 128) )
3795 {
3796 png_read_filter_row_mmx_up(row_info, row, prev_row);
3797 } //end if UseMMX
3798 else
3799 {
3800 png_bytep rp;
3801 png_bytep pp;
3802 png_uint_32 i;
3803 for (i = 0, rp = row, pp = prev_row;
3804 i < row_info->rowbytes; i++, rp++, pp++)
3805 {
3806 *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
3807 }
3808 } //end !UseMMX
3809 break;
3810 }
3811 case PNG_FILTER_VALUE_AVG:
3812 {
3813 if ( UseMMX && (row_info->pixel_depth > 8) &&
3814 (row_info->rowbytes >= 128) )
3815 {
3816 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3817 } //end if UseMMX
3818 else
3819 {
3820 png_uint_32 i;
3821 int bpp;
3822 png_bytep rp;
3823 png_bytep pp;
3824 png_bytep lp;
3825 bpp = (row_info->pixel_depth + 7) >> 3;
3826 for (i = 0, rp = row, pp = prev_row;
3827 i < (png_uint_32)bpp; i++, rp++, pp++)
3828 {
3829 *rp = (png_byte)(((int)(*rp) +
3830 ((int)(*pp) / 2)) & 0xff);
3831 }
3832 for (lp = row; i < row_info->rowbytes; i++, rp++, lp++, pp++)
3833 {
3834 *rp = (png_byte)(((int)(*rp) +
3835 (int)(*pp + *lp) / 2) & 0xff);
3836 }
3837 } //end !UseMMX
3838 break;
3839 }
3840 case PNG_FILTER_VALUE_PAETH:
3841 {
3842 if ( UseMMX && (row_info->pixel_depth > 8) &&
3843 (row_info->rowbytes >= 128) )
3844 {
3845 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3846 } //end if UseMMX
3847 else
3848 {
3849 int bpp;
3850 png_uint_32 i;
3851 png_bytep rp;
3852 png_bytep pp;
3853 png_bytep lp;
3854 png_bytep cp;
3855 bpp = (row_info->pixel_depth + 7) >> 3;
3856 for (i = 0, rp = row, pp = prev_row;
3857 i < (png_uint_32)bpp; i++, rp++, pp++)
3858 {
3859 *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
3860 }
3861 for (lp = rp - bpp, cp = pp - bpp;
3862 i < row_info->rowbytes; i++, rp++, pp++, lp++, cp++)
3863 {
3864 int a, b, c, pa, pb, pc, p;
3865 b = *pp;
3866 c = *cp;
3867 a = *lp;
3868 p = a + b - c;
3869 pa = abs(p - a);
3870 pb = abs(p - b);
3871 pc = abs(p - c);
3872 if (pa <= pb && pa <= pc)
3873 p = a;
3874 else if (pb <= pc)
3875 p = b;
3876 else
3877 p = c;
3878 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3879 }
3880 } //end !UseMMX
3881 break;
3882 }
3883 default:
3884 png_error(png_ptr, "Bad adaptive filter type");
3885 break;
3886 }
3887}
3888#endif