/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * libpng 1.0.5d - November 29, 1999
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998, Intel Corporation
 * Copyright (c) 1998, 1999 Glenn Randers-Pehrson
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 */

#define PNG_INTERNAL
#include "png.h"

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)

/*
   One of these might need to be defined.
#define DISABLE_PNGVCRD_COMBINE
#define DISABLE_PNGVCRD_INTERLACE
*/

static int mmx_supported=2;

void
png_read_filter_row_c(png_structp png_ptr, png_row_infop row_info,
   png_bytep row, png_bytep prev_row, int filter);

static int mmxsupport()
{
   int mmx_supported_local = 0;
   _asm {
      pushfd             //Save Eflag to stack
      pop eax            //Get Eflag from stack into eax
      mov ecx, eax       //Make another copy of Eflag in ecx
      xor eax, 0x200000  //Toggle ID bit in Eflag [i.e. bit(21)]
      push eax           //Save modified Eflag back to stack

      popfd              //Restore modified value back to Eflag reg
      pushfd             //Save Eflag to stack
      pop eax            //Get Eflag from stack
      xor eax, ecx       //Compare the new Eflag with the original Eflag
      jz NOT_SUPPORTED   //If the same, the CPUID instruction is not
                         //supported; skip the following instructions and
                         //jump to the NOT_SUPPORTED label

      xor eax, eax       //Set eax to zero

      _asm _emit 0x0f    //CPUID instruction (two-byte opcode)
      _asm _emit 0xa2

      cmp eax, 1         //make sure eax returned a non-zero value
      jl NOT_SUPPORTED   //If eax is zero, CPUID function 1 is unavailable
                         //and MMX cannot be detected

      xor eax, eax       //set eax to zero
      inc eax            //Now increment eax to 1.  This instruction is
                         //faster than the instruction "mov eax, 1"

      _asm _emit 0x0f    //CPUID instruction
      _asm _emit 0xa2

      and edx, 0x00800000  //mask out all bits but the MMX bit (bit 23)
      cmp edx, 0           //zero means MMX is not supported
      jz NOT_SUPPORTED     //if zero, jump; otherwise MMX IS supported

      mov mmx_supported_local, 1  //set return value to 1

NOT_SUPPORTED:
      mov eax, mmx_supported_local  //move return value to eax

   }

   //mmx_supported_local=0; // test code to force "MMX not supported"
   //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);

   return mmx_supported_local;
}
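
/* mmx_supported is a tri-state flag: it is initialized to 2, meaning
 * "not yet tested".  The first call into png_combine_row() or
 * png_do_read_interlace() replaces it with the result of mmxsupport(),
 * i.e. 1 (MMX present) or 0 (MMX absent), so the CPUID probe above runs
 * at most once per process. */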

/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask. */

/* Use this routine for the x86 platform - it uses a faster MMX routine
   if the machine supports MMX. */
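
/* A minimal scalar sketch of how the mask byte is consumed, for
 * illustration only (copy_pixel is a hypothetical helper, not part of
 * libpng):
 *
 *    int m = 0x80;
 *    for (i = 0; i < width; i++)
 *    {
 *       if (m & mask)
 *          copy_pixel(dp, sp, i);          // pixel i is combined
 *       m = (m == 1) ? 0x80 : (m >> 1);    // pattern repeats every 8
 *    }
 */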

void
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif
#ifdef DISABLE_PNGVCRD_COMBINE
   int save_mmx_supported = mmx_supported;
#endif

   png_debug(1,"in png_combine_row_asm\n");

#ifdef DISABLE_PNGVCRD_COMBINE
   if ((png_ptr->transformations & PNG_INTERLACE) && png_ptr->pass != 6)
       mmx_supported = 0;
   else
#endif
       if (mmx_supported == 2)
           mmx_supported = mmxsupport();

   if (mask == 0xff)
   {
      png_memcpy(row, png_ptr->row_buf + 1,
        (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
   }
   /* GRR: add "else if (mask == 0)" case?
    * or does png_combine_row() not even get called in that case? */
   else
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 1:
         {
            png_bytep sp;
            png_bytep dp;
            int s_inc, s_start, s_end;
            int m;
            int shift;
            png_uint_32 i;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }
            else
#endif
            {
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;

                  value = (*sp >> shift) & 0x1;
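                  /* (0x7f7f >> (7 - shift)) & 0xff is just ~(1 << shift)
                   * expressed as a byte: clear the destination bit, then
                   * OR in the new sample value. */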
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 2:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 4:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 8:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int m;
            int diff, unmask;

            __int64 mask0=0x0102040810204080;

            if (mmx_supported)
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               m = 0x80;
               unmask = ~mask;
               len = png_ptr->width &~7; //reduce to multiple of 8
               diff = png_ptr->width & 7; //amount lost

               _asm
               {
                  movd mm7, unmask //load bit pattern
                  psubb mm6,mm6 //zero mm6
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7 //fill register with 8 masks

                  movq mm0,mask0

                  pand mm0,mm7 //nonzero if keep byte
                  pcmpeqb mm0,mm6 //zeros->1s, v versa

                  mov ecx,len //load length of line (pixels)
                  mov esi,srcptr //load source
                  mov ebx,dstptr //load dest
                  cmp ecx,0 //check for zero-length line
                  je mainloop8end

mainloop8:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  pandn mm6,[ebx]
                  por mm4,mm6
                  movq [ebx],mm4

                  add esi,8 //inc by 8 bytes processed
                  add ebx,8
                  sub ecx,8 //dec by 8 pixels processed

                  ja mainloop8
mainloop8end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end8

                  mov edx,mask
                  sal edx,24 //make low byte the high byte

secondloop8:
                  sal edx,1 //move high bit to CF
                  jnc skip8 //if CF = 0
                  mov al,[esi]
                  mov [ebx],al
skip8:
                  inc esi
                  inc ebx

                  dec ecx
                  jnz secondloop8
end8:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } // end 8 bpp
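
         /* A scalar sketch of what the MMX merge in the 8-bpp case above
          * computes, for illustration only (not compiled into libpng):
          *
          *    for (i = 0; i < len; i++)   // len is a multiple of 8 pixels
          *    {
          *       png_byte keep = ((mask >> (7 - (i & 7))) & 1) ? 0xff : 0;
          *       dstptr[i] = (png_byte)((srcptr[i] & keep) |
          *                              (dstptr[i] & ~keep));
          *    }
          *
          * The movd/punpcklbw/punpcklwd/punpckldq sequence replicates
          * ~mask into all eight bytes of mm7; pand with mask0 (one
          * distinct mask bit per byte) and pcmpeqb against zero turn that
          * into a 0x00/0xff per-byte "keep" mask, which pand/pandn/por
          * then use to merge eight pixels per iteration.  The 16-, 24-,
          * 32- and 48-bpp cases below apply the same idea with wider
          * per-pixel masks. */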

         case 16:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;
            __int64 mask1=0x0101020204040808,
                    mask0=0x1010202040408080;

            if (mmx_supported)
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd mm7, unmask //load bit pattern
                  psubb mm6,mm6 //zero mm6
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7 //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1

                  pand mm0,mm7
                  pand mm1,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6

                  mov ecx,len //load length of line
                  mov esi,srcptr //load source
                  mov ebx,dstptr //load dest
                  cmp ecx,0 //check for zero-length line
                  jz mainloop16end

mainloop16:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  add esi,16 //inc by 16 bytes processed
                  add ebx,16
                  sub ecx,8 //dec by 8 pixels processed

                  ja mainloop16

mainloop16end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end16

                  mov edx,mask
                  sal edx,24 //make low byte the high byte
secondloop16:
                  sal edx,1 //move high bit to CF
                  jnc skip16 //if CF = 0
                  mov ax,[esi]
                  mov [ebx],ax
skip16:
                  add esi,2
                  add ebx,2

                  dec ecx
                  jnz secondloop16
end16:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } // end 16 bpp

         case 24:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask2=0x0101010202020404, //24bpp
                    mask1=0x0408080810101020,
                    mask0=0x2020404040808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

            if (mmx_supported)
            {
               _asm
               {
                  movd mm7, unmask //load bit pattern
                  psubb mm6,mm6 //zero mm6
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7 //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6

                  mov ecx,len //load length of line
                  mov esi,srcptr //load source
                  mov ebx,dstptr //load dest
                  cmp ecx,0
                  jz mainloop24end

mainloop24:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm4,mm2
                  movq mm7,[ebx+16]
                  pandn mm4,mm7
                  por mm6,mm4
                  movq [ebx+16],mm6

                  add esi,24 //inc by 24 bytes processed
                  add ebx,24
                  sub ecx,8 //dec by 8 pixels processed

                  ja mainloop24

mainloop24end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end24

                  mov edx,mask
                  sal edx,24 //make low byte the high byte
secondloop24:
                  sal edx,1 //move high bit to CF
                  jnc skip24 //if CF = 0
                  mov ax,[esi]
                  mov [ebx],ax
                  xor eax,eax
                  mov al,[esi+2]
                  mov [ebx+2],al
skip24:
                  add esi,3
                  add ebx,3

                  dec ecx
                  jnz secondloop24

end24:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } // end 24 bpp

         case 32:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask3=0x0101010102020202, //32bpp
                    mask2=0x0404040408080808,
                    mask1=0x1010101020202020,
                    mask0=0x4040404080808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

            if (mmx_supported)
            {
               _asm
               {
                  movd mm7, unmask //load bit pattern
                  psubb mm6,mm6 //zero mm6
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7 //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2
                  movq mm3,mask3

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7
                  pand mm3,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6
                  pcmpeqb mm3,mm6

                  mov ecx,len //load length of line
                  mov esi,srcptr //load source
                  mov ebx,dstptr //load dest

                  cmp ecx,0 //check for zero-length line
                  jz mainloop32end

mainloop32:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm4,mm2
                  movq mm7,[ebx+16]
                  pandn mm4,mm7
                  por mm6,mm4
                  movq [ebx+16],mm6

                  movq mm7,[esi+24]
                  pand mm7,mm3
                  movq mm5,mm3
                  movq mm4,[ebx+24]
                  pandn mm5,mm4
                  por mm7,mm5
                  movq [ebx+24],mm7

                  add esi,32 //inc by 32 bytes processed
                  add ebx,32
                  sub ecx,8 //dec by 8 pixels processed

                  ja mainloop32

mainloop32end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end32

                  mov edx,mask
                  sal edx,24 //make low byte the high byte
secondloop32:
                  sal edx,1 //move high bit to CF
                  jnc skip32 //if CF = 0
                  mov eax,[esi]
                  mov [ebx],eax
skip32:
                  add esi,4
                  add ebx,4

                  dec ecx
                  jnz secondloop32

end32:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } // end 32 bpp

         case 48:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask5=0x0101010101010202,
                    mask4=0x0202020204040404,
                    mask3=0x0404080808080808,
                    mask2=0x1010101010102020,
                    mask1=0x2020202040404040,
                    mask0=0x4040808080808080;

            if (mmx_supported)
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd mm7, unmask //load bit pattern
                  psubb mm6,mm6 //zero mm6
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7 //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2
                  movq mm3,mask3
                  movq mm4,mask4
                  movq mm5,mask5

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7
                  pand mm3,mm7
                  pand mm4,mm7
                  pand mm5,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6
                  pcmpeqb mm3,mm6
                  pcmpeqb mm4,mm6
                  pcmpeqb mm5,mm6

                  mov ecx,len //load length of line
                  mov esi,srcptr //load source
                  mov ebx,dstptr //load dest

                  cmp ecx,0
                  jz mainloop48end

mainloop48:
                  movq mm7,[esi]
                  pand mm7,mm0
                  movq mm6,mm0
                  pandn mm6,[ebx]
                  por mm7,mm6
                  movq [ebx],mm7

                  movq mm6,[esi+8]
                  pand mm6,mm1
                  movq mm7,mm1
                  pandn mm7,[ebx+8]
                  por mm6,mm7
                  movq [ebx+8],mm6

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm7,mm2
                  pandn mm7,[ebx+16]
                  por mm6,mm7
                  movq [ebx+16],mm6

                  movq mm7,[esi+24]
                  pand mm7,mm3
                  movq mm6,mm3
                  pandn mm6,[ebx+24]
                  por mm7,mm6
                  movq [ebx+24],mm7

                  movq mm6,[esi+32]
                  pand mm6,mm4
                  movq mm7,mm4
                  pandn mm7,[ebx+32]
                  por mm6,mm7
                  movq [ebx+32],mm6

                  movq mm7,[esi+40]
                  pand mm7,mm5
                  movq mm6,mm5
                  pandn mm6,[ebx+40]
                  por mm7,mm6
                  movq [ebx+40],mm7

                  add esi,48 //inc by 48 bytes processed
                  add ebx,48
                  sub ecx,8 //dec by 8 pixels processed

                  ja mainloop48
mainloop48end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end48

                  mov edx,mask
                  sal edx,24 //make low byte the high byte

secondloop48:
                  sal edx,1 //move high bit to CF
                  jnc skip48 //if CF = 0
                  mov eax,[esi] //copy all 6 bytes of the pixel
                  mov [ebx],eax
                  mov ax,[esi+4]
                  mov [ebx+4],ax
skip48:
                  add esi,6
                  add ebx,6

                  dec ecx
                  jnz secondloop48

end48:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } // end 48 bpp

         default:
         {
            png_bytep sptr;
            png_bytep dp;
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
            unsigned int i;
            register int disp = png_pass_inc[png_ptr->pass]; // get the offset
            register unsigned int incr1, initial_val, final_val;

            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
               pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
            {
               png_memcpy(dp, sptr, pixel_bytes);
               sptr += incr1;
               dp += incr1;
            }
            break;
         }
      } /* end switch (png_ptr->row_info.pixel_depth) */
   } /* end if (non-trivial mask) */

#ifdef DISABLE_PNGVCRD_COMBINE
   mmx_supported = save_mmx_supported;
#endif

} /* end png_combine_row() */


#if defined(PNG_READ_INTERLACING_SUPPORTED)

void
png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass,
   png_uint_32 transformations)
{
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif
#ifdef DISABLE_PNGVCRD_INTERLACE
   int save_mmx_supported = mmx_supported;
#endif

   png_debug(1,"in png_do_read_interlace\n");

#ifdef DISABLE_PNGVCRD_INTERLACE
   /* In libpng versions 1.0.3a through 1.0.4d,
    * a sign error in the post-MMX cleanup code for each pixel_depth resulted
    * in bad pixels at the beginning of some rows of some images, and also
    * (due to out-of-range memory reads and writes) caused heap corruption
    * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e,
    * and the code appears to work completely correctly, so it is enabled
    * by default.
    */
   if (1) /* all passes caused a heap problem in the old code */
      mmx_supported = 0;
   else
#endif
      if (mmx_supported == 2)
          mmx_supported = mmxsupport();

   if (row != NULL && row_info != NULL)
   {
      png_uint_32 final_width;

      final_width = row_info->width * png_pass_inc[pass];
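
      /* For example, in Adam7 pass 0 (png_pass_inc[0] == 8) every stored
       * pixel is replicated across an 8-pixel span, so a pass-0 row
       * holding 3 pixels expands to final_width = 3 * 8 = 24 pixels. */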

      switch (row_info->pixel_depth)
      {
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         default: // This is the place where the routine is modified
         {
            __int64 const4 = 0x0000000000FFFFFF;
            // __int64 const5 = 0x000000FFFFFF0000;  // unused...
            __int64 const6 = 0x00000000000000FF;
            png_bytep sptr, dp;
            png_uint_32 i;
            png_size_t pixel_bytes;
            int width = row_info->width;

            pixel_bytes = (row_info->pixel_depth >> 3);

            sptr = row + (width - 1) * pixel_bytes;
            dp = row + (final_width - 1) * pixel_bytes;
            // New code by Nirav Chhatrapati - Intel Corporation
            // sign fix by GRR
            // NOTE:  there is NO MMX code for 48-bit and 64-bit images

            if (mmx_supported) // use MMX routine if machine supports it
            {
               if (pixel_bytes == 3)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
loop_pass0:
                        movd mm0, [esi] ; X X X X X v2 v1 v0
                        pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
                        psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
                        movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
                        punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
                        movq [edi+16], mm4
                        psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
                        movq [edi+8], mm3
                        punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
                        sub esi, 3
                        movq [edi], mm0
                        sub edi, 24
                        //sub esi, 3
                        dec ecx
                        jnz loop_pass0
                        EMMS
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 9    // (png_pass_inc[pass] - 1)*pixel_bytes
loop_pass2:
                        movd mm0, [esi] ; X X X X X v2 v1 v0
                        pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq [edi+4], mm0 ; move to memory
                        psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
                        movd [edi], mm0 ; move to memory
                        sub esi, 3
                        sub edi, 12
                        dec ecx
                        jnz loop_pass2
                        EMMS
                     }
                  }
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 1) << 1) - 8;
                     if (width_mmx < 0)
                        width_mmx = 0;
                     width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 3
                           sub edi, 9
loop_pass4:
                           movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
                           movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
                           movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
                           psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
                           pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
                           psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
                           por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
                           movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
                           psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
                           movq [edi], mm0 ; move quad to memory
                           psrlq mm5, 16 ; 0 0 0 0 0 X X v2
                           pand mm5, const6 ; 0 0 0 0 0 0 0 v2
                           por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
                           movd [edi+8], mm6 ; move double to memory
                           sub esi, 6
                           sub edi, 12
                           sub ecx, 2
                           jnz loop_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx*3;
                     dp -= width_mmx*6;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, 3);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, 3);
                           dp -= 3;
                        }
                        sptr -= 3;
                     }
                  }
               } /* end of pixel_bytes == 3 */

               else if (pixel_bytes == 1)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 31
                           sub esi, 3
loop1_pass0:
                           movd mm0, [esi] ; X X X X v0 v1 v2 v3
                           movq mm1, mm0 ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
                           movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
                           punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
                           movq [edi], mm0 ; move to memory v3
                           punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi+8], mm3 ; move to memory v2
                           movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
                           punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
                           punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
                           movq [edi+16], mm2 ; move to memory v1
                           movq [edi+24], mm4 ; move to memory v0
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 4
                           jnz loop1_pass0
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*8;
                     for (i = width; i; i--)
                     {
                        int j;

                        /* I simplified this part in version 1.0.4e
                         * here and in several other instances where
                         * pixel_bytes == 1 -- GR-P
                         *
                         * Original code:
                         *
                         * png_byte v[8];
                         * png_memcpy(v, sptr, pixel_bytes);
                         * for (j = 0; j < png_pass_inc[pass]; j++)
                         * {
                         *    png_memcpy(dp, v, pixel_bytes);
                         *    dp -= pixel_bytes;
                         * }
                         * sptr -= pixel_bytes;
                         *
                         * Replacement code is in the next three lines:
                         */

                        for (j = 0; j < png_pass_inc[pass]; j++)
                           *dp-- = *sptr;
                        sptr--;
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 3
loop1_pass2:
                           movd mm0, [esi] ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi], mm0 ; move to memory v2 and v3
                           sub esi, 4
                           movq [edi+8], mm1 ; move to memory v1 and v0
                           sub edi, 16
                           sub ecx, 4
                           jnz loop1_pass2
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*4;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 3) << 3);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 7
loop1_pass4:
                           movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
                           movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
                           punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
                           //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpckhbw mm1, mm1 ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
                           sub esi, 8
                           movq [edi], mm0 ; move to memory v4 v5 v6 and v7
                           //sub esi, 4
                           sub edi, 16
                           sub ecx, 8
                           jnz loop1_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*2;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
               } /* end of pixel_bytes == 1 */

               else if (pixel_bytes == 2)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 30
loop2_pass0:
                           movd mm0, [esi] ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm1
                           movq [edi + 24], mm1
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 2
                           jnz loop2_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2); // sign fixed
                     dp -= (width_mmx*16 - 2);  // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 14
loop2_pass2:
                           movd mm0, [esi] ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           sub esi, 4
                           movq [edi + 8], mm1
                           //sub esi, 4
                           sub edi, 16
                           sub ecx, 2
                           jnz loop2_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2); // sign fixed
                     dp -= (width_mmx*8 - 2);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (width) // pass == 4 or 5
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 6
loop2_pass4:
                           movd mm0, [esi] ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
                           sub esi, 4
                           movq [edi], mm0
                           sub edi, 8
                           sub ecx, 2
                           jnz loop2_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2); // sign fixed
                     dp -= (width_mmx*4 - 2);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
               } /* end of pixel_bytes == 2 */

               else if (pixel_bytes == 4)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 60
loop4_pass0:
                           movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm0
                           movq [edi + 24], mm0
                           movq [edi + 32], mm1
                           movq [edi + 40], mm1
                           movq [edi + 48], mm1
                           sub esi, 8
                           movq [edi + 56], mm1
                           sub edi, 64
                           sub ecx, 2
                           jnz loop4_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4); // sign fixed
                     dp -= (width_mmx*32 - 4);  // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 28
loop4_pass2:
                           movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm1
                           movq [edi + 24], mm1
                           sub esi, 8
                           sub edi, 32
                           sub ecx, 2
                           jnz loop4_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4); // sign fixed
                     dp -= (width_mmx*16 - 4);  // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (width) // pass == 4 or 5
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 12
loop4_pass4:
                           movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           sub esi, 8
                           movq [edi + 8], mm1
                           sub edi, 16
                           sub ecx, 2
                           jnz loop4_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4); // sign fixed
                     dp -= (width_mmx*8 - 4);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }

               } /* end of pixel_bytes == 4 */

               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 6);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 6);
                        dp -= 6;
                     }
                     sptr -= 6;
                  }
               } /* end of pixel_bytes == 6 */

               else
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
            } /* end of mmx_supported */

            else /* MMX not supported: use modified C code - takes advantage
                  * of inlining of memcpy for a constant */
            {
               if (pixel_bytes == 1)
               {
                  for (i = width; i; i--)
                  {
                     int j;
                     for (j = 0; j < png_pass_inc[pass]; j++)
                        *dp-- = *sptr;
                     sptr--;
                  }
               }
               else if (pixel_bytes == 3)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 2)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 4)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }

            } /* end of MMX not supported */
            break;
         }
      } /* end switch (row_info->pixel_depth) */

      row_info->width = final_width;
      row_info->rowbytes = ((final_width *
         (png_uint_32)row_info->pixel_depth + 7) >> 3);
   }

#ifdef DISABLE_PNGVCRD_INTERLACE
   mmx_supported = save_mmx_supported;
#endif
}

#endif /* PNG_READ_INTERLACING_SUPPORTED */


// These variables are utilized in the functions below.  They are declared
// globally here to ensure alignment on 8-byte boundaries.

union uAll {
   __int64 use;
   double align;
} LBCarryMask = {0x0101010101010101},
  HBClearMask = {0x7f7f7f7f7f7f7f7f},
  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
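// The otherwise-unused double member is what provides that alignment:
// Visual C++ places a double on an 8-byte boundary, so each union (and
// therefore its __int64 member, the value the MMX code loads with movq)
// is 8-byte aligned.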


// Optimized code for PNG Average filter decoder
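// The Average filter reconstructs each byte as
//
//    Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2)
//
// where Raw(x-bpp) is taken as 0 for the first bpp bytes.  A minimal
// scalar sketch of the same computation (illustration only; the MMX code
// below produces identical results eight bytes at a time):
//
//    for (x = 0; x < bpp; x++)
//       row[x] = (png_byte)(row[x] + (prev_row[x] >> 1));
//    for ( ; x < row_info->rowbytes; x++)
//       row[x] = (png_byte)(row[x] +
//                           ((row[x - bpp] + prev_row[x]) >> 1));
//
// The sum Raw(x-bpp) + Prior(x) needs nine bits; the MMX loops stay in
// eight by using floor((a + b)/2) == (a >> 1) + (b >> 1) + (a & b & 1),
// which is what LBCarryMask (the per-byte lsb mask) implements.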
void
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
{
   int bpp;
   png_uint_32 FullLength;
   png_uint_32 MMXLength;
   //png_uint_32 len;
   int diff;

   bpp = (row_info->pixel_depth + 7) >> 3;  // Get # bytes per pixel
   FullLength = row_info->rowbytes;         // # of bytes to filter
   _asm {
      // Init address pointers and offset
      mov edi, row          // edi ==> Avg(x)
      xor ebx, ebx          // ebx ==> x
      mov edx, edi
      mov esi, prev_row     // esi ==> Prior(x)
      sub edx, bpp          // edx ==> Raw(x-bpp)

      xor eax, eax
      // Compute the Raw value for the first bpp bytes
      // Raw(x) = Avg(x) + (Prior(x)/2)
davgrlp:
      mov al, [esi + ebx]   // Load al with Prior(x)
      inc ebx
      shr al, 1             // divide by 2
      add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
      cmp ebx, bpp
      mov [edi+ebx-1], al   // Write back Raw(x);
                            // mov does not affect flags; -1 to offset inc ebx
      jb davgrlp
      // get # of bytes to alignment
      mov diff, edi         // take start of row
      add diff, ebx         // add bpp
      add diff, 0xf         // add 7 + 8 to incr past alignment boundary
      and diff, 0xfffffff8  // mask to alignment boundary
      sub diff, edi         // subtract from start ==> value ebx at alignment
      jz davggo
      // fix alignment
      // Compute the Raw value for the bytes up to the alignment boundary
      // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
      xor ecx, ecx
davglp1:
      xor eax, eax
      mov cl, [esi + ebx]   // load cl with Prior(x)
      mov al, [edx + ebx]   // load al with Raw(x-bpp)
      add ax, cx
      inc ebx
      shr ax, 1             // divide by 2
      add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
      cmp ebx, diff         // Check if at alignment boundary
      mov [edi+ebx-1], al   // Write back Raw(x);
                            // mov does not affect flags; -1 to offset inc ebx
      jb davglp1            // Repeat until at alignment boundary
davggo:
      mov eax, FullLength
      mov ecx, eax
      sub eax, ebx          // subtract alignment fix
      and eax, 0x00000007   // calc bytes over mult of 8
      sub ecx, eax          // drop over bytes from original length
      mov MMXLength, ecx
   } // end _asm block
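   // Worked example of the alignment math above: with row == 0x1003 and
   // bpp == 3, ebx is 3 after the first loop, so
   //    diff = ((0x1003 + 3 + 0xf) & ~7) - 0x1003 = 0x1010 - 0x1003 = 13;
   // bytes 3..12 are handled by the byte loop and the MMX loop starts at
   // x = 13, where row + 13 is 8-byte aligned.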
   // Now do the math for the rest of the row
   switch ( bpp )
   {
      case 3:
      {
         ActiveMask.use = 0x0000000000ffffff;
         ShiftBpp.use = 24;    // == 3 * 8
         ShiftRem.use = 40;    // == 64 - 24
         _asm {
            // Re-init address pointers and offset
            movq mm7, ActiveMask
            mov ebx, diff        // ebx ==> x = offset to alignment boundary
            movq mm5, LBCarryMask
            mov edi, row         // edi ==> Avg(x)
            movq mm4, HBClearMask
            mov esi, prev_row    // esi ==> Prior(x)
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                 // (we correct position in loop below)
davg3lp:
            movq mm0, [edi + ebx]  // Load mm0 with Avg(x)
            // Add (Prev_row/2) to Average
            movq mm3, mm5
            psrlq mm2, ShiftRem  // Correct position Raw(x-bpp) data
            movq mm1, [esi + ebx]  // Load mm1 with Prior(x)
            movq mm6, mm7
            pand mm3, mm1        // get lsb for each prev_row byte
            psrlq mm1, 1         // divide prev_row bytes by 2
            pand mm1, mm4        // clear invalid bit 7 of each byte
            paddb mm0, mm1       // add (Prev_row/2) to Avg for each byte
            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6        // Leave only Active Group 1 bytes to add to Avg
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each Active
                                 // byte
            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 3-5
            movq mm2, mm0        // mov updated Raws to mm2
            psllq mm2, ShiftBpp  // shift data to position correctly
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each Active
                                 // byte

            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
            psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two
                                 // bytes
            movq mm2, mm0        // mov updated Raws to mm2
            psllq mm2, ShiftBpp  // shift data to position correctly
                                 // Data only needs to be shifted once here to
                                 // get the correct x-bpp offset.
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
            add ebx, 8
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each Active
                                 // byte

            // Now ready to write back to memory
            movq [edi + ebx - 8], mm0
            // Move updated Raw(x) to use as Raw(x-bpp) for next loop
            cmp ebx, MMXLength
            movq mm2, mm0        // mov updated Raw(x) to mm2
            jb davg3lp
         } // end _asm block
      }
      break;

      case 6:
      case 4:
      case 7:
      case 5:
      {
         ActiveMask.use = 0xffffffffffffffff;  // use shift below to clear
                                               // appropriate inactive bytes
         ShiftBpp.use = bpp << 3;
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm {
            movq mm4, HBClearMask
            // Re-init address pointers and offset
            mov ebx, diff        // ebx ==> x = offset to alignment boundary
            // Load ActiveMask and clear all bytes except for 1st active group
            movq mm7, ActiveMask
            mov edi, row         // edi ==> Avg(x)
            psrlq mm7, ShiftRem
            mov esi, prev_row    // esi ==> Prior(x)
            movq mm6, mm7
            movq mm5, LBCarryMask
            psllq mm6, ShiftBpp  // Create mask for 2nd active group
            // PRIME the pump (load the first Raw(x-bpp) data set
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                 // (we correct position in loop below)
davg4lp:
            movq mm0, [edi + ebx]
            psrlq mm2, ShiftRem  // shift data to position correctly
            movq mm1, [esi + ebx]
            // Add (Prev_row/2) to Average
            movq mm3, mm5
            pand mm3, mm1        // get lsb for each prev_row byte
            psrlq mm1, 1         // divide prev_row bytes by 2
            pand mm1, mm4        // clear invalid bit 7 of each byte
            paddb mm0, mm1       // add (Prev_row/2) to Avg for each byte
            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm7        // Leave only Active Group 1 bytes to add to Avg
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each Active
                                 // byte
            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
            movq mm2, mm0        // mov updated Raws to mm2
            psllq mm2, ShiftBpp  // shift data to position correctly
            add ebx, 8
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each Active
                                 // byte
            cmp ebx, MMXLength
            // Now ready to write back to memory
            movq [edi + ebx - 8], mm0
            // Prep Raw(x-bpp) for next loop
            movq mm2, mm0        // mov updated Raws to mm2
            jb davg4lp
         } // end _asm block
      }
      break;
2112 case 2:
2113 {
2114 ActiveMask.use = 0x000000000000ffff;
2115 ShiftBpp.use = 16; // == 2 * 8
2116 ShiftRem.use = 48; // == 64 - 16
2117 _asm {
2118 // Load ActiveMask
2119 movq mm7, ActiveMask
2120 // Re-init address pointers and offset
2121 mov ebx, diff // ebx ==> x = offset to alignment boundary
2122 movq mm5, LBCarryMask
2123 mov edi, row // edi ==> Avg(x)
2124 movq mm4, HBClearMask
2125 mov esi, prev_row // esi ==> Prior(x)
2126 // PRIME the pump (load the first Raw(x-bpp) data set)
2127 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2128 // (we correct position in loop below)
2129davg2lp:
2130 movq mm0, [edi + ebx]
2131 psrlq mm2, ShiftRem // shift data to position correctly
2132 movq mm1, [esi + ebx]
2133 // Add (Prev_row/2) to Average
2134 movq mm3, mm5
2135 pand mm3, mm1 // get lsb for each prev_row byte
2136 psrlq mm1, 1 // divide prev_row bytes by 2
2137 pand mm1, mm4 // clear invalid bit 7 of each byte
2138 movq mm6, mm7
2139 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2140 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2141 movq mm1, mm3 // now use mm1 for getting LBCarrys
2142 pand mm1, mm2 // get LBCarrys for each byte where both
2143 // lsb's were == 1 (Only valid for active group)
2144 psrlq mm2, 1 // divide raw bytes by 2
2145 pand mm2, mm4 // clear invalid bit 7 of each byte
2146 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2147 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2148 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2149 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2150 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2151 movq mm2, mm0 // mov updated Raws to mm2
2152 psllq mm2, ShiftBpp // shift data to position correctly
2153 movq mm1, mm3 // now use mm1 for getting LBCarrys
2154 pand mm1, mm2 // get LBCarrys for each byte where both
2155 // lsb's were == 1 (Only valid for active group)
2156 psrlq mm2, 1 // divide raw bytes by 2
2157 pand mm2, mm4 // clear invalid bit 7 of each byte
2158 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2159 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2160 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2161
2162 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2163 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2164 movq mm2, mm0 // mov updated Raws to mm2
2165 psllq mm2, ShiftBpp // shift data to position correctly
2166 // Data only needs to be shifted once here to
2167 // get the correct x-bpp offset.
2168 movq mm1, mm3 // now use mm1 for getting LBCarrys
2169 pand mm1, mm2 // get LBCarrys for each byte where both
2170 // lsb's were == 1 (Only valid for active group)
2171 psrlq mm2, 1 // divide raw bytes by 2
2172 pand mm2, mm4 // clear invalid bit 7 of each byte
2173 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2174 pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
2175 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2176
2177 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2178 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2179 movq mm2, mm0 // mov updated Raws to mm2
2180 psllq mm2, ShiftBpp // shift data to position correctly
2181 // Data only needs to be shifted once here to
2182 // get the correct x-bpp offset.
2183 add ebx, 8
2184 movq mm1, mm3 // now use mm1 for getting LBCarrys
2185 pand mm1, mm2 // get LBCarrys for each byte where both
2186 // lsb's were == 1 (Only valid for active group)
2187 psrlq mm2, 1 // divide raw bytes by 2
2188 pand mm2, mm4 // clear invalid bit 7 of each byte
2189 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2190 pand mm2, mm6 // Leave only Active Group 4 bytes to add to Avg
2191 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2192
2193 cmp ebx, MMXLength
2194 // Now ready to write back to memory
2195 movq [edi + ebx - 8], mm0
2196 // Prep Raw(x-bpp) for next loop
2197 movq mm2, mm0 // mov updated Raws to mm2
2198 jb davg2lp
2199 } // end _asm block
2200 }
2201 break;
2202
2203 case 1: // bpp == 1
2204 {
2205 _asm {
2206 // Re-init address pointers and offset
2207 mov ebx, diff // ebx ==> x = offset to alignment boundary
2208 mov edi, row // edi ==> Avg(x)
2209 cmp ebx, FullLength // Test if offset at end of array
2210 jnb davg1end
2211 // Do Avg decode for remaining bytes
2212 mov esi, prev_row // esi ==> Prior(x)
2213 mov edx, edi
2214 xor ecx, ecx // zero ecx before using cl & cx in loop below
2215 sub edx, bpp // edx ==> Raw(x-bpp)
2216davg1lp:
2217 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2218 xor eax, eax
2219 mov cl, [esi + ebx] // load cl with Prior(x)
2220 mov al, [edx + ebx] // load al with Raw(x-bpp)
2221 add ax, cx
2222 inc ebx
2223 shr ax, 1 // divide by 2
2224 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2225 cmp ebx, FullLength // Check if at end of array
2226 mov [edi+ebx-1], al // Write back Raw(x);
2227 // mov does not affect flags; -1 to offset inc ebx
2228 jb davg1lp
2229davg1end:
2230 } // end _asm block
2231 }
2232 return;
2233
2234 case 8: // bpp == 8
2235 {
2236 _asm {
2237 // Re-init address pointers and offset
2238 mov ebx, diff // ebx ==> x = offset to alignment boundary
2239 movq mm5, LBCarryMask
2240 mov edi, row // edi ==> Avg(x)
2241 movq mm4, HBClearMask
2242 mov esi, prev_row // esi ==> Prior(x)
2243 // PRIME the pump (load the first Raw(x-bpp) data set)
2244 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2245 // (NO NEED to correct position in loop below)
2246davg8lp:
2247 movq mm0, [edi + ebx]
2248 movq mm3, mm5
2249 movq mm1, [esi + ebx]
2250 add ebx, 8
2251 pand mm3, mm1 // get lsb for each prev_row byte
2252 psrlq mm1, 1 // divide prev_row bytes by 2
2253 pand mm3, mm2 // get LBCarrys for each byte where both
2254 // lsb's were == 1
2255 psrlq mm2, 1 // divide raw bytes by 2
2256 pand mm1, mm4 // clear invalid bit 7 of each byte
2257 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2258 pand mm2, mm4 // clear invalid bit 7 of each byte
2259 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2260 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2261 cmp ebx, MMXLength
2262 movq [edi + ebx - 8], mm0
2263 movq mm2, mm0 // reuse as Raw(x-bpp)
2264 jb davg8lp
2265 } // end _asm block
2266 }
2267 break;
2268 default: // bpp greater than 8
2269 {
2270 _asm {
2271 movq mm5, LBCarryMask
2272 // Re-init address pointers and offset
2273 mov ebx, diff // ebx ==> x = offset to alignment boundary
2274 mov edi, row // edi ==> Avg(x)
2275 movq mm4, HBClearMask
2276 mov edx, edi
2277 mov esi, prev_row // esi ==> Prior(x)
2278 sub edx, bpp // edx ==> Raw(x-bpp)
2279davgAlp:
2280 movq mm0, [edi + ebx]
2281 movq mm3, mm5
2282 movq mm1, [esi + ebx]
2283 pand mm3, mm1 // get lsb for each prev_row byte
2284 movq mm2, [edx + ebx]
2285 psrlq mm1, 1 // divide prev_row bytes by 2
2286 pand mm3, mm2 // get LBCarrys for each byte where both
2287 // lsb's were == 1
2288 psrlq mm2, 1 // divide raw bytes by 2
2289 pand mm1, mm4 // clear invalid bit 7 of each byte
2290 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2291 pand mm2, mm4 // clear invalid bit 7 of each byte
2292 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2293 add ebx, 8
2294 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2295 cmp ebx, MMXLength
2296 movq [edi + ebx - 8], mm0
2297 jb davgAlp
2298 } // end _asm block
2299 }
2300 break;
2301 } // end switch ( bpp )
2302
2303 _asm {
2304 // MMX acceleration complete now do clean-up
2305 // Check if any remaining bytes left to decode
2306 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2307 mov edi, row // edi ==> Avg(x)
2308 cmp ebx, FullLength // Test if offset at end of array
2309 jnb davgend
2310 // Do Avg decode for remaining bytes
2311 mov esi, prev_row // esi ==> Prior(x)
2312 mov edx, edi
2313 xor ecx, ecx // zero ecx before using cl & cx in loop below
2314 sub edx, bpp // edx ==> Raw(x-bpp)
2315davglp2:
2316 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2317 xor eax, eax
2318 mov cl, [esi + ebx] // load cl with Prior(x)
2319 mov al, [edx + ebx] // load al with Raw(x-bpp)
2320 add ax, cx
2321 inc ebx
2322 shr ax, 1 // divide by 2
2323 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2324 cmp ebx, FullLength // Check if at end of array
2325 mov [edi+ebx-1], al // Write back Raw(x);
2326 // mov does not affect flags; -1 to offset inc ebx
2327 jb davglp2
2328davgend:
2329 emms // End MMX instructions; prep for possible FP instrs.
2330 } // end _asm block
2331}
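// The MMX blocks above avoid widening to 9 bits by computing each
// bytewise average as (a >> 1) + (b >> 1) + (a & b & 1); the LBCarry
// masks recover the carry bit that the two half-shifts discard, and
// HBClearMask drops the bit shifted in at the top.  A minimal C model
// of that identity (helper name is illustrative only):
//
//    static unsigned png_avg_bytes(unsigned a, unsigned b)
//    {
//       /* equals ((a + b) >> 1) & 0xff for 0 <= a, b <= 255 */
//       return (((a & 0xff) >> 1) + ((b & 0xff) >> 1) + (a & b & 1)) & 0xff;
//    }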
2332
2333// Optimized code for PNG Paeth filter decoder
2334void
2335png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2336 png_bytep prev_row)
2337{
2338 png_uint_32 FullLength;
2339 png_uint_32 MMXLength;
2340 //png_uint_32 len;
2341 int bpp;
2342 int diff;
2343 //int ptemp;
2344 int patemp, pbtemp, pctemp;
2345
2346 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2347 FullLength = row_info->rowbytes; // # of bytes to filter
2348 _asm
2349 {
2350 xor ebx, ebx // ebx ==> x offset
2351 mov edi, row
2352 xor edx, edx // edx ==> x-bpp offset
2353 mov esi, prev_row
2354 xor eax, eax
2355
2356 // Compute the Raw value for the first bpp bytes
2357 // Note: the formula works out to be always
2358 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
2359dpthrlp:
2360 mov al, [edi + ebx]
2361 add al, [esi + ebx]
2362 inc ebx
2363 cmp ebx, bpp
2364 mov [edi + ebx - 1], al
2365 jb dpthrlp
2366 // get # of bytes to alignment
2367 mov diff, edi // take start of row
2368 add diff, ebx // add bpp
2369 xor ecx, ecx
2370 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2371 and diff, 0xfffffff8 // mask to alignment boundary
2372 sub diff, edi // subtract from start ==> value ebx at alignment
2373 jz dpthgo
2374 // fix alignment
2375dpthlp1:
2376 xor eax, eax
2377 // pav = p - a = (a + b - c) - a = b - c
2378 mov al, [esi + ebx] // load Prior(x) into al
2379 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2380 sub eax, ecx // subtract Prior(x-bpp)
2381 mov patemp, eax // Save pav for later use
2382 xor eax, eax
2383 // pbv = p - b = (a + b - c) - b = a - c
2384 mov al, [edi + edx] // load Raw(x-bpp) into al
2385 sub eax, ecx // subtract Prior(x-bpp)
2386 mov ecx, eax
2387 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2388 add eax, patemp // pcv = pav + pbv
2389 // pc = abs(pcv)
2390 test eax, 0x80000000
2391 jz dpthpca
2392 neg eax // reverse sign of neg values
2393dpthpca:
2394 mov pctemp, eax // save pc for later use
2395 // pb = abs(pbv)
2396 test ecx, 0x80000000
2397 jz dpthpba
2398 neg ecx // reverse sign of neg values
2399dpthpba:
2400 mov pbtemp, ecx // save pb for later use
2401 // pa = abs(pav)
2402 mov eax, patemp
2403 test eax, 0x80000000
2404 jz dpthpaa
2405 neg eax // reverse sign of neg values
2406dpthpaa:
2407 mov patemp, eax // save pa for later use
2408 // test if pa <= pb
2409 cmp eax, ecx
2410 jna dpthabb
2411 // pa > pb; now test if pb <= pc
2412 cmp ecx, pctemp
2413 jna dpthbbc
2414 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2415 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2416 jmp dpthpaeth
2417dpthbbc:
2418 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2419 mov cl, [esi + ebx] // load Prior(x) into cl
2420 jmp dpthpaeth
2421dpthabb:
2422 // pa <= pb; now test if pa <= pc
2423 cmp eax, pctemp
2424 jna dpthabc
2425 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2426 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2427 jmp dpthpaeth
2428dpthabc:
2429 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2430 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2431dpthpaeth:
2432 inc ebx
2433 inc edx
2434 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2435 add [edi + ebx - 1], cl
2436 cmp ebx, diff
2437 jb dpthlp1
2438dpthgo:
2439 mov ecx, FullLength
2440 mov eax, ecx
2441 sub eax, ebx // subtract alignment fix
2442 and eax, 0x00000007 // calc bytes over mult of 8
2443 sub ecx, eax // drop over bytes from original length
2444 mov MMXLength, ecx
2445 } // end _asm block
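   // The scalar prologue above (and the MMX cases below) evaluate the
   // standard Paeth predictor from the PNG spec via the identity
   // p = a + b - c, so pav = b - c, pbv = a - c, pcv = pav + pbv.
   // A hedged one-byte C sketch of the same selection (names are
   // illustrative only):
   //
   //    int pa = abs(b - c);              /* p - a */
   //    int pb = abs(a - c);              /* p - b */
   //    int pc = abs((b - c) + (a - c));  /* p - c */
   //    int pred = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
   //    raw = (png_byte)(paeth + pred);   /* mod-256 add */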
2446 // Now do the math for the rest of the row
2447 switch ( bpp )
2448 {
2449 case 3:
2450 {
2451 ActiveMask.use = 0x0000000000ffffff;
2452 ActiveMaskEnd.use = 0xffff000000000000;
2453 ShiftBpp.use = 24; // == bpp(3) * 8
2454 ShiftRem.use = 40; // == 64 - 24
2455 _asm
2456 {
2457 mov ebx, diff
2458 mov edi, row
2459 mov esi, prev_row
2460 pxor mm0, mm0
2461 // PRIME the pump (load the first Raw(x-bpp) data set)
2462 movq mm1, [edi+ebx-8]
2463dpth3lp:
2464 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2465 movq mm2, [esi + ebx] // load b=Prior(x)
2466 punpcklbw mm1, mm0 // Unpack Low bytes of a
2467 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2468 punpcklbw mm2, mm0 // Unpack Low bytes of b
2469 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2470 // pav = p - a = (a + b - c) - a = b - c
2471 movq mm4, mm2
2472 punpcklbw mm3, mm0 // Unpack Low bytes of c
2473 // pbv = p - b = (a + b - c) - b = a - c
2474 movq mm5, mm1
2475 psubw mm4, mm3
2476 pxor mm7, mm7
2477 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2478 movq mm6, mm4
2479 psubw mm5, mm3
2480
2481 // pa = abs(p-a) = abs(pav)
2482 // pb = abs(p-b) = abs(pbv)
2483 // pc = abs(p-c) = abs(pcv)
2484 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2485 paddw mm6, mm5
2486 pand mm0, mm4 // Only pav bytes < 0 in mm0
2487 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2488 psubw mm4, mm0
2489 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2490 psubw mm4, mm0
2491 psubw mm5, mm7
2492 pxor mm0, mm0
2493 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2494 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2495 psubw mm5, mm7
2496 psubw mm6, mm0
2497 // test pa <= pb
2498 movq mm7, mm4
2499 psubw mm6, mm0
2500 pcmpgtw mm7, mm5 // pa > pb?
2501 movq mm0, mm7
2502 // use mm7 mask to merge pa & pb
2503 pand mm5, mm7
2504 // use mm0 mask copy to merge a & b
2505 pand mm2, mm0
2506 pandn mm7, mm4
2507 pandn mm0, mm1
2508 paddw mm7, mm5
2509 paddw mm0, mm2
2510 // test ((pa <= pb)? pa:pb) <= pc
2511 pcmpgtw mm7, mm6 // pab > pc?
2512 pxor mm1, mm1
2513 pand mm3, mm7
2514 pandn mm7, mm0
2515 paddw mm7, mm3
2516 pxor mm0, mm0
2517 packuswb mm7, mm1
2518 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2519 pand mm7, ActiveMask
2520 movq mm2, mm3 // load b=Prior(x) step 1
2521 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2522 punpcklbw mm3, mm0 // Unpack Low bytes of c
2523 movq [edi + ebx], mm7 // write back updated value
2524 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2525 // Now do Paeth for 2nd set of bytes (3-5)
2526 psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2527 punpcklbw mm1, mm0 // Unpack Low bytes of a
2528 pxor mm7, mm7
2529 punpcklbw mm2, mm0 // Unpack Low bytes of b
2530 // pbv = p - b = (a + b - c) - b = a - c
2531 movq mm5, mm1
2532 // pav = p - a = (a + b - c) - a = b - c
2533 movq mm4, mm2
2534 psubw mm5, mm3
2535 psubw mm4, mm3
2536 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2537 // pav + pbv = pbv + pav
2538 movq mm6, mm5
2539 paddw mm6, mm4
2540
2541 // pa = abs(p-a) = abs(pav)
2542 // pb = abs(p-b) = abs(pbv)
2543 // pc = abs(p-c) = abs(pcv)
2544 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2545 pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2546 pand mm0, mm5 // Only pbv bytes < 0 in mm0
2547 pand mm7, mm4 // Only pav bytes < 0 in mm7
2548 psubw mm5, mm0
2549 psubw mm4, mm7
2550 psubw mm5, mm0
2551 psubw mm4, mm7
2552 pxor mm0, mm0
2553 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2554 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2555 psubw mm6, mm0
2556 // test pa <= pb
2557 movq mm7, mm4
2558 psubw mm6, mm0
2559 pcmpgtw mm7, mm5 // pa > pb?
2560 movq mm0, mm7
2561 // use mm7 mask to merge pa & pb
2562 pand mm5, mm7
2563 // use mm0 mask copy to merge a & b
2564 pand mm2, mm0
2565 pandn mm7, mm4
2566 pandn mm0, mm1
2567 paddw mm7, mm5
2568 paddw mm0, mm2
2569 // test ((pa <= pb)? pa:pb) <= pc
2570 pcmpgtw mm7, mm6 // pab > pc?
2571 movq mm2, [esi + ebx] // load b=Prior(x)
2572 pand mm3, mm7
2573 pandn mm7, mm0
2574 pxor mm1, mm1
2575 paddw mm7, mm3
2576 pxor mm0, mm0
2577 packuswb mm7, mm1
2578 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2579 pand mm7, ActiveMask
2580 punpckhbw mm2, mm0 // Unpack High bytes of b
2581 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2582 // pav = p - a = (a + b - c) - a = b - c
2583 movq mm4, mm2
2584 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2585 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2586 movq [edi + ebx], mm7 // write back updated value
2587 movq mm1, mm7
2588 punpckhbw mm3, mm0 // Unpack High bytes of c
2589 psllq mm1, ShiftBpp // Shift bytes
2590 // Now mm1 will be used as Raw(x-bpp)
2591 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2592 pxor mm7, mm7
2593 punpckhbw mm1, mm0 // Unpack High bytes of a
2594 psubw mm4, mm3
2595 // pbv = p - b = (a + b - c) - b = a - c
2596 movq mm5, mm1
2597 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2598 movq mm6, mm4
2599 psubw mm5, mm3
2600 pxor mm0, mm0
2601 paddw mm6, mm5
2602
2603 // pa = abs(p-a) = abs(pav)
2604 // pb = abs(p-b) = abs(pbv)
2605 // pc = abs(p-c) = abs(pcv)
2606 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2607 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2608 pand mm0, mm4 // Only pav bytes < 0 in mm0
2609 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2610 psubw mm4, mm0
2611 psubw mm5, mm7
2612 psubw mm4, mm0
2613 psubw mm5, mm7
2614 pxor mm0, mm0
2615 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2616 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2617 psubw mm6, mm0
2618 // test pa <= pb
2619 movq mm7, mm4
2620 psubw mm6, mm0
2621 pcmpgtw mm7, mm5 // pa > pb?
2622 movq mm0, mm7
2623 // use mm0 mask copy to merge a & b
2624 pand mm2, mm0
2625 // use mm7 mask to merge pa & pb
2626 pand mm5, mm7
2627 pandn mm0, mm1
2628 pandn mm7, mm4
2629 paddw mm0, mm2
2630 paddw mm7, mm5
2631 // test ((pa <= pb)? pa:pb) <= pc
2632 pcmpgtw mm7, mm6 // pab > pc?
2633 pand mm3, mm7
2634 pandn mm7, mm0
2635 paddw mm7, mm3
2636 pxor mm1, mm1
2637 packuswb mm1, mm7
2638 // Step ebx to next set of 8 bytes and repeat loop til done
2639 add ebx, 8
2640 pand mm1, ActiveMaskEnd
2641 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2642
2643 cmp ebx, MMXLength
2644 pxor mm0, mm0 // pxor does not affect flags
2645 movq [edi + ebx - 8], mm1 // write back updated value
2646 // mm1 will be used as Raw(x-bpp) next loop
2647 // mm3 ready to be used as Prior(x-bpp) next loop
2648 jb dpth3lp
2649 } // end _asm block
2650 }
2651 break;
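   // Two MMX idioms recur in each Paeth case: (1) abs() of a signed
   // word is formed by building an all-ones mask of the negative lanes
   // with pcmpgtw and then subtracting the masked value twice
   // (v - 2v == -v for the negative lanes); (2) the final a/b/c choice
   // is a branchless select, (mask & x) + (~mask & y), done with
   // pand/pandn/paddw -- paddw is safe because the two terms are
   // disjoint.  A hedged C model of the select on one lane (names are
   // illustrative only):
   //
   //    unsigned short sel(unsigned short mask, /* 0xffff or 0x0000 */
   //       unsigned short x, unsigned short y)
   //    {
   //       return (unsigned short)((mask & x) | (~mask & y));
   //    }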
2652
2653 case 6:
2654 case 7:
2655 case 5:
2656 {
2657 ActiveMask.use = 0x00000000ffffffff;
2658 ActiveMask2.use = 0xffffffff00000000;
2659 ShiftBpp.use = bpp << 3; // == bpp * 8
2660 ShiftRem.use = 64 - ShiftBpp.use;
2661 _asm
2662 {
2663 mov ebx, diff
2664 mov edi, row
2665 mov esi, prev_row
2666 // PRIME the pump (load the first Raw(x-bpp) data set)
2667 movq mm1, [edi+ebx-8]
2668 pxor mm0, mm0
2669dpth6lp:
2670 // Must shift to position Raw(x-bpp) data
2671 psrlq mm1, ShiftRem
2672 // Do first set of 4 bytes
2673 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2674 punpcklbw mm1, mm0 // Unpack Low bytes of a
2675 movq mm2, [esi + ebx] // load b=Prior(x)
2676 punpcklbw mm2, mm0 // Unpack Low bytes of b
2677 // Must shift to position Prior(x-bpp) data
2678 psrlq mm3, ShiftRem
2679 // pav = p - a = (a + b - c) - a = b - c
2680 movq mm4, mm2
2681 punpcklbw mm3, mm0 // Unpack Low bytes of c
2682 // pbv = p - b = (a + b - c) - b = a - c
2683 movq mm5, mm1
2684 psubw mm4, mm3
2685 pxor mm7, mm7
2686 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2687 movq mm6, mm4
2688 psubw mm5, mm3
2689 // pa = abs(p-a) = abs(pav)
2690 // pb = abs(p-b) = abs(pbv)
2691 // pc = abs(p-c) = abs(pcv)
2692 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2693 paddw mm6, mm5
2694 pand mm0, mm4 // Only pav bytes < 0 in mm0
2695 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2696 psubw mm4, mm0
2697 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2698 psubw mm4, mm0
2699 psubw mm5, mm7
2700 pxor mm0, mm0
2701 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2702 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2703 psubw mm5, mm7
2704 psubw mm6, mm0
2705 // test pa <= pb
2706 movq mm7, mm4
2707 psubw mm6, mm0
2708 pcmpgtw mm7, mm5 // pa > pb?
2709 movq mm0, mm7
2710 // use mm7 mask to merge pa & pb
2711 pand mm5, mm7
2712 // use mm0 mask copy to merge a & b
2713 pand mm2, mm0
2714 pandn mm7, mm4
2715 pandn mm0, mm1
2716 paddw mm7, mm5
2717 paddw mm0, mm2
2718 // test ((pa <= pb)? pa:pb) <= pc
2719 pcmpgtw mm7, mm6 // pab > pc?
2720 pxor mm1, mm1
2721 pand mm3, mm7
2722 pandn mm7, mm0
2723 paddw mm7, mm3
2724 pxor mm0, mm0
2725 packuswb mm7, mm1
2726 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2727 pand mm7, ActiveMask
2728 psrlq mm3, ShiftRem
2729 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2730 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2731 movq mm6, mm2
2732 movq [edi + ebx], mm7 // write back updated value
2733 movq mm1, [edi+ebx-8]
2734 psllq mm6, ShiftBpp
2735 movq mm5, mm7
2736 psrlq mm1, ShiftRem
2737 por mm3, mm6
2738 psllq mm5, ShiftBpp
2739 punpckhbw mm3, mm0 // Unpack High bytes of c
2740 por mm1, mm5
2741 // Do second set of 4 bytes
2742 punpckhbw mm2, mm0 // Unpack High bytes of b
2743 punpckhbw mm1, mm0 // Unpack High bytes of a
2744 // pav = p - a = (a + b - c) - a = b - c
2745 movq mm4, mm2
2746 // pbv = p - b = (a + b - c) - b = a - c
2747 movq mm5, mm1
2748 psubw mm4, mm3
2749 pxor mm7, mm7
2750 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2751 movq mm6, mm4
2752 psubw mm5, mm3
2753 // pa = abs(p-a) = abs(pav)
2754 // pb = abs(p-b) = abs(pbv)
2755 // pc = abs(p-c) = abs(pcv)
2756 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2757 paddw mm6, mm5
2758 pand mm0, mm4 // Only pav bytes < 0 in mm0
2759 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2760 psubw mm4, mm0
2761 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2762 psubw mm4, mm0
2763 psubw mm5, mm7
2764 pxor mm0, mm0
2765 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2766 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2767 psubw mm5, mm7
2768 psubw mm6, mm0
2769 // test pa <= pb
2770 movq mm7, mm4
2771 psubw mm6, mm0
2772 pcmpgtw mm7, mm5 // pa > pb?
2773 movq mm0, mm7
2774 // use mm7 mask to merge pa & pb
2775 pand mm5, mm7
2776 // use mm0 mask copy to merge a & b
2777 pand mm2, mm0
2778 pandn mm7, mm4
2779 pandn mm0, mm1
2780 paddw mm7, mm5
2781 paddw mm0, mm2
2782 // test ((pa <= pb)? pa:pb) <= pc
2783 pcmpgtw mm7, mm6 // pab > pc?
2784 pxor mm1, mm1
2785 pand mm3, mm7
2786 pandn mm7, mm0
2787 pxor mm1, mm1
2788 paddw mm7, mm3
2789 pxor mm0, mm0
2790 // Step ebx to next set of 8 bytes and repeat loop til done
2791 add ebx, 8
2792 packuswb mm1, mm7
2793 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2794 cmp ebx, MMXLength
2795 movq [edi + ebx - 8], mm1 // write back updated value
2796 // mm1 will be used as Raw(x-bpp) next loop
2797 jb dpth6lp
2798 } // end _asm block
2799 }
2800 break;
2801
2802 case 4:
2803 {
2804 ActiveMask.use = 0x00000000ffffffff;
2805 _asm {
2806 mov ebx, diff
2807 mov edi, row
2808 mov esi, prev_row
2809 pxor mm0, mm0
2810 // PRIME the pump (load the first Raw(x-bpp) data set)
2811 movq mm1, [edi+ebx-8] // Only time we should need to read
2812 // a=Raw(x-bpp) bytes
2813dpth4lp:
2814 // Do first set of 4 bytes
2815 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2816 punpckhbw mm1, mm0 // Unpack High bytes of a
2817 movq mm2, [esi + ebx] // load b=Prior(x)
2818 punpcklbw mm2, mm0 // Unpack Low bytes of b
2819 // pav = p - a = (a + b - c) - a = b - c
2820 movq mm4, mm2
2821 punpckhbw mm3, mm0 // Unpack High bytes of c
2822 // pbv = p - b = (a + b - c) - b = a - c
2823 movq mm5, mm1
2824 psubw mm4, mm3
2825 pxor mm7, mm7
2826 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2827 movq mm6, mm4
2828 psubw mm5, mm3
2829 // pa = abs(p-a) = abs(pav)
2830 // pb = abs(p-b) = abs(pbv)
2831 // pc = abs(p-c) = abs(pcv)
2832 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2833 paddw mm6, mm5
2834 pand mm0, mm4 // Only pav bytes < 0 in mm0
2835 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2836 psubw mm4, mm0
2837 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2838 psubw mm4, mm0
2839 psubw mm5, mm7
2840 pxor mm0, mm0
2841 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2842 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2843 psubw mm5, mm7
2844 psubw mm6, mm0
2845 // test pa <= pb
2846 movq mm7, mm4
2847 psubw mm6, mm0
2848 pcmpgtw mm7, mm5 // pa > pb?
2849 movq mm0, mm7
2850 // use mm7 mask to merge pa & pb
2851 pand mm5, mm7
2852 // use mm0 mask copy to merge a & b
2853 pand mm2, mm0
2854 pandn mm7, mm4
2855 pandn mm0, mm1
2856 paddw mm7, mm5
2857 paddw mm0, mm2
2858 // test ((pa <= pb)? pa:pb) <= pc
2859 pcmpgtw mm7, mm6 // pab > pc?
2860 pxor mm1, mm1
2861 pand mm3, mm7
2862 pandn mm7, mm0
2863 paddw mm7, mm3
2864 pxor mm0, mm0
2865 packuswb mm7, mm1
2866 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2867 pand mm7, ActiveMask
2868 movq mm2, mm3 // load b=Prior(x) step 1
2869 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2870 punpcklbw mm3, mm0 // Unpack Low bytes of c
2871 movq [edi + ebx], mm7 // write back updated value
2872 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2873 // Do second set of 4 bytes
2874 punpckhbw mm2, mm0 // Unpack High bytes of b
2875 punpcklbw mm1, mm0 // Unpack Low bytes of a
2876 // pav = p - a = (a + b - c) - a = b - c
2877 movq mm4, mm2
2878 // pbv = p - b = (a + b - c) - b = a - c
2879 movq mm5, mm1
2880 psubw mm4, mm3
2881 pxor mm7, mm7
2882 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2883 movq mm6, mm4
2884 psubw mm5, mm3
2885 // pa = abs(p-a) = abs(pav)
2886 // pb = abs(p-b) = abs(pbv)
2887 // pc = abs(p-c) = abs(pcv)
2888 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2889 paddw mm6, mm5
2890 pand mm0, mm4 // Only pav bytes < 0 in mm0
2891 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2892 psubw mm4, mm0
2893 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2894 psubw mm4, mm0
2895 psubw mm5, mm7
2896 pxor mm0, mm0
2897 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2898 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2899 psubw mm5, mm7
2900 psubw mm6, mm0
2901 // test pa <= pb
2902 movq mm7, mm4
2903 psubw mm6, mm0
2904 pcmpgtw mm7, mm5 // pa > pb?
2905 movq mm0, mm7
2906 // use mm7 mask to merge pa & pb
2907 pand mm5, mm7
2908 // use mm0 mask copy to merge a & b
2909 pand mm2, mm0
2910 pandn mm7, mm4
2911 pandn mm0, mm1
2912 paddw mm7, mm5
2913 paddw mm0, mm2
2914 // test ((pa <= pb)? pa:pb) <= pc
2915 pcmpgtw mm7, mm6 // pab > pc?
2916 pxor mm1, mm1
2917 pand mm3, mm7
2918 pandn mm7, mm0
2919 pxor mm1, mm1
2920 paddw mm7, mm3
2921 pxor mm0, mm0
2922 // Step ebx to next set of 8 bytes and repeat loop til done
2923 add ebx, 8
2924 packuswb mm1, mm7
2925 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2926 cmp ebx, MMXLength
2927 movq [edi + ebx - 8], mm1 // write back updated value
2928 // mm1 will be used as Raw(x-bpp) next loop
2929 jb dpth4lp
2930 } // end _asm block
2931 }
2932 break;
2933 case 8: // bpp == 8
2934 {
2935 ActiveMask.use = 0x00000000ffffffff;
2936 _asm {
2937 mov ebx, diff
2938 mov edi, row
2939 mov esi, prev_row
2940 pxor mm0, mm0
2941 // PRIME the pump (load the first Raw(x-bpp) data set)
2942 movq mm1, [edi+ebx-8] // Only time we should need to read
2943 // a=Raw(x-bpp) bytes
2944dpth8lp:
2945 // Do first set of 4 bytes
2946 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2947 punpcklbw mm1, mm0 // Unpack Low bytes of a
2948 movq mm2, [esi + ebx] // load b=Prior(x)
2949 punpcklbw mm2, mm0 // Unpack Low bytes of b
2950 // pav = p - a = (a + b - c) - a = b - c
2951 movq mm4, mm2
2952 punpcklbw mm3, mm0 // Unpack Low bytes of c
2953 // pbv = p - b = (a + b - c) - b = a - c
2954 movq mm5, mm1
2955 psubw mm4, mm3
2956 pxor mm7, mm7
2957 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2958 movq mm6, mm4
2959 psubw mm5, mm3
2960 // pa = abs(p-a) = abs(pav)
2961 // pb = abs(p-b) = abs(pbv)
2962 // pc = abs(p-c) = abs(pcv)
2963 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2964 paddw mm6, mm5
2965 pand mm0, mm4 // Only pav bytes < 0 in mm0
2966 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2967 psubw mm4, mm0
2968 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2969 psubw mm4, mm0
2970 psubw mm5, mm7
2971 pxor mm0, mm0
2972 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2973 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2974 psubw mm5, mm7
2975 psubw mm6, mm0
2976 // test pa <= pb
2977 movq mm7, mm4
2978 psubw mm6, mm0
2979 pcmpgtw mm7, mm5 // pa > pb?
2980 movq mm0, mm7
2981 // use mm7 mask to merge pa & pb
2982 pand mm5, mm7
2983 // use mm0 mask copy to merge a & b
2984 pand mm2, mm0
2985 pandn mm7, mm4
2986 pandn mm0, mm1
2987 paddw mm7, mm5
2988 paddw mm0, mm2
2989 // test ((pa <= pb)? pa:pb) <= pc
2990 pcmpgtw mm7, mm6 // pab > pc?
2991 pxor mm1, mm1
2992 pand mm3, mm7
2993 pandn mm7, mm0
2994 paddw mm7, mm3
2995 pxor mm0, mm0
2996 packuswb mm7, mm1
2997 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2998 pand mm7, ActiveMask
2999 movq mm2, [esi + ebx] // load b=Prior(x)
3000 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
3001 punpckhbw mm3, mm0 // Unpack High bytes of c
3002 movq [edi + ebx], mm7 // write back updated value
3003 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
3004
3005 // Do second set of 4 bytes
3006 punpckhbw mm2, mm0 // Unpack High bytes of b
3007 punpckhbw mm1, mm0 // Unpack High bytes of a
3008 // pav = p - a = (a + b - c) - a = b - c
3009 movq mm4, mm2
3010 // pbv = p - b = (a + b - c) - b = a - c
3011 movq mm5, mm1
3012 psubw mm4, mm3
3013 pxor mm7, mm7
3014 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3015 movq mm6, mm4
3016 psubw mm5, mm3
3017 // pa = abs(p-a) = abs(pav)
3018 // pb = abs(p-b) = abs(pbv)
3019 // pc = abs(p-c) = abs(pcv)
3020 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3021 paddw mm6, mm5
3022 pand mm0, mm4 // Only pav bytes < 0 in mm0
3023 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3024 psubw mm4, mm0
3025 pand mm7, mm5 // Only pbv bytes < 0 in mm7
3026 psubw mm4, mm0
3027 psubw mm5, mm7
3028 pxor mm0, mm0
3029 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3030 pand mm0, mm6 // Only pcv bytes < 0 in mm0
3031 psubw mm5, mm7
3032 psubw mm6, mm0
3033 // test pa <= pb
3034 movq mm7, mm4
3035 psubw mm6, mm0
3036 pcmpgtw mm7, mm5 // pa > pb?
3037 movq mm0, mm7
3038 // use mm7 mask to merge pa & pb
3039 pand mm5, mm7
3040 // use mm0 mask copy to merge a & b
3041 pand mm2, mm0
3042 pandn mm7, mm4
3043 pandn mm0, mm1
3044 paddw mm7, mm5
3045 paddw mm0, mm2
3046 // test ((pa <= pb)? pa:pb) <= pc
3047 pcmpgtw mm7, mm6 // pab > pc?
3048 pxor mm1, mm1
3049 pand mm3, mm7
3050 pandn mm7, mm0
3051 pxor mm1, mm1
3052 paddw mm7, mm3
3053 pxor mm0, mm0
3054 // Step ebx to next set of 8 bytes and repeat loop til done
3055 add ebx, 8
3056 packuswb mm1, mm7
3057 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3058 cmp ebx, MMXLength
3059 movq [edi + ebx - 8], mm1 // write back updated value
3060 // mm1 will be used as Raw(x-bpp) next loop
3061 jb dpth8lp
3062 } // end _asm block
3063 }
3064 break;
3065
3066 case 1: // bpp = 1
3067 case 2: // bpp = 2
3068 default: // bpp > 8
3069 {
3070 _asm {
3071 mov ebx, diff
3072 cmp ebx, FullLength
3073 jnb dpthdend
3074 mov edi, row
3075 mov esi, prev_row
3076 // Do Paeth decode for remaining bytes
3077 mov edx, ebx
3078 xor ecx, ecx // zero ecx before using cl & cx in loop below
3079 sub edx, bpp // Set edx = ebx - bpp
3080dpthdlp:
3081 xor eax, eax
3082 // pav = p - a = (a + b - c) - a = b - c
3083 mov al, [esi + ebx] // load Prior(x) into al
3084 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3085 sub eax, ecx // subtract Prior(x-bpp)
3086 mov patemp, eax // Save pav for later use
3087 xor eax, eax
3088 // pbv = p - b = (a + b - c) - b = a - c
3089 mov al, [edi + edx] // load Raw(x-bpp) into al
3090 sub eax, ecx // subtract Prior(x-bpp)
3091 mov ecx, eax
3092 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3093 add eax, patemp // pcv = pav + pbv
3094 // pc = abs(pcv)
3095 test eax, 0x80000000
3096 jz dpthdpca
3097 neg eax // reverse sign of neg values
3098dpthdpca:
3099 mov pctemp, eax // save pc for later use
3100 // pb = abs(pbv)
3101 test ecx, 0x80000000
3102 jz dpthdpba
3103 neg ecx // reverse sign of neg values
3104dpthdpba:
3105 mov pbtemp, ecx // save pb for later use
3106 // pa = abs(pav)
3107 mov eax, patemp
3108 test eax, 0x80000000
3109 jz dpthdpaa
3110 neg eax // reverse sign of neg values
3111dpthdpaa:
3112 mov patemp, eax // save pa for later use
3113 // test if pa <= pb
3114 cmp eax, ecx
3115 jna dpthdabb
3116 // pa > pb; now test if pb <= pc
3117 cmp ecx, pctemp
3118 jna dpthdbbc
3119 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3120 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3121 jmp dpthdpaeth
3122dpthdbbc:
3123 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3124 mov cl, [esi + ebx] // load Prior(x) into cl
3125 jmp dpthdpaeth
3126dpthdabb:
3127 // pa <= pb; now test if pa <= pc
3128 cmp eax, pctemp
3129 jna dpthdabc
3130 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3131 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3132 jmp dpthdpaeth
3133dpthdabc:
3134 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3135 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3136dpthdpaeth:
3137 inc ebx
3138 inc edx
3139 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3140 add [edi + ebx - 1], cl
3141 cmp ebx, FullLength
3142 jb dpthdlp
3143dpthdend:
3144 } // end _asm block
3145 }
3146 return; // No need to go further with this one
3147 } // end switch ( bpp )
3148 _asm
3149 {
3150 // MMX acceleration complete now do clean-up
3151 // Check if any remaining bytes left to decode
3152 mov ebx, MMXLength
3153 cmp ebx, FullLength
3154 jnb dpthend
3155 mov edi, row
3156 mov esi, prev_row
3157 // Do Paeth decode for remaining bytes
3158 mov edx, ebx
3159 xor ecx, ecx // zero ecx before using cl & cx in loop below
3160 sub edx, bpp // Set edx = ebx - bpp
3161dpthlp2:
3162 xor eax, eax
3163 // pav = p - a = (a + b - c) - a = b - c
3164 mov al, [esi + ebx] // load Prior(x) into al
3165 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3166 sub eax, ecx // subtract Prior(x-bpp)
3167 mov patemp, eax // Save pav for later use
3168 xor eax, eax
3169 // pbv = p - b = (a + b - c) - b = a - c
3170 mov al, [edi + edx] // load Raw(x-bpp) into al
3171 sub eax, ecx // subtract Prior(x-bpp)
3172 mov ecx, eax
3173 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3174 add eax, patemp // pcv = pav + pbv
3175 // pc = abs(pcv)
3176 test eax, 0x80000000
3177 jz dpthpca2
3178 neg eax // reverse sign of neg values
3179dpthpca2:
3180 mov pctemp, eax // save pc for later use
3181 // pb = abs(pbv)
3182 test ecx, 0x80000000
3183 jz dpthpba2
3184 neg ecx // reverse sign of neg values
3185dpthpba2:
3186 mov pbtemp, ecx // save pb for later use
3187 // pa = abs(pav)
3188 mov eax, patemp
3189 test eax, 0x80000000
3190 jz dpthpaa2
3191 neg eax // reverse sign of neg values
3192dpthpaa2:
3193 mov patemp, eax // save pa for later use
3194 // test if pa <= pb
3195 cmp eax, ecx
3196 jna dpthabb2
3197 // pa > pb; now test if pb <= pc
3198 cmp ecx, pctemp
3199 jna dpthbbc2
3200 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3201 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3202 jmp dpthpaeth2
3203dpthbbc2:
3204 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3205 mov cl, [esi + ebx] // load Prior(x) into cl
3206 jmp dpthpaeth2
3207dpthabb2:
3208 // pa <= pb; now test if pa <= pc
3209 cmp eax, pctemp
3210 jna dpthabc2
3211 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3212 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3213 jmp dpthpaeth2
3214dpthabc2:
3215 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3216 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3217dpthpaeth2:
3218 inc ebx
3219 inc edx
3220 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3221 add [edi + ebx - 1], cl
3222 cmp ebx, FullLength
3223 jb dpthlp2
3224dpthend:
3225 emms // End MMX instructions; prep for possible FP instrs.
3226 } // end _asm block
3227}
3228
3229// Optimized code for PNG Sub filter decoder
3230void
3231png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3232{
3233 //int test;
3234 int bpp;
3235 png_uint_32 FullLength;
3236 png_uint_32 MMXLength;
3237 int diff;
3238
3239 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3240 FullLength = row_info->rowbytes - bpp; // # of bytes to filter
3241 _asm {
3242 mov edi, row
3243 mov esi, edi // lp = row
3244 add edi, bpp // rp = row + bpp
3245 xor eax, eax
3246 // get # of bytes to alignment
3247 mov diff, edi // take start of row
3248 add diff, 0xf // add 7 + 8 to incr past
3249 // alignment boundary
3250 xor ebx, ebx
3251 and diff, 0xfffffff8 // mask to alignment boundary
3252 sub diff, edi // subtract from start ==> value
3253 // ebx at alignment
3254 jz dsubgo
3255 // fix alignment
3256dsublp1:
3257 mov al, [esi+ebx]
3258 add [edi+ebx], al
3259 inc ebx
3260 cmp ebx, diff
3261 jb dsublp1
3262dsubgo:
3263 mov ecx, FullLength
3264 mov edx, ecx
3265 sub edx, ebx // subtract alignment fix
3266 and edx, 0x00000007 // calc bytes over mult of 8
3267 sub ecx, edx // drop over bytes from length
3268 mov MMXLength, ecx
3269 } // end _asm block
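   // In C terms the block above computes (pointer arithmetic shown for
   // illustration only):
   //
   //    diff = (int)((((png_size_t)(row + bpp)) + 0xf) & ~7) -
   //           (int)((png_size_t)(row + bpp));
   //
   // i.e. 8 to 15 bytes: round up to an 8-byte boundary (+7) plus one
   // extra quadword (+8), apparently so the primed movq [edi+ebx-8]
   // never reads before the start of the row.  MMXLength is then the
   // largest multiple-of-8 span beyond that offset.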
3270
3271 // Now do the math for the rest of the row
3272 switch ( bpp )
3273 {
3274 case 3:
3275 {
3276 ActiveMask.use = 0x0000ffffff000000;
3277 ShiftBpp.use = 24; // == 3 * 8
3278 ShiftRem.use = 40; // == 64 - 24
3279 _asm {
3280 mov edi, row
3281 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3282 mov esi, edi // lp = row
3283 add edi, bpp // rp = row + bpp
3284 movq mm6, mm7
3285 mov ebx, diff
3286 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3287 // byte group
3288 // PRIME the pump (load the first Raw(x-bpp) data set)
3289 movq mm1, [edi+ebx-8]
3290dsub3lp:
3291 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3292 // no need for mask; shift clears inactive bytes
3293 // Add 1st active group
3294 movq mm0, [edi+ebx]
3295 paddb mm0, mm1
3296 // Add 2nd active group
3297 movq mm1, mm0 // mov updated Raws to mm1
3298 psllq mm1, ShiftBpp // shift data to position correctly
3299 pand mm1, mm7 // mask to use only 2nd active group
3300 paddb mm0, mm1
3301 // Add 3rd active group
3302 movq mm1, mm0 // mov updated Raws to mm1
3303 psllq mm1, ShiftBpp // shift data to position correctly
3304 pand mm1, mm6 // mask to use only 3rd active group
3305 add ebx, 8
3306 paddb mm0, mm1
3307 cmp ebx, MMXLength
3308 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3309 // Prep for doing 1st add at top of loop
3310 movq mm1, mm0
3311 jb dsub3lp
3312 } // end _asm block
3313 }
3314 break;
3315
3316 case 1:
3317 {
3318 // Kept here for reference, as it duplicates the
3319 // non-MMX code for the SUB filter in png_read_filter_row below
3320 //
3321 // png_bytep rp;
3322 // png_bytep lp;
3323 // png_uint_32 i;
3324 // bpp = (row_info->pixel_depth + 7) >> 3;
3325 // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3326 // i < row_info->rowbytes; i++, rp++, lp++)
3327 // {
3328 // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3329 // }
3330 _asm {
3331 mov ebx, diff
3332 mov edi, row
3333 cmp ebx, FullLength
3334 jnb dsub1end
3335 mov esi, edi // lp = row
3336 xor eax, eax
3337 add edi, bpp // rp = row + bpp
3338dsub1lp:
3339 mov al, [esi+ebx]
3340 add [edi+ebx], al
3341 inc ebx
3342 cmp ebx, FullLength
3343 jb dsub1lp
3344dsub1end:
3345 } // end _asm block
3346 }
3347 return;
3348
3349 case 6:
3350 case 7:
3351 case 4:
3352 case 5:
3353 {
3354 ShiftBpp.use = bpp << 3;
3355 ShiftRem.use = 64 - ShiftBpp.use;
3356 _asm {
3357 mov edi, row
3358 mov ebx, diff
3359 mov esi, edi // lp = row
3360 add edi, bpp // rp = row + bpp
3361 // PRIME the pump (load the first Raw(x-bpp) data set)
3362 movq mm1, [edi+ebx-8]
3363dsub4lp:
3364 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3365 // no need for mask; shift clears inactive bytes
3366 movq mm0, [edi+ebx]
3367 paddb mm0, mm1
3368 // Add 2nd active group
3369 movq mm1, mm0 // mov updated Raws to mm1
3370 psllq mm1, ShiftBpp // shift data to position correctly
3371 // there is no need for any mask
3372 // since shift clears inactive bits/bytes
3373 add ebx, 8
3374 paddb mm0, mm1
3375 cmp ebx, MMXLength
3376 movq [edi+ebx-8], mm0
3377 movq mm1, mm0 // Prep for doing 1st add at top of loop
3378 jb dsub4lp
3379 } // end _asm block
3380 }
3381 break;
3382
3383 case 2:
3384 {
3385 ActiveMask.use = 0x00000000ffff0000;
3386 ShiftBpp.use = 16; // == 2 * 8
3387 ShiftRem.use = 48; // == 64 - 16
3388 _asm {
3389 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3390 mov ebx, diff
3391 movq mm6, mm7
3392 mov edi, row
3393 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3394 // byte group
3395 mov esi, edi // lp = row
3396 movq mm5, mm6
3397 add edi, bpp // rp = row + bpp
3398 psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
3399 // byte group
3400 // PRIME the pump (load the first Raw(x-bpp) data set)
3401 movq mm1, [edi+ebx-8]
3402dsub2lp:
3403 // Add 1st active group
3404 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3405 // no need for mask; shift clears inactive
3406 // bytes
3407 movq mm0, [edi+ebx]
3408 paddb mm0, mm1
3409 // Add 2nd active group
3410 movq mm1, mm0 // mov updated Raws to mm1
3411 psllq mm1, ShiftBpp // shift data to position correctly
3412 pand mm1, mm7 // mask to use only 2nd active group
3413 paddb mm0, mm1
3414 // Add 3rd active group
3415 movq mm1, mm0 // mov updated Raws to mm1
3416 psllq mm1, ShiftBpp // shift data to position correctly
3417 pand mm1, mm6 // mask to use only 3rd active group
3418 paddb mm0, mm1
3419 // Add 4th active group
3420 movq mm1, mm0 // mov updated Raws to mm1
3421 psllq mm1, ShiftBpp // shift data to position correctly
3422 pand mm1, mm5 // mask to use only 4th active group
3423 add ebx, 8
3424 paddb mm0, mm1
3425 cmp ebx, MMXLength
3426 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3427 movq mm1, mm0 // Prep for doing 1st add at top of loop
3428 jb dsub2lp
3429 } // end _asm block
3430 }
3431 break;
3432 case 8:
3433 {
3434 _asm {
3435 mov edi, row
3436 mov ebx, diff
3437 mov esi, edi // lp = row
3438 add edi, bpp // rp = row + bpp
3439 mov ecx, MMXLength
3440 movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3441 // Raw(x-bpp) data set)
3442 mov edx, ecx
 sub edx, ebx // subtract alignment fix
 and edx, 0x0000003f // calc bytes over mult of 64
 sub ecx, edx // drop over bytes from length
3443dsub8lp:
3444 movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3445 paddb mm0, mm7
3446 movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3447 movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3448 // Now mm0 will be used as Raw(x-bpp) for
3449 // the 2nd group of 8 bytes. This will be
3450 // repeated for each group of 8 bytes with
3451 // the 8th group being used as the Raw(x-bpp)
3452 // for the 1st group of the next loop.
3453 paddb mm1, mm0
3454 movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3455 movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3456 paddb mm2, mm1
3457 movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3458 movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3459 paddb mm3, mm2
3460 movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3461 movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3462 paddb mm4, mm3
3463 movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3464 movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3465 paddb mm5, mm4
3466 movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3467 movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3468 paddb mm6, mm5
3469 movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3470 movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3471 add ebx, 64
3472 paddb mm7, mm6
3473 cmp ebx, ecx
3474 movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3475 jb dsub8lp
3476 cmp ebx, MMXLength
3477 jnb dsub8lt8
3478dsub8lpA:
3479 movq mm0, [edi+ebx]
3480 add ebx, 8
3481 paddb mm0, mm7
3482 cmp ebx, MMXLength
3483 movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
3484 movq mm7, mm0 // Move calculated Raw(x) data to mm7 to
3485 // be the new Raw(x-bpp) for the next loop
3486 jb dsub8lpA
3487dsub8lt8:
3488 } // end _asm block
3489 }
3490 break;
3491
3492 default: // bpp greater than 8 bytes
3493 {
3494 _asm {
3495 mov ebx, diff
3496 mov edi, row
3497 mov esi, edi // lp = row
3498 add edi, bpp // rp = row + bpp
3499dsubAlp:
3500 movq mm0, [edi+ebx]
3501 movq mm1, [esi+ebx]
3502 add ebx, 8
3503 paddb mm0, mm1
3504 cmp ebx, MMXLength
3505 movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
3506 // add ebx
3507 jb dsubAlp
3508 } // end _asm block
3509 }
3510 break;
3511
3512 } // end switch ( bpp )
3513
3514 _asm {
3515 mov ebx, MMXLength
3516 mov edi, row
3517 cmp ebx, FullLength
3518 jnb dsubend
3519 mov esi, edi // lp = row
3520 xor eax, eax
3521 add edi, bpp // rp = row + bpp
3522dsublp2:
3523 mov al, [esi+ebx]
3524 add [edi+ebx], al
3525 inc ebx
3526 cmp ebx, FullLength
3527 jb dsublp2
3528dsubend:
3529 emms // End MMX instructions; prep for possible FP instrs.
3530 } // end _asm block
3531}
3532
3533// Optimized code for PNG Up filter decoder
3534void
3535png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3536 png_bytep prev_row)
3537{
3538 png_uint_32 len;
3539 len = row_info->rowbytes; // # of bytes to filter
3540 _asm {
3541 mov edi, row
3542 // get # of bytes to alignment
3543 mov ecx, edi
3544 xor ebx, ebx
3545 add ecx, 0x7
3546 xor eax, eax
3547 and ecx, 0xfffffff8
3548 mov esi, prev_row
3549 sub ecx, edi
3550 jz dupgo
3551 // fix alignment
3552duplp1:
3553 mov al, [edi+ebx]
3554 add al, [esi+ebx]
3555 inc ebx
3556 cmp ebx, ecx
3557 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3558 jb duplp1
3559dupgo:
3560 mov ecx, len
3561 mov edx, ecx
3562 sub edx, ebx // subtract alignment fix
3563 and edx, 0x0000003f // calc bytes over mult of 64
3564 sub ecx, edx // drop over bytes from length
3565 // Unrolled loop - use all MMX registers and interleave to reduce
3566 // number of branch instructions (loops) and reduce partial stalls
3567duploop:
3568 movq mm1, [esi+ebx]
3569 movq mm0, [edi+ebx]
3570 movq mm3, [esi+ebx+8]
3571 paddb mm0, mm1
3572 movq mm2, [edi+ebx+8]
3573 movq [edi+ebx], mm0
3574 paddb mm2, mm3
3575 movq mm5, [esi+ebx+16]
3576 movq [edi+ebx+8], mm2
3577 movq mm4, [edi+ebx+16]
3578 movq mm7, [esi+ebx+24]
3579 paddb mm4, mm5
3580 movq mm6, [edi+ebx+24]
3581 movq [edi+ebx+16], mm4
3582 paddb mm6, mm7
3583 movq mm1, [esi+ebx+32]
3584 movq [edi+ebx+24], mm6
3585 movq mm0, [edi+ebx+32]
3586 movq mm3, [esi+ebx+40]
3587 paddb mm0, mm1
3588 movq mm2, [edi+ebx+40]
3589 movq [edi+ebx+32], mm0
3590 paddb mm2, mm3
3591 movq mm5, [esi+ebx+48]
3592 movq [edi+ebx+40], mm2
3593 movq mm4, [edi+ebx+48]
3594 movq mm7, [esi+ebx+56]
3595 paddb mm4, mm5
3596 movq mm6, [edi+ebx+56]
3597 movq [edi+ebx+48], mm4
3598 add ebx, 64
3599 paddb mm6, mm7
3600 cmp ebx, ecx
3601 movq [edi+ebx-8], mm6 // (+56) movq does not affect flags;
3602 // -8 to offset add ebx
3603 jb duploop
3604
3605 cmp edx, 0 // Test for bytes over mult of 64
3606 jz dupend
3607
3608
3609 // 2 lines added by lcreeve@netins.net
3610 // (mail 11 Jul 98 in png-implement list)
3611 cmp edx, 8 //test for less than 8 bytes
3612 jb duplt8
3613
3614
3615 add ecx, edx
3616 and edx, 0x00000007 // calc bytes over mult of 8
3617 sub ecx, edx // drop over bytes from length
3618 jz duplt8
3619 // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3620duplpA:
3621 movq mm1, [esi+ebx]
3622 movq mm0, [edi+ebx]
3623 add ebx, 8
3624 paddb mm0, mm1
3625 cmp ebx, ecx
3626 movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3627 jb duplpA
3628 cmp edx, 0 // Test for bytes over mult of 8
3629 jz dupend
3630duplt8:
3631 xor eax, eax
3632 add ecx, edx // move over byte count into counter
3633 // Loop using x86 registers to update remaining bytes
3634duplp2:
3635 mov al, [edi + ebx]
3636 add al, [esi + ebx]
3637 inc ebx
3638 cmp ebx, ecx
3639 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3640 jb duplp2
3641dupend:
3642 // Conversion of filtered row completed
3643 emms // End MMX instructions; prep for possible FP instrs.
3644 } // end _asm block
3645}
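// The Up decoder above runs in three stages: a 64-byte unrolled MMX
// loop, an 8-byte MMX loop over the remaining multiple of 8, and a
// scalar byte loop for the tail.  A hedged C outline of that staging
// (names are illustrative only; i is the offset left by the scalar
// alignment loop):
//
//    n64 = len - ((len - i) & 0x3f);
//    for (; i < n64; i += 64) { /* eight interleaved paddb's */ }
//    n8 = len - ((len - i) & 0x07);
//    for (; i < n8; i += 8)   { /* one paddb */ }
//    for (; i < len; i++)
//       row[i] = (png_byte)(row[i] + prev_row[i]);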
3646
3647
3648// Optimized png_read_filter_row routines
3649void
3650png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3651 row, png_bytep prev_row, int filter)
3652{
3653#ifdef PNG_DEBUG
3654 char filnm[6];
3655#endif
3656 #define UseMMX 1
3657
3658 if (mmx_supported == 2)
3659 mmx_supported = mmxsupport();
3660
3661 if (!mmx_supported)
3662 {
3663 png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter);
3664 return ;
3665 }
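   // Note: even when MMX is available, each filter case below falls
   // back to the C code for shallow pixels (pixel_depth <= 8) or short
   // rows (rowbytes < 128), where the MMX setup cost presumably
   // outweighs any gain.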
3666
3667#ifdef PNG_DEBUG
3668 png_debug(1, "in png_read_filter_row\n");
3669#if (UseMMX == 1)
3670 png_debug1(0,"%s, ", "MMX");
3671#else
3672 png_debug1(0,"%s, ", "x86");
3673#endif
3674 switch (filter)
3675 {
3676 case 0: sprintf(filnm, "None ");
3677 break;
3678 case 1: sprintf(filnm, "Sub ");
3679 break;
3680 case 2: sprintf(filnm, "Up ");
3681 break;
3682 case 3: sprintf(filnm, "Avg ");
3683 break;
3684 case 4: sprintf(filnm, "Paeth");
3685 break;
3686 default: sprintf(filnm, "Unknw");
3687 break;
3688 }
3689 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3690 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3691 (int)((row_info->pixel_depth + 7) >> 3));
3692 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3693#endif
3694
3695 switch (filter)
3696 {
3697 case PNG_FILTER_VALUE_NONE:
3698 break;
3699 case PNG_FILTER_VALUE_SUB:
3700 {
3701#if (UseMMX == 1)
3702 if ((row_info->pixel_depth > 8) &&
3703 (row_info->rowbytes >= 128) )
3704 {
3705 png_read_filter_row_mmx_sub(row_info, row);
3706 }
3707 else
3708#endif
3709 {
3710 png_uint_32 i;
3711 png_uint_32 istop = row_info->rowbytes;
3712 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3713 png_bytep rp = row + bpp;
3714 png_bytep lp = row;
3715
3716 for (i = bpp; i < istop; i++)
3717 {
3718 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3719 rp++;
3720 }
3721 } //end !UseMMX
3722 break;
3723 }
3724 case PNG_FILTER_VALUE_UP:
3725 {
3726#if (UseMMX == 1)
3727 if ((row_info->pixel_depth > 8) &&
3728 (row_info->rowbytes >= 128) )
3729 {
3730 png_read_filter_row_mmx_up(row_info, row, prev_row);
3731 } //end if UseMMX
3732 else
3733#endif
3734 {
3735 png_bytep rp;
3736 png_bytep pp;
3737 png_uint_32 i;
3738 for (i = 0, rp = row, pp = prev_row;
3739 i < row_info->rowbytes; i++, rp++, pp++)
3740 {
3741 *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
3742 }
3743 } //end !UseMMX
3744 break;
3745 }
3746 case PNG_FILTER_VALUE_AVG:
3747 {
3748#if (UseMMX == 1)
3749 if ((row_info->pixel_depth > 8) &&
3750 (row_info->rowbytes >= 128) )
3751 {
3752 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3753 } //end if UseMMX
3754 else
3755#endif
3756 {
3757 png_uint_32 i;
3758 png_bytep rp = row;
3759 png_bytep pp = prev_row;
3760 png_bytep lp = row;
3761 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3762 png_uint_32 istop = row_info->rowbytes - bpp;
3763
3764 for (i = 0; i < bpp; i++)
3765 {
3766 *rp = (png_byte)(((int)(*rp) +
3767 ((int)(*pp++) >> 1)) & 0xff);
3768 rp++;
3769 }
3770
3771 for (i = 0; i < istop; i++)
3772 {
3773 *rp = (png_byte)(((int)(*rp) +
3774 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3775 rp++;
3776 }
3777 } //end !UseMMX
3778 break;
3779 }
3780 case PNG_FILTER_VALUE_PAETH:
3781 {
3782#if (UseMMX == 1)
3783 if ((row_info->pixel_depth > 8) &&
3784 (row_info->rowbytes >= 128) )
3785 {
3786 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3787 } //end if UseMMX
3788 else
3789#endif
3790 {
3791 png_uint_32 i;
3792 png_bytep rp = row;
3793 png_bytep pp = prev_row;
3794 png_bytep lp = row;
3795 png_bytep cp = prev_row;
3796 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3797 png_uint_32 istop=row_info->rowbytes - bpp;
3798
3799 for (i = 0; i < bpp; i++)
3800 {
3801 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3802 rp++;
3803 }
3804
3805 for (i = 0; i < istop; i++) // use leftover rp,pp
3806 {
3807 int a, b, c, pa, pb, pc, p;
3808
3809 a = *lp++;
3810 b = *pp++;
3811 c = *cp++;
3812
3813 p = b - c;
3814 pc = a - c;
3815
3816#ifdef PNG_USE_ABS
3817 pa = abs(p);
3818 pb = abs(pc);
3819 pc = abs(p + pc);
3820#else
3821 pa = p < 0 ? -p : p;
3822 pb = pc < 0 ? -pc : pc;
3823 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3824#endif
3825
3826 /*
3827 if (pa <= pb && pa <= pc)
3828 p = a;
3829 else if (pb <= pc)
3830 p = b;
3831 else
3832 p = c;
3833 */
3834
3835 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3836
3837 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3838 rp++;
3839 }
3840 } //end !UseMMX
3841 break;
3842 }
3843 default:
3844 png_error(png_ptr, "Bad adaptive filter type");
3845 break;
3846 }
3847}
3848#endif