blob: 639c37841402786e728d8295fb56b50ec15009c1 [file] [log] [blame]
/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * libpng 1.0.8beta3 - July 11, 2000
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 */

15#define PNG_INTERNAL
16#include "png.h"
17
18#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
19
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -050020/*
21 One of these might need to be defined.
22#define DISABLE_PNGVCRD_COMBINE
23#define DISABLE_PNGVCRD_INTERLACE
24*/
25
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -050026static int mmx_supported=2;
27
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -050028void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -050029png_read_filter_row_c(png_structp png_ptr, png_row_infop row_info,
30 png_bytep row, png_bytep prev_row, int filter);
31
32static int mmxsupport()
33{
34 int mmx_supported_local = 0;
35 _asm {
Glenn Randers-Pehrson61c32d92000-02-04 23:40:16 -060036 push ebx //CPUID will trash these
37 push ecx
38 push edx
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -050039 pushfd //Save Eflag to stack
40 pop eax //Get Eflag from stack into eax
41 mov ecx, eax //Make another copy of Eflag in ecx
42 xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
43 push eax //Save modified Eflag back to stack
44
45 popfd //Restored modified value back to Eflag reg
46 pushfd //Save Eflag to stack
47 pop eax //Get Eflag from stack
48 xor eax, ecx //Compare the new Eflag with the original Eflag
49 jz NOT_SUPPORTED //If the same, CPUID instruction is not supported,
50 //skip following instructions and jump to
51 //NOT_SUPPORTED label
52
53 xor eax, eax //Set eax to zero
54
55 _asm _emit 0x0f //CPUID instruction (two bytes opcode)
56 _asm _emit 0xa2
57
58 cmp eax, 1 //make sure eax return non-zero value
59 jl NOT_SUPPORTED //If eax is zero, mmx not supported
60
61 xor eax, eax //set eax to zero
62 inc eax //Now increment eax to 1. This instruction is
63 //faster than the instruction "mov eax, 1"
64
65 _asm _emit 0x0f //CPUID instruction
66 _asm _emit 0xa2
67
68 and edx, 0x00800000 //mask out all bits but mmx bit(24)
69 cmp edx, 0 // 0 = mmx not supported
70 jz NOT_SUPPORTED // non-zero = Yes, mmx IS supported
71
72 mov mmx_supported_local, 1 //set return value to 1
73
74NOT_SUPPORTED:
75 mov eax, mmx_supported_local //move return value to eax
Glenn Randers-Pehrson61c32d92000-02-04 23:40:16 -060076 pop edx //CPUID trashed these
77 pop ecx
78 pop ebx
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -050079 }
80
81 //mmx_supported_local=0; // test code for force don't support MMX
82 //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
83
84 return mmx_supported_local;
85}
86
87/* Combines the row recently read in with the previous row.
88 This routine takes care of alpha and transparency if requested.
89 This routine also handles the two methods of progressive display
90 of interlaced images, depending on the mask value.
91 The mask value describes which pixels are to be combined with
92 the row. The pattern always repeats every 8 pixels, so just 8
93 bits are needed. A one indicates the pixel is to be combined; a
94 zero indicates the pixel is to be skipped. This is in addition
95 to any alpha or transparency value associated with the pixel. If
96 you want all pixels to be combined, pass 0xff (255) in mask. */
97
98/* Use this routine for x86 platform - uses faster MMX routine if machine
99 supports MMX */
100
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -0500101void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500102png_combine_row(png_structp png_ptr, png_bytep row, int mask)
103{
Glenn Randers-Pehrson074af5e1999-11-28 23:32:18 -0600104#ifdef PNG_USE_LOCAL_ARRAYS
Glenn Randers-Pehrson5379b241999-11-27 10:22:33 -0600105 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
Glenn Randers-Pehrson074af5e1999-11-28 23:32:18 -0600106#endif
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500107#ifdef DISABLE_PNGVCRD_COMBINE
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500108 int save_mmx_supported = mmx_supported;
109#endif
110
111 png_debug(1,"in png_combine_row_asm\n");
112
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500113#ifdef DISABLE_PNGVCRD_COMBINE
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500114 if ((png_ptr->transformations & PNG_INTERLACE) && png_ptr->pass != 6)
115 mmx_supported = 0;
116 else
117#endif
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500118 if (mmx_supported == 2)
119 mmx_supported = mmxsupport();
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500120
121 if (mask == 0xff)
122 {
123 png_memcpy(row, png_ptr->row_buf + 1,
124 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
125 }
126 /* GRR: add "else if (mask == 0)" case?
127 * or does png_combine_row() not even get called in that case? */
128 else
129 {
130 switch (png_ptr->row_info.pixel_depth)
131 {
132 case 1:
133 {
134 png_bytep sp;
135 png_bytep dp;
136 int s_inc, s_start, s_end;
137 int m;
138 int shift;
139 png_uint_32 i;
140
141 sp = png_ptr->row_buf + 1;
142 dp = row;
143 m = 0x80;
144#if defined(PNG_READ_PACKSWAP_SUPPORTED)
145 if (png_ptr->transformations & PNG_PACKSWAP)
146 {
147 s_start = 0;
148 s_end = 7;
149 s_inc = 1;
150 }
151 else
152#endif
153 {
154 s_start = 7;
155 s_end = 0;
156 s_inc = -1;
157 }
158
159 shift = s_start;
160
161 for (i = 0; i < png_ptr->width; i++)
162 {
163 if (m & mask)
164 {
165 int value;
166
167 value = (*sp >> shift) & 0x1;
168 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
169 *dp |= (png_byte)(value << shift);
170 }
171
172 if (shift == s_end)
173 {
174 shift = s_start;
175 sp++;
176 dp++;
177 }
178 else
179 shift += s_inc;
180
181 if (m == 1)
182 m = 0x80;
183 else
184 m >>= 1;
185 }
186 break;
187 }
188
189 case 2:
190 {
191 png_bytep sp;
192 png_bytep dp;
193 int s_start, s_end, s_inc;
194 int m;
195 int shift;
196 png_uint_32 i;
197 int value;
198
199 sp = png_ptr->row_buf + 1;
200 dp = row;
201 m = 0x80;
202#if defined(PNG_READ_PACKSWAP_SUPPORTED)
203 if (png_ptr->transformations & PNG_PACKSWAP)
204 {
205 s_start = 0;
206 s_end = 6;
207 s_inc = 2;
208 }
209 else
210#endif
211 {
212 s_start = 6;
213 s_end = 0;
214 s_inc = -2;
215 }
216
217 shift = s_start;
218
219 for (i = 0; i < png_ptr->width; i++)
220 {
221 if (m & mask)
222 {
223 value = (*sp >> shift) & 0x3;
224 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
225 *dp |= (png_byte)(value << shift);
226 }
227
228 if (shift == s_end)
229 {
230 shift = s_start;
231 sp++;
232 dp++;
233 }
234 else
235 shift += s_inc;
236 if (m == 1)
237 m = 0x80;
238 else
239 m >>= 1;
240 }
241 break;
242 }
243
244 case 4:
245 {
246 png_bytep sp;
247 png_bytep dp;
248 int s_start, s_end, s_inc;
249 int m;
250 int shift;
251 png_uint_32 i;
252 int value;
253
254 sp = png_ptr->row_buf + 1;
255 dp = row;
256 m = 0x80;
257#if defined(PNG_READ_PACKSWAP_SUPPORTED)
258 if (png_ptr->transformations & PNG_PACKSWAP)
259 {
260 s_start = 0;
261 s_end = 4;
262 s_inc = 4;
263 }
264 else
265#endif
266 {
267 s_start = 4;
268 s_end = 0;
269 s_inc = -4;
270 }
271 shift = s_start;
272
273 for (i = 0; i < png_ptr->width; i++)
274 {
275 if (m & mask)
276 {
277 value = (*sp >> shift) & 0xf;
278 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
279 *dp |= (png_byte)(value << shift);
280 }
281
282 if (shift == s_end)
283 {
284 shift = s_start;
285 sp++;
286 dp++;
287 }
288 else
289 shift += s_inc;
290 if (m == 1)
291 m = 0x80;
292 else
293 m >>= 1;
294 }
295 break;
296 }
297
298 case 8:
299 {
300 png_bytep srcptr;
301 png_bytep dstptr;
302 png_uint_32 len;
303 int m;
304 int diff, unmask;
305
306 __int64 mask0=0x0102040810204080;
307
308 if (mmx_supported)
309 {
310 srcptr = png_ptr->row_buf + 1;
311 dstptr = row;
312 m = 0x80;
313 unmask = ~mask;
314 len = png_ptr->width &~7; //reduce to multiple of 8
315 diff = png_ptr->width & 7; //amount lost
316
317 _asm
318 {
319 movd mm7, unmask //load bit pattern
320 psubb mm6,mm6 //zero mm6
321 punpcklbw mm7,mm7
322 punpcklwd mm7,mm7
323 punpckldq mm7,mm7 //fill register with 8 masks
324
325 movq mm0,mask0
326
327 pand mm0,mm7 //nonzero if keep byte
328 pcmpeqb mm0,mm6 //zeros->1s, v versa
329
330 mov ecx,len //load length of line (pixels)
331 mov esi,srcptr //load source
332 mov ebx,dstptr //load dest
333 cmp ecx,0 //lcr
334 je mainloop8end
335
336mainloop8:
337 movq mm4,[esi]
338 pand mm4,mm0
339 movq mm6,mm0
340 pandn mm6,[ebx]
341 por mm4,mm6
342 movq [ebx],mm4
343
344 add esi,8 //inc by 8 bytes processed
345 add ebx,8
346 sub ecx,8 //dec by 8 pixels processed
347
348 ja mainloop8
349mainloop8end:
350
351 mov ecx,diff
352 cmp ecx,0
353 jz end8
354
355 mov edx,mask
356 sal edx,24 //make low byte the high byte
357
358secondloop8:
359 sal edx,1 //move high bit to CF
360 jnc skip8 //if CF = 0
361 mov al,[esi]
362 mov [ebx],al
363skip8:
364 inc esi
365 inc ebx
366
367 dec ecx
368 jnz secondloop8
369end8:
370 emms
371 }
372 }
373 else /* mmx not supported - use modified C routine */
374 {
375 register unsigned int incr1, initial_val, final_val;
376 png_size_t pixel_bytes;
377 png_uint_32 i;
378 register int disp = png_pass_inc[png_ptr->pass];
379 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
380
381 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
382 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
383 pixel_bytes;
384 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
385 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
386 final_val = png_ptr->width*pixel_bytes;
387 incr1 = (disp)*pixel_bytes;
388 for (i = initial_val; i < final_val; i += incr1)
389 {
390 png_memcpy(dstptr, srcptr, pixel_bytes);
391 srcptr += incr1;
392 dstptr += incr1;
393 }
394 } /* end of else */
395
396 break;
397 } // end 8 bpp
398
399 case 16:
400 {
401 png_bytep srcptr;
402 png_bytep dstptr;
403 png_uint_32 len;
404 int unmask, diff;
405 __int64 mask1=0x0101020204040808,
406 mask0=0x1010202040408080;
407
408 if (mmx_supported)
409 {
410 srcptr = png_ptr->row_buf + 1;
411 dstptr = row;
412
413 unmask = ~mask;
414 len = (png_ptr->width)&~7;
415 diff = (png_ptr->width)&7;
416 _asm
417 {
418 movd mm7, unmask //load bit pattern
419 psubb mm6,mm6 //zero mm6
420 punpcklbw mm7,mm7
421 punpcklwd mm7,mm7
422 punpckldq mm7,mm7 //fill register with 8 masks
423
424 movq mm0,mask0
425 movq mm1,mask1
426
427 pand mm0,mm7
428 pand mm1,mm7
429
430 pcmpeqb mm0,mm6
431 pcmpeqb mm1,mm6
432
433 mov ecx,len //load length of line
434 mov esi,srcptr //load source
435 mov ebx,dstptr //load dest
436 cmp ecx,0 //lcr
437 jz mainloop16end
438
439mainloop16:
440 movq mm4,[esi]
441 pand mm4,mm0
442 movq mm6,mm0
443 movq mm7,[ebx]
444 pandn mm6,mm7
445 por mm4,mm6
446 movq [ebx],mm4
447
448 movq mm5,[esi+8]
449 pand mm5,mm1
450 movq mm7,mm1
451 movq mm6,[ebx+8]
452 pandn mm7,mm6
453 por mm5,mm7
454 movq [ebx+8],mm5
455
456 add esi,16 //inc by 16 bytes processed
457 add ebx,16
458 sub ecx,8 //dec by 8 pixels processed
459
460 ja mainloop16
461
462mainloop16end:
463 mov ecx,diff
464 cmp ecx,0
465 jz end16
466
467 mov edx,mask
468 sal edx,24 //make low byte the high byte
469secondloop16:
470 sal edx,1 //move high bit to CF
471 jnc skip16 //if CF = 0
472 mov ax,[esi]
473 mov [ebx],ax
474skip16:
475 add esi,2
476 add ebx,2
477
478 dec ecx
479 jnz secondloop16
480end16:
481 emms
482 }
483 }
484 else /* mmx not supported - use modified C routine */
485 {
486 register unsigned int incr1, initial_val, final_val;
487 png_size_t pixel_bytes;
488 png_uint_32 i;
489 register int disp = png_pass_inc[png_ptr->pass];
490 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
491
492 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
493 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
494 pixel_bytes;
495 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
496 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
497 final_val = png_ptr->width*pixel_bytes;
498 incr1 = (disp)*pixel_bytes;
499 for (i = initial_val; i < final_val; i += incr1)
500 {
501 png_memcpy(dstptr, srcptr, pixel_bytes);
502 srcptr += incr1;
503 dstptr += incr1;
504 }
505 } /* end of else */
506
507 break;
508 } // end 16 bpp
509
510 case 24:
511 {
512 png_bytep srcptr;
513 png_bytep dstptr;
514 png_uint_32 len;
515 int unmask, diff;
516
517 __int64 mask2=0x0101010202020404, //24bpp
518 mask1=0x0408080810101020,
519 mask0=0x2020404040808080;
520
521 srcptr = png_ptr->row_buf + 1;
522 dstptr = row;
523
524 unmask = ~mask;
525 len = (png_ptr->width)&~7;
526 diff = (png_ptr->width)&7;
527
528 if (mmx_supported)
529 {
530 _asm
531 {
532 movd mm7, unmask //load bit pattern
533 psubb mm6,mm6 //zero mm6
534 punpcklbw mm7,mm7
535 punpcklwd mm7,mm7
536 punpckldq mm7,mm7 //fill register with 8 masks
537
538 movq mm0,mask0
539 movq mm1,mask1
540 movq mm2,mask2
541
542 pand mm0,mm7
543 pand mm1,mm7
544 pand mm2,mm7
545
546 pcmpeqb mm0,mm6
547 pcmpeqb mm1,mm6
548 pcmpeqb mm2,mm6
549
550 mov ecx,len //load length of line
551 mov esi,srcptr //load source
552 mov ebx,dstptr //load dest
553 cmp ecx,0
554 jz mainloop24end
555
556mainloop24:
557 movq mm4,[esi]
558 pand mm4,mm0
559 movq mm6,mm0
560 movq mm7,[ebx]
561 pandn mm6,mm7
562 por mm4,mm6
563 movq [ebx],mm4
564
565
566 movq mm5,[esi+8]
567 pand mm5,mm1
568 movq mm7,mm1
569 movq mm6,[ebx+8]
570 pandn mm7,mm6
571 por mm5,mm7
572 movq [ebx+8],mm5
573
574 movq mm6,[esi+16]
575 pand mm6,mm2
576 movq mm4,mm2
577 movq mm7,[ebx+16]
578 pandn mm4,mm7
579 por mm6,mm4
580 movq [ebx+16],mm6
581
582 add esi,24 //inc by 24 bytes processed
583 add ebx,24
584 sub ecx,8 //dec by 8 pixels processed
585
586 ja mainloop24
587
588mainloop24end:
589 mov ecx,diff
590 cmp ecx,0
591 jz end24
592
593 mov edx,mask
594 sal edx,24 //make low byte the high byte
595secondloop24:
596 sal edx,1 //move high bit to CF
597 jnc skip24 //if CF = 0
598 mov ax,[esi]
599 mov [ebx],ax
600 xor eax,eax
601 mov al,[esi+2]
602 mov [ebx+2],al
603skip24:
604 add esi,3
605 add ebx,3
606
607 dec ecx
608 jnz secondloop24
609
610end24:
611 emms
612 }
613 }
614 else /* mmx not supported - use modified C routine */
615 {
616 register unsigned int incr1, initial_val, final_val;
617 png_size_t pixel_bytes;
618 png_uint_32 i;
619 register int disp = png_pass_inc[png_ptr->pass];
620 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
621
622 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
623 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
624 pixel_bytes;
625 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
626 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
627 final_val = png_ptr->width*pixel_bytes;
628 incr1 = (disp)*pixel_bytes;
629 for (i = initial_val; i < final_val; i += incr1)
630 {
631 png_memcpy(dstptr, srcptr, pixel_bytes);
632 srcptr += incr1;
633 dstptr += incr1;
634 }
635 } /* end of else */
636
637 break;
638 } // end 24 bpp
639
640 case 32:
641 {
642 png_bytep srcptr;
643 png_bytep dstptr;
644 png_uint_32 len;
645 int unmask, diff;
646
647 __int64 mask3=0x0101010102020202, //32bpp
648 mask2=0x0404040408080808,
649 mask1=0x1010101020202020,
650 mask0=0x4040404080808080;
651
652 srcptr = png_ptr->row_buf + 1;
653 dstptr = row;
654
655 unmask = ~mask;
656 len = (png_ptr->width)&~7;
657 diff = (png_ptr->width)&7;
658
659 if (mmx_supported)
660 {
661 _asm
662 {
663 movd mm7, unmask //load bit pattern
664 psubb mm6,mm6 //zero mm6
665 punpcklbw mm7,mm7
666 punpcklwd mm7,mm7
667 punpckldq mm7,mm7 //fill register with 8 masks
668
669 movq mm0,mask0
670 movq mm1,mask1
671 movq mm2,mask2
672 movq mm3,mask3
673
674 pand mm0,mm7
675 pand mm1,mm7
676 pand mm2,mm7
677 pand mm3,mm7
678
679 pcmpeqb mm0,mm6
680 pcmpeqb mm1,mm6
681 pcmpeqb mm2,mm6
682 pcmpeqb mm3,mm6
683
684 mov ecx,len //load length of line
685 mov esi,srcptr //load source
686 mov ebx,dstptr //load dest
687
688 cmp ecx,0 //lcr
689 jz mainloop32end
690
691mainloop32:
692 movq mm4,[esi]
693 pand mm4,mm0
694 movq mm6,mm0
695 movq mm7,[ebx]
696 pandn mm6,mm7
697 por mm4,mm6
698 movq [ebx],mm4
699
700 movq mm5,[esi+8]
701 pand mm5,mm1
702 movq mm7,mm1
703 movq mm6,[ebx+8]
704 pandn mm7,mm6
705 por mm5,mm7
706 movq [ebx+8],mm5
707
708 movq mm6,[esi+16]
709 pand mm6,mm2
710 movq mm4,mm2
711 movq mm7,[ebx+16]
712 pandn mm4,mm7
713 por mm6,mm4
714 movq [ebx+16],mm6
715
716 movq mm7,[esi+24]
717 pand mm7,mm3
718 movq mm5,mm3
719 movq mm4,[ebx+24]
720 pandn mm5,mm4
721 por mm7,mm5
722 movq [ebx+24],mm7
723
724 add esi,32 //inc by 32 bytes processed
725 add ebx,32
726 sub ecx,8 //dec by 8 pixels processed
727
728 ja mainloop32
729
730mainloop32end:
731 mov ecx,diff
732 cmp ecx,0
733 jz end32
734
735 mov edx,mask
736 sal edx,24 //make low byte the high byte
737secondloop32:
738 sal edx,1 //move high bit to CF
739 jnc skip32 //if CF = 0
740 mov eax,[esi]
741 mov [ebx],eax
742skip32:
743 add esi,4
744 add ebx,4
745
746 dec ecx
747 jnz secondloop32
748
749end32:
750 emms
751 }
752 }
753 else /* mmx _not supported - Use modified C routine */
754 {
755 register unsigned int incr1, initial_val, final_val;
756 png_size_t pixel_bytes;
757 png_uint_32 i;
758 register int disp = png_pass_inc[png_ptr->pass];
759 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
760
761 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
762 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
763 pixel_bytes;
764 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
765 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
766 final_val = png_ptr->width*pixel_bytes;
767 incr1 = (disp)*pixel_bytes;
768 for (i = initial_val; i < final_val; i += incr1)
769 {
770 png_memcpy(dstptr, srcptr, pixel_bytes);
771 srcptr += incr1;
772 dstptr += incr1;
773 }
774 } /* end of else */
775
776 break;
777 } // end 32 bpp
778
779 case 48:
780 {
781 png_bytep srcptr;
782 png_bytep dstptr;
783 png_uint_32 len;
784 int unmask, diff;
785
786 __int64 mask5=0x0101010101010202,
787 mask4=0x0202020204040404,
788 mask3=0x0404080808080808,
789 mask2=0x1010101010102020,
790 mask1=0x2020202040404040,
791 mask0=0x4040808080808080;
792
793 if (mmx_supported)
794 {
795 srcptr = png_ptr->row_buf + 1;
796 dstptr = row;
797
798 unmask = ~mask;
799 len = (png_ptr->width)&~7;
800 diff = (png_ptr->width)&7;
801 _asm
802 {
803 movd mm7, unmask //load bit pattern
804 psubb mm6,mm6 //zero mm6
805 punpcklbw mm7,mm7
806 punpcklwd mm7,mm7
807 punpckldq mm7,mm7 //fill register with 8 masks
808
809 movq mm0,mask0
810 movq mm1,mask1
811 movq mm2,mask2
812 movq mm3,mask3
813 movq mm4,mask4
814 movq mm5,mask5
815
816 pand mm0,mm7
817 pand mm1,mm7
818 pand mm2,mm7
819 pand mm3,mm7
820 pand mm4,mm7
821 pand mm5,mm7
822
823 pcmpeqb mm0,mm6
824 pcmpeqb mm1,mm6
825 pcmpeqb mm2,mm6
826 pcmpeqb mm3,mm6
827 pcmpeqb mm4,mm6
828 pcmpeqb mm5,mm6
829
830 mov ecx,len //load length of line
831 mov esi,srcptr //load source
832 mov ebx,dstptr //load dest
833
834 cmp ecx,0
835 jz mainloop48end
836
837mainloop48:
838 movq mm7,[esi]
839 pand mm7,mm0
840 movq mm6,mm0
841 pandn mm6,[ebx]
842 por mm7,mm6
843 movq [ebx],mm7
844
845 movq mm6,[esi+8]
846 pand mm6,mm1
847 movq mm7,mm1
848 pandn mm7,[ebx+8]
849 por mm6,mm7
850 movq [ebx+8],mm6
851
852 movq mm6,[esi+16]
853 pand mm6,mm2
854 movq mm7,mm2
855 pandn mm7,[ebx+16]
856 por mm6,mm7
857 movq [ebx+16],mm6
858
859 movq mm7,[esi+24]
860 pand mm7,mm3
861 movq mm6,mm3
862 pandn mm6,[ebx+24]
863 por mm7,mm6
864 movq [ebx+24],mm7
865
866 movq mm6,[esi+32]
867 pand mm6,mm4
868 movq mm7,mm4
869 pandn mm7,[ebx+32]
870 por mm6,mm7
871 movq [ebx+32],mm6
872
873 movq mm7,[esi+40]
874 pand mm7,mm5
875 movq mm6,mm5
876 pandn mm6,[ebx+40]
877 por mm7,mm6
878 movq [ebx+40],mm7
879
880 add esi,48 //inc by 32 bytes processed
881 add ebx,48
882 sub ecx,8 //dec by 8 pixels processed
883
884 ja mainloop48
885mainloop48end:
886
887 mov ecx,diff
888 cmp ecx,0
889 jz end48
890
891 mov edx,mask
892 sal edx,24 //make low byte the high byte
893
894secondloop48:
895 sal edx,1 //move high bit to CF
896 jnc skip48 //if CF = 0
897 mov eax,[esi]
898 mov [ebx],eax
899skip48:
900 add esi,4
901 add ebx,4
902
903 dec ecx
904 jnz secondloop48
905
906end48:
907 emms
908 }
909 }
910 else /* mmx _not supported - Use modified C routine */
911 {
912 register unsigned int incr1, initial_val, final_val;
913 png_size_t pixel_bytes;
914 png_uint_32 i;
915 register int disp = png_pass_inc[png_ptr->pass];
916 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
917
918 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
919 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
920 pixel_bytes;
921 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
922 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
923 final_val = png_ptr->width*pixel_bytes;
924 incr1 = (disp)*pixel_bytes;
925 for (i = initial_val; i < final_val; i += incr1)
926 {
927 png_memcpy(dstptr, srcptr, pixel_bytes);
928 srcptr += incr1;
929 dstptr += incr1;
930 }
931 } /* end of else */
932
933 break;
934 } // end 48 bpp
935
936 default:
937 {
938 png_bytep sptr;
939 png_bytep dp;
940 png_size_t pixel_bytes;
941 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
942 unsigned int i;
943 register int disp = png_pass_inc[png_ptr->pass]; // get the offset
944 register unsigned int incr1, initial_val, final_val;
945
946 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
947 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
948 pixel_bytes;
949 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
950 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
951 final_val = png_ptr->width*pixel_bytes;
952 incr1 = (disp)*pixel_bytes;
953 for (i = initial_val; i < final_val; i += incr1)
954 {
955 png_memcpy(dp, sptr, pixel_bytes);
956 sptr += incr1;
957 dp += incr1;
958 }
959 break;
960 }
961 } /* end switch (png_ptr->row_info.pixel_depth) */
962 } /* end if (non-trivial mask) */
963
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500964#ifdef DISABLE_PNGVCRD_COMBINE
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500965 mmx_supported = save_mmx_supported;
966#endif
967
968} /* end png_combine_row() */
969
970
971#if defined(PNG_READ_INTERLACING_SUPPORTED)
972
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -0500973void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500974png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass,
975 png_uint_32 transformations)
976{
Glenn Randers-Pehrson074af5e1999-11-28 23:32:18 -0600977#ifdef PNG_USE_LOCAL_ARRAYS
Glenn Randers-Pehrson5379b241999-11-27 10:22:33 -0600978 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
Glenn Randers-Pehrson074af5e1999-11-28 23:32:18 -0600979#endif
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500980#ifdef DISABLE_PNGVCRD_INTERLACE
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500981 int save_mmx_supported = mmx_supported;
982#endif
983
984 png_debug(1,"in png_do_read_interlace\n");
985
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500986#ifdef DISABLE_PNGVCRD_INTERLACE
987 /* In libpng versions 1.0.3a through 1.0.4d,
988 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
989 * in bad pixels at the beginning of some rows of some images, and also
990 * (due to out-of-range memory reads and writes) caused heap corruption
991 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e,
992 * and the code appears to work completely correctly, so it is enabled
993 * by default.
994 */
995 if (1) /* all passes caused a heap problem in the old code */
996 mmx_supported = 0;
997 else
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -0500998#endif
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -0500999 if (mmx_supported == 2)
1000 mmx_supported = mmxsupport();
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001001
1002 if (row != NULL && row_info != NULL)
1003 {
1004 png_uint_32 final_width;
1005
1006 final_width = row_info->width * png_pass_inc[pass];
1007
1008 switch (row_info->pixel_depth)
1009 {
1010 case 1:
1011 {
1012 png_bytep sp, dp;
1013 int sshift, dshift;
1014 int s_start, s_end, s_inc;
1015 png_byte v;
1016 png_uint_32 i;
1017 int j;
1018
1019 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1020 dp = row + (png_size_t)((final_width - 1) >> 3);
1021#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1022 if (transformations & PNG_PACKSWAP)
1023 {
1024 sshift = (int)((row_info->width + 7) & 7);
1025 dshift = (int)((final_width + 7) & 7);
1026 s_start = 7;
1027 s_end = 0;
1028 s_inc = -1;
1029 }
1030 else
1031#endif
1032 {
1033 sshift = 7 - (int)((row_info->width + 7) & 7);
1034 dshift = 7 - (int)((final_width + 7) & 7);
1035 s_start = 0;
1036 s_end = 7;
1037 s_inc = 1;
1038 }
1039
1040 for (i = row_info->width; i; i--)
1041 {
1042 v = (png_byte)((*sp >> sshift) & 0x1);
1043 for (j = 0; j < png_pass_inc[pass]; j++)
1044 {
1045 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1046 *dp |= (png_byte)(v << dshift);
1047 if (dshift == s_end)
1048 {
1049 dshift = s_start;
1050 dp--;
1051 }
1052 else
1053 dshift += s_inc;
1054 }
1055 if (sshift == s_end)
1056 {
1057 sshift = s_start;
1058 sp--;
1059 }
1060 else
1061 sshift += s_inc;
1062 }
1063 break;
1064 }
1065
1066 case 2:
1067 {
1068 png_bytep sp, dp;
1069 int sshift, dshift;
1070 int s_start, s_end, s_inc;
1071 png_uint_32 i;
1072
1073 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1074 dp = row + (png_size_t)((final_width - 1) >> 2);
1075#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1076 if (transformations & PNG_PACKSWAP)
1077 {
1078 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1079 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1080 s_start = 6;
1081 s_end = 0;
1082 s_inc = -2;
1083 }
1084 else
1085#endif
1086 {
1087 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1088 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1089 s_start = 0;
1090 s_end = 6;
1091 s_inc = 2;
1092 }
1093
1094 for (i = row_info->width; i; i--)
1095 {
1096 png_byte v;
1097 int j;
1098
1099 v = (png_byte)((*sp >> sshift) & 0x3);
1100 for (j = 0; j < png_pass_inc[pass]; j++)
1101 {
1102 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1103 *dp |= (png_byte)(v << dshift);
1104 if (dshift == s_end)
1105 {
1106 dshift = s_start;
1107 dp--;
1108 }
1109 else
1110 dshift += s_inc;
1111 }
1112 if (sshift == s_end)
1113 {
1114 sshift = s_start;
1115 sp--;
1116 }
1117 else
1118 sshift += s_inc;
1119 }
1120 break;
1121 }
1122
1123 case 4:
1124 {
1125 png_bytep sp, dp;
1126 int sshift, dshift;
1127 int s_start, s_end, s_inc;
1128 png_uint_32 i;
1129
1130 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1131 dp = row + (png_size_t)((final_width - 1) >> 1);
1132#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1133 if (transformations & PNG_PACKSWAP)
1134 {
1135 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1136 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1137 s_start = 4;
1138 s_end = 0;
1139 s_inc = -4;
1140 }
1141 else
1142#endif
1143 {
1144 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1145 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1146 s_start = 0;
1147 s_end = 4;
1148 s_inc = 4;
1149 }
1150
1151 for (i = row_info->width; i; i--)
1152 {
1153 png_byte v;
1154 int j;
1155
1156 v = (png_byte)((*sp >> sshift) & 0xf);
1157 for (j = 0; j < png_pass_inc[pass]; j++)
1158 {
1159 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1160 *dp |= (png_byte)(v << dshift);
1161 if (dshift == s_end)
1162 {
1163 dshift = s_start;
1164 dp--;
1165 }
1166 else
1167 dshift += s_inc;
1168 }
1169 if (sshift == s_end)
1170 {
1171 sshift = s_start;
1172 sp--;
1173 }
1174 else
1175 sshift += s_inc;
1176 }
1177 break;
1178 }
1179
1180 default: // This is the place where the routine is modified
1181 {
1182 __int64 const4 = 0x0000000000FFFFFF;
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001183 // __int64 const5 = 0x000000FFFFFF0000; // unused...
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001184 __int64 const6 = 0x00000000000000FF;
1185 png_bytep sptr, dp;
1186 png_uint_32 i;
1187 png_size_t pixel_bytes;
1188 int width = row_info->width;
1189
1190 pixel_bytes = (row_info->pixel_depth >> 3);
1191
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001192 sptr = row + (width - 1) * pixel_bytes;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001193 dp = row + (final_width - 1) * pixel_bytes;
1194 // New code by Nirav Chhatrapati - Intel Corporation
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001195 // sign fix by GRR
1196 // NOTE: there is NO MMX code for 48-bit and 64-bit images
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001197
1198 if (mmx_supported) // use MMX routine if machine supports it
1199 {
1200 if (pixel_bytes == 3)
1201 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001202 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001203 {
1204 _asm
1205 {
1206 mov esi, sptr
1207 mov edi, dp
1208 mov ecx, width
1209 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1210loop_pass0:
1211 movd mm0, [esi] ; X X X X X v2 v1 v0
1212 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1213 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1214 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1215 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1216 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1217 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1218 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1219 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1220 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1221 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1222 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1223 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1224 movq [edi+16] , mm4
1225 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1226 movq [edi+8] , mm3
1227 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1228 sub esi, 3
1229 movq [edi], mm0
1230 sub edi, 24
1231 //sub esi, 3
1232 dec ecx
1233 jnz loop_pass0
1234 EMMS
1235 }
1236 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001237 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001238 {
1239 _asm
1240 {
1241 mov esi, sptr
1242 mov edi, dp
1243 mov ecx, width
1244 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1245loop_pass2:
1246 movd mm0, [esi] ; X X X X X v2 v1 v0
1247 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1248 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1249 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1250 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1251 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1252 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1253 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1254 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1255 movq [edi+4], mm0 ; move to memory
1256 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1257 movd [edi], mm0 ; move to memory
1258 sub esi, 3
1259 sub edi, 12
1260 dec ecx
1261 jnz loop_pass2
1262 EMMS
1263 }
1264 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001265 else if (width) /* && ((pass == 4) || (pass == 5)) */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001266 {
1267 int width_mmx = ((width >> 1) << 1) - 8;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001268 if (width_mmx < 0)
1269 width_mmx = 0;
1270 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001271 if (width_mmx)
1272 {
1273 _asm
1274 {
1275 mov esi, sptr
1276 mov edi, dp
1277 mov ecx, width_mmx
1278 sub esi, 3
1279 sub edi, 9
1280loop_pass4:
1281 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1282 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1283 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1284 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1285 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1286 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1287 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1288 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1289 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1290 movq [edi], mm0 ; move quad to memory
1291 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1292 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1293 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1294 movd [edi+8], mm6 ; move double to memory
1295 sub esi, 6
1296 sub edi, 12
1297 sub ecx, 2
1298 jnz loop_pass4
1299 EMMS
1300 }
1301 }
1302
1303 sptr -= width_mmx*3;
1304 dp -= width_mmx*6;
1305 for (i = width; i; i--)
1306 {
1307 png_byte v[8];
1308 int j;
1309
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001310 png_memcpy(v, sptr, 3);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001311 for (j = 0; j < png_pass_inc[pass]; j++)
1312 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001313 png_memcpy(dp, v, 3);
1314 dp -= 3;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001315 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001316 sptr -= 3;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001317 }
1318 }
1319 } /* end of pixel_bytes == 3 */
1320
1321 else if (pixel_bytes == 1)
1322 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001323 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001324 {
1325 int width_mmx = ((width >> 2) << 2);
1326 width -= width_mmx;
1327 if (width_mmx)
1328 {
1329 _asm
1330 {
1331 mov esi, sptr
1332 mov edi, dp
1333 mov ecx, width_mmx
1334 sub edi, 31
1335 sub esi, 3
1336loop1_pass0:
1337 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1338 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1339 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1340 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1341 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1342 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1343 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1344 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1345 movq [edi], mm0 ; move to memory v3
1346 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1347 movq [edi+8], mm3 ; move to memory v2
1348 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1349 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1350 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1351 movq [edi+16], mm2 ; move to memory v1
1352 movq [edi+24], mm4 ; move to memory v0
1353 sub esi, 4
1354 sub edi, 32
1355 sub ecx, 4
1356 jnz loop1_pass0
1357 EMMS
1358 }
1359 }
1360
1361 sptr -= width_mmx;
1362 dp -= width_mmx*8;
1363 for (i = width; i; i--)
1364 {
1365 int j;
1366
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001367 /* I simplified this part in version 1.0.4e
1368 * here and in several other instances where
1369 * pixel_bytes == 1 -- GR-P
1370 *
1371 * Original code:
1372 *
1373 * png_byte v[8];
1374 * png_memcpy(v, sptr, pixel_bytes);
1375 * for (j = 0; j < png_pass_inc[pass]; j++)
1376 * {
1377 * png_memcpy(dp, v, pixel_bytes);
1378 * dp -= pixel_bytes;
1379 * }
1380 * sptr -= pixel_bytes;
1381 *
1382 * Replacement code is in the next three lines:
1383 */
1384
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001385 for (j = 0; j < png_pass_inc[pass]; j++)
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001386 *dp-- = *sptr;
1387 sptr--;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001388 }
1389 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001390 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001391 {
1392 int width_mmx = ((width >> 2) << 2);
1393 width -= width_mmx;
1394 if (width_mmx)
1395 {
1396 _asm
1397 {
1398 mov esi, sptr
1399 mov edi, dp
1400 mov ecx, width_mmx
1401 sub edi, 15
1402 sub esi, 3
1403loop1_pass2:
1404 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1405 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1406 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1407 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1408 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1409 movq [edi], mm0 ; move to memory v2 and v3
1410 sub esi, 4
1411 movq [edi+8], mm1 ; move to memory v1 and v0
1412 sub edi, 16
1413 sub ecx, 4
1414 jnz loop1_pass2
1415 EMMS
1416 }
1417 }
1418
1419 sptr -= width_mmx;
1420 dp -= width_mmx*4;
1421 for (i = width; i; i--)
1422 {
1423 int j;
1424
1425 for (j = 0; j < png_pass_inc[pass]; j++)
1426 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001427 *dp-- = *sptr;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001428 }
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001429 sptr --;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001430 }
1431 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001432 else if (width) /* && ((pass == 4) || (pass == 5))) */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001433 {
1434 int width_mmx = ((width >> 3) << 3);
1435 width -= width_mmx;
1436 if (width_mmx)
1437 {
1438 _asm
1439 {
1440 mov esi, sptr
1441 mov edi, dp
1442 mov ecx, width_mmx
1443 sub edi, 15
1444 sub esi, 7
1445loop1_pass4:
1446 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1447 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1448 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1449 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1450 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1451 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1452 sub esi, 8
1453 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1454 //sub esi, 4
1455 sub edi, 16
1456 sub ecx, 8
1457 jnz loop1_pass4
1458 EMMS
1459 }
1460 }
1461
1462 sptr -= width_mmx;
1463 dp -= width_mmx*2;
1464 for (i = width; i; i--)
1465 {
1466 int j;
1467
1468 for (j = 0; j < png_pass_inc[pass]; j++)
1469 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001470 *dp-- = *sptr;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001471 }
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001472 sptr --;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001473 }
1474 }
1475 } /* end of pixel_bytes == 1 */
1476
1477 else if (pixel_bytes == 2)
1478 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001479 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001480 {
1481 int width_mmx = ((width >> 1) << 1);
1482 width -= width_mmx;
1483 if (width_mmx)
1484 {
1485 _asm
1486 {
1487 mov esi, sptr
1488 mov edi, dp
1489 mov ecx, width_mmx
1490 sub esi, 2
1491 sub edi, 30
1492loop2_pass0:
1493 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1494 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1495 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1496 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1497 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1498 movq [edi], mm0
1499 movq [edi + 8], mm0
1500 movq [edi + 16], mm1
1501 movq [edi + 24], mm1
1502 sub esi, 4
1503 sub edi, 32
1504 sub ecx, 2
1505 jnz loop2_pass0
1506 EMMS
1507 }
1508 }
1509
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001510 sptr -= (width_mmx*2 - 2); // sign fixed
1511 dp -= (width_mmx*16 - 2); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001512 for (i = width; i; i--)
1513 {
1514 png_byte v[8];
1515 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001516 sptr -= 2;
1517 png_memcpy(v, sptr, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001518 for (j = 0; j < png_pass_inc[pass]; j++)
1519 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001520 dp -= 2;
1521 png_memcpy(dp, v, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001522 }
1523 }
1524 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001525 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001526 {
1527 int width_mmx = ((width >> 1) << 1) ;
1528 width -= width_mmx;
1529 if (width_mmx)
1530 {
1531 _asm
1532 {
1533 mov esi, sptr
1534 mov edi, dp
1535 mov ecx, width_mmx
1536 sub esi, 2
1537 sub edi, 14
1538loop2_pass2:
1539 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1540 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1541 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1542 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1543 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1544 movq [edi], mm0
1545 sub esi, 4
1546 movq [edi + 8], mm1
1547 //sub esi, 4
1548 sub edi, 16
1549 sub ecx, 2
1550 jnz loop2_pass2
1551 EMMS
1552 }
1553 }
1554
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001555 sptr -= (width_mmx*2 - 2); // sign fixed
1556 dp -= (width_mmx*8 - 2); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001557 for (i = width; i; i--)
1558 {
1559 png_byte v[8];
1560 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001561 sptr -= 2;
1562 png_memcpy(v, sptr, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001563 for (j = 0; j < png_pass_inc[pass]; j++)
1564 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001565 dp -= 2;
1566 png_memcpy(dp, v, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001567 }
1568 }
1569 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001570 else if (width) // pass == 4 or 5
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001571 {
1572 int width_mmx = ((width >> 1) << 1) ;
1573 width -= width_mmx;
1574 if (width_mmx)
1575 {
1576 _asm
1577 {
1578 mov esi, sptr
1579 mov edi, dp
1580 mov ecx, width_mmx
1581 sub esi, 2
1582 sub edi, 6
1583loop2_pass4:
1584 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1585 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1586 sub esi, 4
1587 movq [edi], mm0
1588 sub edi, 8
1589 sub ecx, 2
1590 jnz loop2_pass4
1591 EMMS
1592 }
1593 }
1594
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001595 sptr -= (width_mmx*2 - 2); // sign fixed
1596 dp -= (width_mmx*4 - 2); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001597 for (i = width; i; i--)
1598 {
1599 png_byte v[8];
1600 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001601 sptr -= 2;
1602 png_memcpy(v, sptr, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001603 for (j = 0; j < png_pass_inc[pass]; j++)
1604 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001605 dp -= 2;
1606 png_memcpy(dp, v, 2);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001607 }
1608 }
1609 }
1610 } /* end of pixel_bytes == 2 */
1611
1612 else if (pixel_bytes == 4)
1613 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001614 if (((pass == 0) || (pass == 1)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001615 {
1616 int width_mmx = ((width >> 1) << 1) ;
1617 width -= width_mmx;
1618 if (width_mmx)
1619 {
1620 _asm
1621 {
1622 mov esi, sptr
1623 mov edi, dp
1624 mov ecx, width_mmx
1625 sub esi, 4
1626 sub edi, 60
1627loop4_pass0:
1628 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1629 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1630 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1631 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1632 movq [edi], mm0
1633 movq [edi + 8], mm0
1634 movq [edi + 16], mm0
1635 movq [edi + 24], mm0
1636 movq [edi+32], mm1
1637 movq [edi + 40], mm1
1638 movq [edi+ 48], mm1
1639 sub esi, 8
1640 movq [edi + 56], mm1
1641 sub edi, 64
1642 sub ecx, 2
1643 jnz loop4_pass0
1644 EMMS
1645 }
1646 }
1647
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001648 sptr -= (width_mmx*4 - 4); // sign fixed
1649 dp -= (width_mmx*32 - 4); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001650 for (i = width; i; i--)
1651 {
1652 png_byte v[8];
1653 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001654 sptr -= 4;
1655 png_memcpy(v, sptr, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001656 for (j = 0; j < png_pass_inc[pass]; j++)
1657 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001658 dp -= 4;
1659 png_memcpy(dp, v, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001660 }
1661 }
1662 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001663 else if (((pass == 2) || (pass == 3)) && width)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001664 {
1665 int width_mmx = ((width >> 1) << 1) ;
1666 width -= width_mmx;
1667 if (width_mmx)
1668 {
1669 _asm
1670 {
1671 mov esi, sptr
1672 mov edi, dp
1673 mov ecx, width_mmx
1674 sub esi, 4
1675 sub edi, 28
1676loop4_pass2:
1677 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1678 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1679 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1680 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1681 movq [edi], mm0
1682 movq [edi + 8], mm0
1683 movq [edi+16], mm1
1684 movq [edi + 24], mm1
1685 sub esi, 8
1686 sub edi, 32
1687 sub ecx, 2
1688 jnz loop4_pass2
1689 EMMS
1690 }
1691 }
1692
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001693 sptr -= (width_mmx*4 - 4); // sign fixed
1694 dp -= (width_mmx*16 - 4); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001695 for (i = width; i; i--)
1696 {
1697 png_byte v[8];
1698 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001699 sptr -= 4;
1700 png_memcpy(v, sptr, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001701 for (j = 0; j < png_pass_inc[pass]; j++)
1702 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001703 dp -= 4;
1704 png_memcpy(dp, v, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001705 }
1706 }
1707 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001708 else if (width) // pass == 4 or 5
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001709 {
1710 int width_mmx = ((width >> 1) << 1) ;
1711 width -= width_mmx;
1712 if (width_mmx)
1713 {
1714 _asm
1715 {
1716 mov esi, sptr
1717 mov edi, dp
1718 mov ecx, width_mmx
1719 sub esi, 4
1720 sub edi, 12
1721loop4_pass4:
1722 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1723 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1724 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1725 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1726 movq [edi], mm0
1727 sub esi, 8
1728 movq [edi + 8], mm1
1729 sub edi, 16
1730 sub ecx, 2
1731 jnz loop4_pass4
1732 EMMS
1733 }
1734 }
1735
Glenn Randers-Pehrson166c5a31999-12-10 09:43:02 -06001736 sptr -= (width_mmx*4 - 4); // sign fixed
1737 dp -= (width_mmx*8 - 4); // sign fixed
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001738 for (i = width; i; i--)
1739 {
1740 png_byte v[8];
1741 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001742 sptr -= 4;
1743 png_memcpy(v, sptr, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001744 for (j = 0; j < png_pass_inc[pass]; j++)
1745 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001746 dp -= 4;
1747 png_memcpy(dp, v, 4);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001748 }
1749 }
1750 }
1751
1752 } /* end of pixel_bytes == 4 */
1753
1754 else if (pixel_bytes == 6)
1755 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001756 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001757 {
1758 png_byte v[8];
1759 int j;
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001760 png_memcpy(v, sptr, 6);
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001761 for (j = 0; j < png_pass_inc[pass]; j++)
1762 {
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001763 png_memcpy(dp, v, 6);
1764 dp -= 6;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001765 }
Glenn Randers-Pehrson6d8f3b01999-10-23 08:39:18 -05001766 sptr -= 6;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001767 }
1768 } /* end of pixel_bytes == 6 */
1769
1770 else
1771 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001772 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001773 {
1774 png_byte v[8];
1775 int j;
1776 png_memcpy(v, sptr, pixel_bytes);
1777 for (j = 0; j < png_pass_inc[pass]; j++)
1778 {
1779 png_memcpy(dp, v, pixel_bytes);
1780 dp -= pixel_bytes;
1781 }
1782 sptr-= pixel_bytes;
1783 }
1784 }
1785 } /* end of mmx_supported */
1786
1787 else /* MMX not supported: use modified C code - takes advantage
1788 * of inlining of memcpy for a constant */
1789 {
1790 if (pixel_bytes == 1)
1791 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001792 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001793 {
1794 int j;
1795 for (j = 0; j < png_pass_inc[pass]; j++)
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001796 *dp-- = *sptr;
1797 sptr--;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001798 }
1799 }
1800 else if (pixel_bytes == 3)
1801 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001802 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001803 {
1804 png_byte v[8];
1805 int j;
1806 png_memcpy(v, sptr, pixel_bytes);
1807 for (j = 0; j < png_pass_inc[pass]; j++)
1808 {
1809 png_memcpy(dp, v, pixel_bytes);
1810 dp -= pixel_bytes;
1811 }
1812 sptr -= pixel_bytes;
1813 }
1814 }
1815 else if (pixel_bytes == 2)
1816 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001817 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001818 {
1819 png_byte v[8];
1820 int j;
1821 png_memcpy(v, sptr, pixel_bytes);
1822 for (j = 0; j < png_pass_inc[pass]; j++)
1823 {
1824 png_memcpy(dp, v, pixel_bytes);
1825 dp -= pixel_bytes;
1826 }
1827 sptr -= pixel_bytes;
1828 }
1829 }
1830 else if (pixel_bytes == 4)
1831 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001832 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001833 {
1834 png_byte v[8];
1835 int j;
1836 png_memcpy(v, sptr, pixel_bytes);
1837 for (j = 0; j < png_pass_inc[pass]; j++)
1838 {
1839 png_memcpy(dp, v, pixel_bytes);
1840 dp -= pixel_bytes;
1841 }
1842 sptr -= pixel_bytes;
1843 }
1844 }
1845 else if (pixel_bytes == 6)
1846 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001847 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001848 {
1849 png_byte v[8];
1850 int j;
1851 png_memcpy(v, sptr, pixel_bytes);
1852 for (j = 0; j < png_pass_inc[pass]; j++)
1853 {
1854 png_memcpy(dp, v, pixel_bytes);
1855 dp -= pixel_bytes;
1856 }
1857 sptr -= pixel_bytes;
1858 }
1859 }
1860 else
1861 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001862 for (i = width; i; i--)
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001863 {
1864 png_byte v[8];
1865 int j;
1866 png_memcpy(v, sptr, pixel_bytes);
1867 for (j = 0; j < png_pass_inc[pass]; j++)
1868 {
1869 png_memcpy(dp, v, pixel_bytes);
1870 dp -= pixel_bytes;
1871 }
1872 sptr -= pixel_bytes;
1873 }
1874 }
1875
1876 } /* end of MMX not supported */
1877 break;
1878 }
1879 } /* end switch (row_info->pixel_depth) */
1880
1881 row_info->width = final_width;
1882 row_info->rowbytes = ((final_width *
1883 (png_uint_32)row_info->pixel_depth + 7) >> 3);
1884 }
1885
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05001886#ifdef DISABLE_PNGVCRD_INTERLACE
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001887 mmx_supported = save_mmx_supported;
1888#endif
1889}
1890
1891#endif /* PNG_READ_INTERLACING_SUPPORTED */
1892
1893
1894// These variables are utilized in the functions below. They are declared
1895// globally here to ensure alignment on 8-byte boundaries.
1896
1897union uAll {
1898 __int64 use;
1899 double align;
1900} LBCarryMask = {0x0101010101010101},
1901 HBClearMask = {0x7f7f7f7f7f7f7f7f},
1902 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1903
1904
1905// Optimized code for PNG Average filter decoder
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -05001906void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05001907png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1908 , png_bytep prev_row)
1909{
1910 int bpp;
1911 png_uint_32 FullLength;
1912 png_uint_32 MMXLength;
1913 //png_uint_32 len;
1914 int diff;
1915
1916 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1917 FullLength = row_info->rowbytes; // # of bytes to filter
1918 _asm {
1919 // Init address pointers and offset
1920 mov edi, row // edi ==> Avg(x)
1921 xor ebx, ebx // ebx ==> x
1922 mov edx, edi
1923 mov esi, prev_row // esi ==> Prior(x)
1924 sub edx, bpp // edx ==> Raw(x-bpp)
1925
1926 xor eax, eax
1927 // Compute the Raw value for the first bpp bytes
1928 // Raw(x) = Avg(x) + (Prior(x)/2)
1929davgrlp:
1930 mov al, [esi + ebx] // Load al with Prior(x)
1931 inc ebx
1932 shr al, 1 // divide by 2
1933 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1934 cmp ebx, bpp
1935 mov [edi+ebx-1], al // Write back Raw(x);
1936 // mov does not affect flags; -1 to offset inc ebx
1937 jb davgrlp
1938 // get # of bytes to alignment
1939 mov diff, edi // take start of row
1940 add diff, ebx // add bpp
1941 add diff, 0xf // add 7 + 8 to incr past alignment boundary
1942 and diff, 0xfffffff8 // mask to alignment boundary
1943 sub diff, edi // subtract from start ==> value ebx at alignment
1944 jz davggo
1945 // fix alignment
         // Compute the Raw value for the bytes up to the alignment boundary
1947 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1948 xor ecx, ecx
1949davglp1:
1950 xor eax, eax
1951 mov cl, [esi + ebx] // load cl with Prior(x)
1952 mov al, [edx + ebx] // load al with Raw(x-bpp)
1953 add ax, cx
1954 inc ebx
1955 shr ax, 1 // divide by 2
1956 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1957 cmp ebx, diff // Check if at alignment boundary
1958 mov [edi+ebx-1], al // Write back Raw(x);
1959 // mov does not affect flags; -1 to offset inc ebx
1960 jb davglp1 // Repeat until at alignment boundary
1961davggo:
1962 mov eax, FullLength
1963 mov ecx, eax
1964 sub eax, ebx // subtract alignment fix
1965 and eax, 0x00000007 // calc bytes over mult of 8
1966 sub ecx, eax // drop over bytes from original length
1967 mov MMXLength, ecx
1968 } // end _asm block
1969 // Now do the math for the rest of the row
1970 switch ( bpp )
1971 {
1972 case 3:
1973 {
1974 ActiveMask.use = 0x0000000000ffffff;
1975 ShiftBpp.use = 24; // == 3 * 8
1976 ShiftRem.use = 40; // == 64 - 24
1977 _asm {
1978 // Re-init address pointers and offset
1979 movq mm7, ActiveMask
1980 mov ebx, diff // ebx ==> x = offset to alignment boundary
1981 movq mm5, LBCarryMask
1982 mov edi, row // edi ==> Avg(x)
1983 movq mm4, HBClearMask
1984 mov esi, prev_row // esi ==> Prior(x)
1985 // PRIME the pump (load the first Raw(x-bpp) data set
1986 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
1987 // (we correct position in loop below)
1988davg3lp:
1989 movq mm0, [edi + ebx] // Load mm0 with Avg(x)
1990 // Add (Prev_row/2) to Average
1991 movq mm3, mm5
1992 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
1993 movq mm1, [esi + ebx] // Load mm1 with Prior(x)
1994 movq mm6, mm7
1995 pand mm3, mm1 // get lsb for each prev_row byte
1996 psrlq mm1, 1 // divide prev_row bytes by 2
1997 pand mm1, mm4 // clear invalid bit 7 of each byte
1998 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
1999 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2000 movq mm1, mm3 // now use mm1 for getting LBCarrys
2001 pand mm1, mm2 // get LBCarrys for each byte where both
2002 // lsb's were == 1 (Only valid for active group)
2003 psrlq mm2, 1 // divide raw bytes by 2
2004 pand mm2, mm4 // clear invalid bit 7 of each byte
2005 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2006 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2007 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2008 // byte
2009 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2010 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2011 movq mm2, mm0 // mov updated Raws to mm2
2012 psllq mm2, ShiftBpp // shift data to position correctly
2013 movq mm1, mm3 // now use mm1 for getting LBCarrys
2014 pand mm1, mm2 // get LBCarrys for each byte where both
2015 // lsb's were == 1 (Only valid for active group)
2016 psrlq mm2, 1 // divide raw bytes by 2
2017 pand mm2, mm4 // clear invalid bit 7 of each byte
2018 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2019 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2020 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2021 // byte
2022
2023 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2024 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2025 // bytes
2026 movq mm2, mm0 // mov updated Raws to mm2
2027 psllq mm2, ShiftBpp // shift data to position correctly
2028 // Data only needs to be shifted once here to
2029 // get the correct x-bpp offset.
2030 movq mm1, mm3 // now use mm1 for getting LBCarrys
2031 pand mm1, mm2 // get LBCarrys for each byte where both
2032 // lsb's were == 1 (Only valid for active group)
2033 psrlq mm2, 1 // divide raw bytes by 2
2034 pand mm2, mm4 // clear invalid bit 7 of each byte
2035 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2036 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2037 add ebx, 8
2038 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2039 // byte
2040
2041 // Now ready to write back to memory
2042 movq [edi + ebx - 8], mm0
2043 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2044 cmp ebx, MMXLength
2045 movq mm2, mm0 // mov updated Raw(x) to mm2
2046 jb davg3lp
2047 } // end _asm block
2048 }
2049 break;
2050
2051 case 6:
2052 case 4:
2053 case 7:
2054 case 5:
2055 {
2056 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2057 // appropriate inactive bytes
2058 ShiftBpp.use = bpp << 3;
2059 ShiftRem.use = 64 - ShiftBpp.use;
2060 _asm {
2061 movq mm4, HBClearMask
2062 // Re-init address pointers and offset
2063 mov ebx, diff // ebx ==> x = offset to alignment boundary
2064 // Load ActiveMask and clear all bytes except for 1st active group
2065 movq mm7, ActiveMask
2066 mov edi, row // edi ==> Avg(x)
2067 psrlq mm7, ShiftRem
2068 mov esi, prev_row // esi ==> Prior(x)
2069 movq mm6, mm7
2070 movq mm5, LBCarryMask
2071 psllq mm6, ShiftBpp // Create mask for 2nd active group
2072 // PRIME the pump (load the first Raw(x-bpp) data set
2073 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2074 // (we correct position in loop below)
2075davg4lp:
2076 movq mm0, [edi + ebx]
2077 psrlq mm2, ShiftRem // shift data to position correctly
2078 movq mm1, [esi + ebx]
2079 // Add (Prev_row/2) to Average
2080 movq mm3, mm5
2081 pand mm3, mm1 // get lsb for each prev_row byte
2082 psrlq mm1, 1 // divide prev_row bytes by 2
2083 pand mm1, mm4 // clear invalid bit 7 of each byte
2084 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2085 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2086 movq mm1, mm3 // now use mm1 for getting LBCarrys
2087 pand mm1, mm2 // get LBCarrys for each byte where both
2088 // lsb's were == 1 (Only valid for active group)
2089 psrlq mm2, 1 // divide raw bytes by 2
2090 pand mm2, mm4 // clear invalid bit 7 of each byte
2091 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2092 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2093 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2094 // byte
2095 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2096 movq mm2, mm0 // mov updated Raws to mm2
2097 psllq mm2, ShiftBpp // shift data to position correctly
2098 add ebx, 8
2099 movq mm1, mm3 // now use mm1 for getting LBCarrys
2100 pand mm1, mm2 // get LBCarrys for each byte where both
2101 // lsb's were == 1 (Only valid for active group)
2102 psrlq mm2, 1 // divide raw bytes by 2
2103 pand mm2, mm4 // clear invalid bit 7 of each byte
2104 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2105 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2106 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2107 // byte
2108 cmp ebx, MMXLength
2109 // Now ready to write back to memory
2110 movq [edi + ebx - 8], mm0
2111 // Prep Raw(x-bpp) for next loop
2112 movq mm2, mm0 // mov updated Raws to mm2
2113 jb davg4lp
2114 } // end _asm block
2115 }
2116 break;
2117 case 2:
2118 {
2119 ActiveMask.use = 0x000000000000ffff;
2120 ShiftBpp.use = 24; // == 3 * 8
2121 ShiftRem.use = 40; // == 64 - 24
2122 _asm {
2123 // Load ActiveMask
2124 movq mm7, ActiveMask
2125 // Re-init address pointers and offset
2126 mov ebx, diff // ebx ==> x = offset to alignment boundary
2127 movq mm5, LBCarryMask
2128 mov edi, row // edi ==> Avg(x)
2129 movq mm4, HBClearMask
2130 mov esi, prev_row // esi ==> Prior(x)
2131 // PRIME the pump (load the first Raw(x-bpp) data set
2132 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2133 // (we correct position in loop below)
2134davg2lp:
2135 movq mm0, [edi + ebx]
2136 psllq mm2, ShiftRem // shift data to position correctly
2137 movq mm1, [esi + ebx]
2138 // Add (Prev_row/2) to Average
2139 movq mm3, mm5
2140 pand mm3, mm1 // get lsb for each prev_row byte
2141 psrlq mm1, 1 // divide prev_row bytes by 2
2142 pand mm1, mm4 // clear invalid bit 7 of each byte
2143 movq mm6, mm7
2144 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2145 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2146 movq mm1, mm3 // now use mm1 for getting LBCarrys
2147 pand mm1, mm2 // get LBCarrys for each byte where both
2148 // lsb's were == 1 (Only valid for active group)
2149 psrlq mm2, 1 // divide raw bytes by 2
2150 pand mm2, mm4 // clear invalid bit 7 of each byte
2151 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2152 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2153 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2154 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2155 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2156 movq mm2, mm0 // mov updated Raws to mm2
2157 psllq mm2, ShiftBpp // shift data to position correctly
2158 movq mm1, mm3 // now use mm1 for getting LBCarrys
2159 pand mm1, mm2 // get LBCarrys for each byte where both
2160 // lsb's were == 1 (Only valid for active group)
2161 psrlq mm2, 1 // divide raw bytes by 2
2162 pand mm2, mm4 // clear invalid bit 7 of each byte
2163 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2164 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2165 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2166
            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2168 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2169 movq mm2, mm0 // mov updated Raws to mm2
2170 psllq mm2, ShiftBpp // shift data to position correctly
2171 // Data only needs to be shifted once here to
2172 // get the correct x-bpp offset.
2173 movq mm1, mm3 // now use mm1 for getting LBCarrys
2174 pand mm1, mm2 // get LBCarrys for each byte where both
2175 // lsb's were == 1 (Only valid for active group)
2176 psrlq mm2, 1 // divide raw bytes by 2
2177 pand mm2, mm4 // clear invalid bit 7 of each byte
2178 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2179 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2180 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2181
2182 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2183 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2184 movq mm2, mm0 // mov updated Raws to mm2
2185 psllq mm2, ShiftBpp // shift data to position correctly
2186 // Data only needs to be shifted once here to
2187 // get the correct x-bpp offset.
2188 add ebx, 8
2189 movq mm1, mm3 // now use mm1 for getting LBCarrys
2190 pand mm1, mm2 // get LBCarrys for each byte where both
2191 // lsb's were == 1 (Only valid for active group)
2192 psrlq mm2, 1 // divide raw bytes by 2
2193 pand mm2, mm4 // clear invalid bit 7 of each byte
2194 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2195 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2196 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2197
2198 cmp ebx, MMXLength
2199 // Now ready to write back to memory
2200 movq [edi + ebx - 8], mm0
2201 // Prep Raw(x-bpp) for next loop
2202 movq mm2, mm0 // mov updated Raws to mm2
2203 jb davg2lp
2204 } // end _asm block
2205 }
2206 break;
2207
2208 case 1: // bpp == 1
2209 {
2210 _asm {
2211 // Re-init address pointers and offset
2212 mov ebx, diff // ebx ==> x = offset to alignment boundary
2213 mov edi, row // edi ==> Avg(x)
2214 cmp ebx, FullLength // Test if offset at end of array
2215 jnb davg1end
2216 // Do Paeth decode for remaining bytes
2217 mov esi, prev_row // esi ==> Prior(x)
2218 mov edx, edi
2219 xor ecx, ecx // zero ecx before using cl & cx in loop below
2220 sub edx, bpp // edx ==> Raw(x-bpp)
2221davg1lp:
2222 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2223 xor eax, eax
2224 mov cl, [esi + ebx] // load cl with Prior(x)
2225 mov al, [edx + ebx] // load al with Raw(x-bpp)
2226 add ax, cx
2227 inc ebx
2228 shr ax, 1 // divide by 2
2229 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2230 cmp ebx, FullLength // Check if at end of array
2231 mov [edi+ebx-1], al // Write back Raw(x);
2232 // mov does not affect flags; -1 to offset inc ebx
2233 jb davg1lp
2234davg1end:
2235 } // end _asm block
2236 }
2237 return;
2238
2239 case 8: // bpp == 8
2240 {
2241 _asm {
2242 // Re-init address pointers and offset
2243 mov ebx, diff // ebx ==> x = offset to alignment boundary
2244 movq mm5, LBCarryMask
2245 mov edi, row // edi ==> Avg(x)
2246 movq mm4, HBClearMask
2247 mov esi, prev_row // esi ==> Prior(x)
2248 // PRIME the pump (load the first Raw(x-bpp) data set
2249 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2250 // (NO NEED to correct position in loop below)
2251davg8lp:
2252 movq mm0, [edi + ebx]
2253 movq mm3, mm5
2254 movq mm1, [esi + ebx]
2255 add ebx, 8
2256 pand mm3, mm1 // get lsb for each prev_row byte
2257 psrlq mm1, 1 // divide prev_row bytes by 2
2258 pand mm3, mm2 // get LBCarrys for each byte where both
2259 // lsb's were == 1
2260 psrlq mm2, 1 // divide raw bytes by 2
2261 pand mm1, mm4 // clear invalid bit 7 of each byte
2262 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2263 pand mm2, mm4 // clear invalid bit 7 of each byte
2264 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2265 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2266 cmp ebx, MMXLength
2267 movq [edi + ebx - 8], mm0
2268 movq mm2, mm0 // reuse as Raw(x-bpp)
2269 jb davg8lp
2270 } // end _asm block
2271 }
2272 break;
2273 default: // bpp greater than 8
2274 {
2275 _asm {
2276 movq mm5, LBCarryMask
2277 // Re-init address pointers and offset
2278 mov ebx, diff // ebx ==> x = offset to alignment boundary
2279 mov edi, row // edi ==> Avg(x)
2280 movq mm4, HBClearMask
2281 mov edx, edi
2282 mov esi, prev_row // esi ==> Prior(x)
2283 sub edx, bpp // edx ==> Raw(x-bpp)
2284davgAlp:
2285 movq mm0, [edi + ebx]
2286 movq mm3, mm5
2287 movq mm1, [esi + ebx]
2288 pand mm3, mm1 // get lsb for each prev_row byte
2289 movq mm2, [edx + ebx]
2290 psrlq mm1, 1 // divide prev_row bytes by 2
2291 pand mm3, mm2 // get LBCarrys for each byte where both
2292 // lsb's were == 1
2293 psrlq mm2, 1 // divide raw bytes by 2
2294 pand mm1, mm4 // clear invalid bit 7 of each byte
2295 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2296 pand mm2, mm4 // clear invalid bit 7 of each byte
2297 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2298 add ebx, 8
2299 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2300 cmp ebx, MMXLength
2301 movq [edi + ebx - 8], mm0
2302 jb davgAlp
2303 } // end _asm block
2304 }
2305 break;
2306 } // end switch ( bpp )
2307
2308 _asm {
2309 // MMX acceleration complete now do clean-up
2310 // Check if any remaining bytes left to decode
2311 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2312 mov edi, row // edi ==> Avg(x)
2313 cmp ebx, FullLength // Test if offset at end of array
2314 jnb davgend
2315 // Do Paeth decode for remaining bytes
2316 mov esi, prev_row // esi ==> Prior(x)
2317 mov edx, edi
2318 xor ecx, ecx // zero ecx before using cl & cx in loop below
2319 sub edx, bpp // edx ==> Raw(x-bpp)
2320davglp2:
2321 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2322 xor eax, eax
2323 mov cl, [esi + ebx] // load cl with Prior(x)
2324 mov al, [edx + ebx] // load al with Raw(x-bpp)
2325 add ax, cx
2326 inc ebx
2327 shr ax, 1 // divide by 2
2328 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2329 cmp ebx, FullLength // Check if at end of array
2330 mov [edi+ebx-1], al // Write back Raw(x);
2331 // mov does not affect flags; -1 to offset inc ebx
2332 jb davglp2
2333davgend:
2334 emms // End MMX instructions; prep for possible FP instrs.
2335 } // end _asm block
2336}
2337
2338// Optimized code for PNG Paeth filter decoder
//
// Reverses the PNG Paeth filter on one row, in place, using x86 integer and
// MMX instructions (Microsoft inline _asm syntax).
//
//   row_info - supplies pixel_depth (bpp = (pixel_depth + 7) >> 3 bytes per
//              pixel) and rowbytes (number of bytes processed).
//   row      - on entry the filtered bytes Paeth(x); each byte is overwritten
//              with Raw(x) = (Paeth(x) + PaethPredictor(a, b, c)) mod 256.
//   prev_row - bytes of the previous row, Prior(x); only read here.
//
// In the comments below a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp),
// pa = abs(b - c), pb = abs(a - c), pc = abs((a - c) + (b - c)).  The
// predictor is a if pa <= pb and pa <= pc, else b if pb <= pc, else c.
//
// Phases:
//   1. First bpp bytes: Raw(x) = Paeth(x) + Prior(x).
//   2. Scalar loop up to offset "diff", where row + diff is the first
//      8-byte aligned address that is at least 8 bytes past row + bpp.
//   3. For bpp 3, 4, 5/6/7 and 8: MMX loop, 8 bytes per iteration, from diff
//      up to MMXLength; then a scalar loop from MMXLength to rowbytes,
//      followed by emms.
//      For every other bpp: one scalar loop from diff to rowbytes, then an
//      early return (no MMX instruction is executed on that path).
//
// ActiveMask, ActiveMaskEnd, ActiveMask2, ShiftBpp and ShiftRem are not
// declared in this function; they are written through a ".use" member and
// read as 64-bit MMX operands.
//
// NOTE(review): the alignment loop and the MMX loops are bottom-tested
// against diff / MMXLength and never compare against rowbytes before their
// first iteration.  For a row with rowbytes < diff this looks like it would
// touch bytes past rowbytes -- confirm that callers only pass rows that are
// long enough (or suitably padded).
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -05002339void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05002340png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2341 png_bytep prev_row)
2342{
2343 png_uint_32 FullLength; // total number of bytes in the row
2344 png_uint_32 MMXLength; // offset at which the 8-byte MMX loop stops
2345 //png_uint_32 len;
2346 int bpp; // bytes per pixel
2347 int diff; // offset from row of the first aligned 8-byte group
2348 //int ptemp;
2349 int patemp, pbtemp, pctemp; // scratch storage for the scalar loops
2350
2351 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2352 FullLength = row_info->rowbytes; // # of bytes to filter
2353 _asm
2354 {
2355 xor ebx, ebx // ebx ==> x offset
2356 mov edi, row
2357 xor edx, edx // edx ==> x-bpp offset
2358 mov esi, prev_row
2359 xor eax, eax
2360
2361 // Compute the Raw value for the first bpp bytes
2362 // Note: the formula works out to be always
2363 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
2364dpthrlp:
2365 mov al, [edi + ebx]
2366 add al, [esi + ebx]
2367 inc ebx
2368 cmp ebx, bpp
2369 mov [edi + ebx - 1], al // mov does not affect flags; -1 offsets inc ebx
2370 jb dpthrlp
2371 // get # of bytes to alignment
 // (ebx == bpp here; diff ends up as the offset from row of the
 // aligned address, i.e. a value in the range bpp+8 .. bpp+15)
2372 mov diff, edi // take start of row
2373 add diff, ebx // add bpp
2374 xor ecx, ecx
2375 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2376 and diff, 0xfffffff8 // mask to alignment boundary
2377 sub diff, edi // subtract from start ==> value ebx at alignment
2378 jz dpthgo
2379 // fix alignment
 // Scalar Paeth decode, one byte per iteration, for offsets bpp .. diff-1.
 // Bottom-tested: the body runs once before ebx is compared with diff.
2380dpthlp1:
2381 xor eax, eax
2382 // pav = p - a = (a + b - c) - a = b - c
2383 mov al, [esi + ebx] // load Prior(x) into al
2384 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2385 sub eax, ecx // subtract Prior(x-bpp)
2386 mov patemp, eax // Save pav for later use
2387 xor eax, eax
2388 // pbv = p - b = (a + b - c) - b = a - c
2389 mov al, [edi + edx] // load Raw(x-bpp) into al
2390 sub eax, ecx // subtract Prior(x-bpp)
2391 mov ecx, eax
2392 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2393 add eax, patemp // pcv = pav + pbv
2394 // pc = abs(pcv)
2395 test eax, 0x80000000 // sign bit set?
2396 jz dpthpca
2397 neg eax // reverse sign of neg values
2398dpthpca:
2399 mov pctemp, eax // save pc for later use
2400 // pb = abs(pbv)
2401 test ecx, 0x80000000
2402 jz dpthpba
2403 neg ecx // reverse sign of neg values
2404dpthpba:
2405 mov pbtemp, ecx // save pb (pbtemp is not read again in this function)
2406 // pa = abs(pav)
2407 mov eax, patemp
2408 test eax, 0x80000000
2409 jz dpthpaa
2410 neg eax // reverse sign of neg values
2411dpthpaa:
2412 mov patemp, eax // save pa (the compares below use eax directly)
2413 // test if pa <= pb
2414 cmp eax, ecx
2415 jna dpthabb
2416 // pa > pb; now test if pb <= pc
2417 cmp ecx, pctemp
2418 jna dpthbbc
2419 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2420 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2421 jmp dpthpaeth
2422dpthbbc:
2423 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2424 mov cl, [esi + ebx] // load Prior(x) into cl
2425 jmp dpthpaeth
2426dpthabb:
2427 // pa <= pb; now test if pa <= pc
2428 cmp eax, pctemp
2429 jna dpthabc
2430 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2431 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2432 jmp dpthpaeth
2433dpthabc:
2434 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2435 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2436dpthpaeth:
2437 inc ebx
2438 inc edx
2439 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2440 add [edi + ebx - 1], cl // only cl is added; upper bits of ecx are ignored
2441 cmp ebx, diff
2442 jb dpthlp1
2443dpthgo:
 // MMXLength = FullLength - ((FullLength - ebx) & 7), so the span from
 // the current offset to MMXLength is a multiple of 8 bytes
2444 mov ecx, FullLength
2445 mov eax, ecx
2446 sub eax, ebx // subtract alignment fix
2447 and eax, 0x00000007 // calc bytes over mult of 8
2448 sub ecx, eax // drop over bytes from original length
2449 mov MMXLength, ecx
2450 } // end _asm block
2451 // Now do the math for the rest of the row
2452 switch ( bpp )
2453 {
2454 case 3:
2455 {
 // Each 8-byte group is decoded in three passes: bytes 0-2, bytes 3-5
 // and bytes 6-7.  Every pass widens a, b and c to 16-bit words so the
 // differences can be signed.
2456 ActiveMask.use = 0x0000000000ffffff; // selects bytes 0-2
2457 ActiveMaskEnd.use = 0xffff000000000000; // selects bytes 6-7
2458 ShiftBpp.use = 24; // == bpp(3) * 8
2459 ShiftRem.use = 40; // == 64 - 24
2460 _asm
2461 {
2462 mov ebx, diff
2463 mov edi, row
2464 mov esi, prev_row
2465 pxor mm0, mm0 // mm0 = 0, used as the zero source for unpacking
2466 // PRIME the pump (load the first Raw(x-bpp) data set
2467 movq mm1, [edi+ebx-8]
2468dpth3lp:
2469 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2470 movq mm2, [esi + ebx] // load b=Prior(x)
2471 punpcklbw mm1, mm0 // Unpack Low bytes of a into words
2472 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2473 punpcklbw mm2, mm0 // Unpack Low bytes of b into words
2474 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2475 // pav = p - a = (a + b - c) - a = b - c
2476 movq mm4, mm2
2477 punpcklbw mm3, mm0 // Unpack Low bytes of c into words
2478 // pbv = p - b = (a + b - c) - b = a - c
2479 movq mm5, mm1
2480 psubw mm4, mm3
2481 pxor mm7, mm7
2482 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2483 movq mm6, mm4
2484 psubw mm5, mm3
2485
2486 // pa = abs(p-a) = abs(pav)
2487 // pb = abs(p-b) = abs(pbv)
2488 // pc = abs(p-c) = abs(pcv)
 // abs() is done per word as v - 2*n, where n is v if v < 0 and
 // 0 otherwise (hence each psubw appears twice)
2489 pcmpgtw mm0, mm4 // Create mask of pav words < 0
2490 paddw mm6, mm5
2491 pand mm0, mm4 // Only pav words < 0 kept in mm0
2492 pcmpgtw mm7, mm5 // Create mask of pbv words < 0
2493 psubw mm4, mm0
2494 pand mm7, mm5 // Only pbv words < 0 kept in mm7
2495 psubw mm4, mm0
2496 psubw mm5, mm7
2497 pxor mm0, mm0
2498 pcmpgtw mm0, mm6 // Create mask of pcv words < 0
2499 pand mm0, mm6 // Only pcv words < 0 kept in mm0
2500 psubw mm5, mm7
2501 psubw mm6, mm0
2502 // test pa <= pb
2503 movq mm7, mm4
2504 psubw mm6, mm0
2505 pcmpgtw mm7, mm5 // pa > pb?
2506 movq mm0, mm7
2507 // use mm7 mask to merge pa & pb
2508 pand mm5, mm7
2509 // use mm0 mask copy to merge a & b
2510 pand mm2, mm0
2511 pandn mm7, mm4
2512 pandn mm0, mm1
2513 paddw mm7, mm5 // mm7 = (pa > pb) ? pb : pa
2514 paddw mm0, mm2 // mm0 = (pa > pb) ? b : a
2515 // test ((pa <= pb)? pa:pb) <= pc
2516 pcmpgtw mm7, mm6 // pab > pc?
2517 pxor mm1, mm1
2518 pand mm3, mm7
2519 pandn mm7, mm0
2520 paddw mm7, mm3 // mm7 = predictor: c where pab > pc, else a or b
2521 pxor mm0, mm0
2522 packuswb mm7, mm1 // pack predictor words into bytes 0-3
2523 movq mm3, [esi + ebx] // load c=Prior(x-bpp) for the 2nd set
2524 pand mm7, ActiveMask
2525 movq mm2, mm3 // load b=Prior(x) step 1
2526 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2527 punpcklbw mm3, mm0 // Unpack Low bytes of c into words
2528 movq [edi + ebx], mm7 // write back updated value
2529 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2530 // Now do Paeth for 2nd set of bytes (3-5)
2531 psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2532 punpcklbw mm1, mm0 // Unpack Low bytes of a into words
2533 pxor mm7, mm7
2534 punpcklbw mm2, mm0 // Unpack Low bytes of b into words
2535 // pbv = p - b = (a + b - c) - b = a - c
2536 movq mm5, mm1
2537 // pav = p - a = (a + b - c) - a = b - c
2538 movq mm4, mm2
2539 psubw mm5, mm3
2540 psubw mm4, mm3
2541 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2542 // pav + pbv = pbv + pav
2543 movq mm6, mm5
2544 paddw mm6, mm4
2545
2546 // pa = abs(p-a) = abs(pav)
2547 // pb = abs(p-b) = abs(pbv)
2548 // pc = abs(p-c) = abs(pcv)
2549 pcmpgtw mm0, mm5 // Create mask of pbv words < 0
2550 pcmpgtw mm7, mm4 // Create mask of pav words < 0
2551 pand mm0, mm5 // Only pbv words < 0 kept in mm0
2552 pand mm7, mm4 // Only pav words < 0 kept in mm7
2553 psubw mm5, mm0
2554 psubw mm4, mm7
2555 psubw mm5, mm0
2556 psubw mm4, mm7
2557 pxor mm0, mm0
2558 pcmpgtw mm0, mm6 // Create mask of pcv words < 0
2559 pand mm0, mm6 // Only pcv words < 0 kept in mm0
2560 psubw mm6, mm0
2561 // test pa <= pb
2562 movq mm7, mm4
2563 psubw mm6, mm0
2564 pcmpgtw mm7, mm5 // pa > pb?
2565 movq mm0, mm7
2566 // use mm7 mask to merge pa & pb
2567 pand mm5, mm7
2568 // use mm0 mask copy to merge a & b
2569 pand mm2, mm0
2570 pandn mm7, mm4
2571 pandn mm0, mm1
2572 paddw mm7, mm5
2573 paddw mm0, mm2
2574 // test ((pa <= pb)? pa:pb) <= pc
2575 pcmpgtw mm7, mm6 // pab > pc?
2576 movq mm2, [esi + ebx] // load b=Prior(x)
2577 pand mm3, mm7
2578 pandn mm7, mm0
2579 pxor mm1, mm1
2580 paddw mm7, mm3
2581 pxor mm0, mm0
2582 packuswb mm7, mm1
2583 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2584 pand mm7, ActiveMask
2585 punpckhbw mm2, mm0 // Unpack High bytes of b
2586 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2587 // pav = p - a = (a + b - c) - a = b - c
2588 movq mm4, mm2
2589 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2590 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2591 movq [edi + ebx], mm7 // write back updated value
2592 movq mm1, mm7
2593 punpckhbw mm3, mm0 // Unpack High bytes of c
2594 psllq mm1, ShiftBpp // Shift bytes
2595 // Now mm1 will be used as Raw(x-bpp)
2596 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2597 pxor mm7, mm7
2598 punpckhbw mm1, mm0 // Unpack High bytes of a
2599 psubw mm4, mm3
2600 // pbv = p - b = (a + b - c) - b = a - c
2601 movq mm5, mm1
2602 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2603 movq mm6, mm4
2604 psubw mm5, mm3
2605 pxor mm0, mm0
2606 paddw mm6, mm5
2607
2608 // pa = abs(p-a) = abs(pav)
2609 // pb = abs(p-b) = abs(pbv)
2610 // pc = abs(p-c) = abs(pcv)
2611 pcmpgtw mm0, mm4 // Create mask of pav words < 0
2612 pcmpgtw mm7, mm5 // Create mask of pbv words < 0
2613 pand mm0, mm4 // Only pav words < 0 kept in mm0
2614 pand mm7, mm5 // Only pbv words < 0 kept in mm7
2615 psubw mm4, mm0
2616 psubw mm5, mm7
2617 psubw mm4, mm0
2618 psubw mm5, mm7
2619 pxor mm0, mm0
2620 pcmpgtw mm0, mm6 // Create mask of pcv words < 0
2621 pand mm0, mm6 // Only pcv words < 0 kept in mm0
2622 psubw mm6, mm0
2623 // test pa <= pb
2624 movq mm7, mm4
2625 psubw mm6, mm0
2626 pcmpgtw mm7, mm5 // pa > pb?
2627 movq mm0, mm7
2628 // use mm0 mask copy to merge a & b
2629 pand mm2, mm0
2630 // use mm7 mask to merge pa & pb
2631 pand mm5, mm7
2632 pandn mm0, mm1
2633 pandn mm7, mm4
2634 paddw mm0, mm2
2635 paddw mm7, mm5
2636 // test ((pa <= pb)? pa:pb) <= pc
2637 pcmpgtw mm7, mm6 // pab > pc?
2638 pand mm3, mm7
2639 pandn mm7, mm0
2640 paddw mm7, mm3
2641 pxor mm1, mm1
2642 packuswb mm1, mm7 // predictor bytes land in bytes 4-7 of mm1
2643 // Step ebx to next set of 8 bytes and repeat loop til done
2644 add ebx, 8
2645 pand mm1, ActiveMaskEnd // keep only bytes 6-7
2646 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2647
2648 cmp ebx, MMXLength
2649 pxor mm0, mm0 // pxor does not affect flags
2650 movq [edi + ebx - 8], mm1 // write back updated value
2651 // mm1 will be used as Raw(x-bpp) next loop
2652 // (mm3 is reloaded from prev_row at the top of the loop)
2653 jb dpth3lp
2654 } // end _asm block
2655 }
2656 break;
2657
2658 case 6:
2659 case 7:
2660 case 5:
2661 {
 // Each 8-byte group is decoded in two passes of 4 bytes; a and c are
 // built by shifting the previous 8 bytes right by ShiftRem and, for
 // the second pass, merging in the current bytes shifted left by
 // ShiftBpp.
2662 ActiveMask.use = 0x00000000ffffffff; // selects bytes 0-3
2663 ActiveMask2.use = 0xffffffff00000000; // set here but not referenced in the asm block below
2664 ShiftBpp.use = bpp << 3; // == bpp * 8
2665 ShiftRem.use = 64 - ShiftBpp.use;
2666 _asm
2667 {
2668 mov ebx, diff
2669 mov edi, row
2670 mov esi, prev_row
2671 // PRIME the pump (load the first Raw(x-bpp) data set
2672 movq mm1, [edi+ebx-8]
2673 pxor mm0, mm0
2674dpth6lp:
2675 // Must shift to position Raw(x-bpp) data
2676 psrlq mm1, ShiftRem
2677 // Do first set of 4 bytes
2678 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2679 punpcklbw mm1, mm0 // Unpack Low bytes of a
2680 movq mm2, [esi + ebx] // load b=Prior(x)
2681 punpcklbw mm2, mm0 // Unpack Low bytes of b
2682 // Must shift to position Prior(x-bpp) data
2683 psrlq mm3, ShiftRem
2684 // pav = p - a = (a + b - c) - a = b - c
2685 movq mm4, mm2
2686 punpcklbw mm3, mm0 // Unpack Low bytes of c
2687 // pbv = p - b = (a + b - c) - b = a - c
2688 movq mm5, mm1
2689 psubw mm4, mm3
2690 pxor mm7, mm7
2691 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2692 movq mm6, mm4
2693 psubw mm5, mm3
2694 // pa = abs(p-a) = abs(pav)
2695 // pb = abs(p-b) = abs(pbv)
2696 // pc = abs(p-c) = abs(pcv)
2697 pcmpgtw mm0, mm4 // Create mask of pav words < 0
2698 paddw mm6, mm5
2699 pand mm0, mm4 // Only pav words < 0 kept in mm0
2700 pcmpgtw mm7, mm5 // Create mask of pbv words < 0
2701 psubw mm4, mm0
2702 pand mm7, mm5 // Only pbv words < 0 kept in mm7
2703 psubw mm4, mm0
2704 psubw mm5, mm7
2705 pxor mm0, mm0
2706 pcmpgtw mm0, mm6 // Create mask of pcv words < 0
2707 pand mm0, mm6 // Only pcv words < 0 kept in mm0
2708 psubw mm5, mm7
2709 psubw mm6, mm0
2710 // test pa <= pb
2711 movq mm7, mm4
2712 psubw mm6, mm0
2713 pcmpgtw mm7, mm5 // pa > pb?
2714 movq mm0, mm7
2715 // use mm7 mask to merge pa & pb
2716 pand mm5, mm7
2717 // use mm0 mask copy to merge a & b
2718 pand mm2, mm0
2719 pandn mm7, mm4
2720 pandn mm0, mm1
2721 paddw mm7, mm5
2722 paddw mm0, mm2
2723 // test ((pa <= pb)? pa:pb) <= pc
2724 pcmpgtw mm7, mm6 // pab > pc?
2725 pxor mm1, mm1
2726 pand mm3, mm7
2727 pandn mm7, mm0
2728 paddw mm7, mm3
2729 pxor mm0, mm0
2730 packuswb mm7, mm1
2731 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2732 pand mm7, ActiveMask
2733 psrlq mm3, ShiftRem
2734 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2735 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2736 movq mm6, mm2
2737 movq [edi + ebx], mm7 // write back updated value
2738 movq mm1, [edi+ebx-8]
2739 psllq mm6, ShiftBpp
2740 movq mm5, mm7
2741 psrlq mm1, ShiftRem
2742 por mm3, mm6 // c = tail of previous Prior group | head of current one
2743 psllq mm5, ShiftBpp
2744 punpckhbw mm3, mm0 // Unpack High bytes of c
2745 por mm1, mm5 // a = tail of previous Raw group | bytes just decoded
2746 // Do second set of 4 bytes
2747 punpckhbw mm2, mm0 // Unpack High bytes of b
2748 punpckhbw mm1, mm0 // Unpack High bytes of a
2749 // pav = p - a = (a + b - c) - a = b - c
2750 movq mm4, mm2
2751 // pbv = p - b = (a + b - c) - b = a - c
2752 movq mm5, mm1
2753 psubw mm4, mm3
2754 pxor mm7, mm7
2755 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2756 movq mm6, mm4
2757 psubw mm5, mm3
2758 // pa = abs(p-a) = abs(pav)
2759 // pb = abs(p-b) = abs(pbv)
2760 // pc = abs(p-c) = abs(pcv)
2761 pcmpgtw mm0, mm4 // Create mask of pav words < 0
2762 paddw mm6, mm5
2763 pand mm0, mm4 // Only pav words < 0 kept in mm0
2764 pcmpgtw mm7, mm5 // Create mask of pbv words < 0
2765 psubw mm4, mm0
2766 pand mm7, mm5 // Only pbv words < 0 kept in mm7
2767 psubw mm4, mm0
2768 psubw mm5, mm7
2769 pxor mm0, mm0
2770 pcmpgtw mm0, mm6 // Create mask of pcv words < 0
2771 pand mm0, mm6 // Only pcv words < 0 kept in mm0
2772 psubw mm5, mm7
2773 psubw mm6, mm0
2774 // test pa <= pb
2775 movq mm7, mm4
2776 psubw mm6, mm0
2777 pcmpgtw mm7, mm5 // pa > pb?
2778 movq mm0, mm7
2779 // use mm7 mask to merge pa & pb
2780 pand mm5, mm7
2781 // use mm0 mask copy to merge a & b
2782 pand mm2, mm0
2783 pandn mm7, mm4
2784 pandn mm0, mm1
2785 paddw mm7, mm5
2786 paddw mm0, mm2
2787 // test ((pa <= pb)? pa:pb) <= pc
2788 pcmpgtw mm7, mm6 // pab > pc?
2789 pxor mm1, mm1
2790 pand mm3, mm7
2791 pandn mm7, mm0
2792 pxor mm1, mm1 // (mm1 was already cleared above)
2793 paddw mm7, mm3
2794 pxor mm0, mm0
2795 // Step ebx to next set of 8 bytes and repeat loop til done
2796 add ebx, 8
2797 packuswb mm1, mm7 // predictor in bytes 4-7, bytes 0-3 are zero
2798 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2799 cmp ebx, MMXLength
2800 movq [edi + ebx - 8], mm1 // write back updated value
2801 // mm1 will be used as Raw(x-bpp) next loop
2802 jb dpth6lp
2803 } // end _asm block
2804 }
2805 break;
2806
2807 case 4:
2808 {
 // Two passes of 4 bytes per 8-byte group.  With bpp == 4, a and c for
 // the first pass are the high 4 bytes of the previous group and for
 // the second pass the low 4 bytes of the current group.
2809 ActiveMask.use = 0x00000000ffffffff; // selects bytes 0-3
2810 _asm {
2811 mov ebx, diff
2812 mov edi, row
2813 mov esi, prev_row
2814 pxor mm0, mm0
2815 // PRIME the pump (load the first Raw(x-bpp) data set
2816 movq mm1, [edi+ebx-8] // Only time should need to read
2817 // a=Raw(x-bpp) bytes
2818dpth4lp:
2819 // Do first set of 4 bytes
2820 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2821 punpckhbw mm1, mm0 // Unpack High bytes of a
2822 movq mm2, [esi + ebx] // load b=Prior(x)
2823 punpcklbw mm2, mm0 // Unpack Low bytes of b
2824 // pav = p - a = (a + b - c) - a = b - c
2825 movq mm4, mm2
2826 punpckhbw mm3, mm0 // Unpack High bytes of c
2827 // pbv = p - b = (a + b - c) - b = a - c
2828 movq mm5, mm1
2829 psubw mm4, mm3
2830 pxor mm7, mm7
2831 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2832 movq mm6, mm4
2833 psubw mm5, mm3
2834 // pa = abs(p-a) = abs(pav)
2835 // pb = abs(p-b) = abs(pbv)
2836 // pc = abs(p-c) = abs(pcv)
2837 pcmpgtw mm0, mm4 // Create mask of pav words < 0
2838 paddw mm6, mm5
2839 pand mm0, mm4 // Only pav words < 0 kept in mm0
2840 pcmpgtw mm7, mm5 // Create mask of pbv words < 0
2841 psubw mm4, mm0
2842 pand mm7, mm5 // Only pbv words < 0 kept in mm7
2843 psubw mm4, mm0
2844 psubw mm5, mm7
2845 pxor mm0, mm0
2846 pcmpgtw mm0, mm6 // Create mask of pcv words < 0
2847 pand mm0, mm6 // Only pcv words < 0 kept in mm0
2848 psubw mm5, mm7
2849 psubw mm6, mm0
2850 // test pa <= pb
2851 movq mm7, mm4
2852 psubw mm6, mm0
2853 pcmpgtw mm7, mm5 // pa > pb?
2854 movq mm0, mm7
2855 // use mm7 mask to merge pa & pb
2856 pand mm5, mm7
2857 // use mm0 mask copy to merge a & b
2858 pand mm2, mm0
2859 pandn mm7, mm4
2860 pandn mm0, mm1
2861 paddw mm7, mm5
2862 paddw mm0, mm2
2863 // test ((pa <= pb)? pa:pb) <= pc
2864 pcmpgtw mm7, mm6 // pab > pc?
2865 pxor mm1, mm1
2866 pand mm3, mm7
2867 pandn mm7, mm0
2868 paddw mm7, mm3
2869 pxor mm0, mm0
2870 packuswb mm7, mm1
2871 movq mm3, [esi + ebx] // load c=Prior(x-bpp) for the 2nd set
2872 pand mm7, ActiveMask
2873 movq mm2, mm3 // load b=Prior(x) step 1
2874 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2875 punpcklbw mm3, mm0 // Unpack Low bytes of c
2876 movq [edi + ebx], mm7 // write back updated value
2877 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2878 // Do second set of 4 bytes
2879 punpckhbw mm2, mm0 // Unpack High bytes of b
2880 punpcklbw mm1, mm0 // Unpack Low bytes of a
2881 // pav = p - a = (a + b - c) - a = b - c
2882 movq mm4, mm2
2883 // pbv = p - b = (a + b - c) - b = a - c
2884 movq mm5, mm1
2885 psubw mm4, mm3
2886 pxor mm7, mm7
2887 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2888 movq mm6, mm4
2889 psubw mm5, mm3
2890 // pa = abs(p-a) = abs(pav)
2891 // pb = abs(p-b) = abs(pbv)
2892 // pc = abs(p-c) = abs(pcv)
2893 pcmpgtw mm0, mm4 // Create mask of pav words < 0
2894 paddw mm6, mm5
2895 pand mm0, mm4 // Only pav words < 0 kept in mm0
2896 pcmpgtw mm7, mm5 // Create mask of pbv words < 0
2897 psubw mm4, mm0
2898 pand mm7, mm5 // Only pbv words < 0 kept in mm7
2899 psubw mm4, mm0
2900 psubw mm5, mm7
2901 pxor mm0, mm0
2902 pcmpgtw mm0, mm6 // Create mask of pcv words < 0
2903 pand mm0, mm6 // Only pcv words < 0 kept in mm0
2904 psubw mm5, mm7
2905 psubw mm6, mm0
2906 // test pa <= pb
2907 movq mm7, mm4
2908 psubw mm6, mm0
2909 pcmpgtw mm7, mm5 // pa > pb?
2910 movq mm0, mm7
2911 // use mm7 mask to merge pa & pb
2912 pand mm5, mm7
2913 // use mm0 mask copy to merge a & b
2914 pand mm2, mm0
2915 pandn mm7, mm4
2916 pandn mm0, mm1
2917 paddw mm7, mm5
2918 paddw mm0, mm2
2919 // test ((pa <= pb)? pa:pb) <= pc
2920 pcmpgtw mm7, mm6 // pab > pc?
2921 pxor mm1, mm1
2922 pand mm3, mm7
2923 pandn mm7, mm0
2924 pxor mm1, mm1 // (mm1 was already cleared above)
2925 paddw mm7, mm3
2926 pxor mm0, mm0
2927 // Step ebx to next set of 8 bytes and repeat loop til done
2928 add ebx, 8
2929 packuswb mm1, mm7 // predictor in bytes 4-7, bytes 0-3 are zero
2930 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2931 cmp ebx, MMXLength
2932 movq [edi + ebx - 8], mm1 // write back updated value
2933 // mm1 will be used as Raw(x-bpp) next loop
2934 jb dpth4lp
2935 } // end _asm block
2936 }
2937 break;
2938 case 8: // bpp == 8
2939 {
 // Two passes of 4 bytes per 8-byte group.  With bpp == 8, a and c for
 // both passes come from the previous 8-byte group (low half, then
 // high half).
2940 ActiveMask.use = 0x00000000ffffffff; // selects bytes 0-3
2941 _asm {
2942 mov ebx, diff
2943 mov edi, row
2944 mov esi, prev_row
2945 pxor mm0, mm0
2946 // PRIME the pump (load the first Raw(x-bpp) data set
2947 movq mm1, [edi+ebx-8] // a=Raw(x-bpp) bytes for the first pass;
2948 // later iterations reuse the group just written
2949dpth8lp:
2950 // Do first set of 4 bytes
2951 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2952 punpcklbw mm1, mm0 // Unpack Low bytes of a
2953 movq mm2, [esi + ebx] // load b=Prior(x)
2954 punpcklbw mm2, mm0 // Unpack Low bytes of b
2955 // pav = p - a = (a + b - c) - a = b - c
2956 movq mm4, mm2
2957 punpcklbw mm3, mm0 // Unpack Low bytes of c
2958 // pbv = p - b = (a + b - c) - b = a - c
2959 movq mm5, mm1
2960 psubw mm4, mm3
2961 pxor mm7, mm7
2962 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2963 movq mm6, mm4
2964 psubw mm5, mm3
2965 // pa = abs(p-a) = abs(pav)
2966 // pb = abs(p-b) = abs(pbv)
2967 // pc = abs(p-c) = abs(pcv)
2968 pcmpgtw mm0, mm4 // Create mask of pav words < 0
2969 paddw mm6, mm5
2970 pand mm0, mm4 // Only pav words < 0 kept in mm0
2971 pcmpgtw mm7, mm5 // Create mask of pbv words < 0
2972 psubw mm4, mm0
2973 pand mm7, mm5 // Only pbv words < 0 kept in mm7
2974 psubw mm4, mm0
2975 psubw mm5, mm7
2976 pxor mm0, mm0
2977 pcmpgtw mm0, mm6 // Create mask of pcv words < 0
2978 pand mm0, mm6 // Only pcv words < 0 kept in mm0
2979 psubw mm5, mm7
2980 psubw mm6, mm0
2981 // test pa <= pb
2982 movq mm7, mm4
2983 psubw mm6, mm0
2984 pcmpgtw mm7, mm5 // pa > pb?
2985 movq mm0, mm7
2986 // use mm7 mask to merge pa & pb
2987 pand mm5, mm7
2988 // use mm0 mask copy to merge a & b
2989 pand mm2, mm0
2990 pandn mm7, mm4
2991 pandn mm0, mm1
2992 paddw mm7, mm5
2993 paddw mm0, mm2
2994 // test ((pa <= pb)? pa:pb) <= pc
2995 pcmpgtw mm7, mm6 // pab > pc?
2996 pxor mm1, mm1
2997 pand mm3, mm7
2998 pandn mm7, mm0
2999 paddw mm7, mm3
3000 pxor mm0, mm0
3001 packuswb mm7, mm1
3002 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
3003 pand mm7, ActiveMask
3004 movq mm2, [esi + ebx] // load b=Prior(x)
3005 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
3006 punpckhbw mm3, mm0 // Unpack High bytes of c
3007 movq [edi + ebx], mm7 // write back updated value
3008 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
3009
3010 // Do second set of 4 bytes
3011 punpckhbw mm2, mm0 // Unpack High bytes of b
3012 punpckhbw mm1, mm0 // Unpack High bytes of a
3013 // pav = p - a = (a + b - c) - a = b - c
3014 movq mm4, mm2
3015 // pbv = p - b = (a + b - c) - b = a - c
3016 movq mm5, mm1
3017 psubw mm4, mm3
3018 pxor mm7, mm7
3019 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3020 movq mm6, mm4
3021 psubw mm5, mm3
3022 // pa = abs(p-a) = abs(pav)
3023 // pb = abs(p-b) = abs(pbv)
3024 // pc = abs(p-c) = abs(pcv)
3025 pcmpgtw mm0, mm4 // Create mask of pav words < 0
3026 paddw mm6, mm5
3027 pand mm0, mm4 // Only pav words < 0 kept in mm0
3028 pcmpgtw mm7, mm5 // Create mask of pbv words < 0
3029 psubw mm4, mm0
3030 pand mm7, mm5 // Only pbv words < 0 kept in mm7
3031 psubw mm4, mm0
3032 psubw mm5, mm7
3033 pxor mm0, mm0
3034 pcmpgtw mm0, mm6 // Create mask of pcv words < 0
3035 pand mm0, mm6 // Only pcv words < 0 kept in mm0
3036 psubw mm5, mm7
3037 psubw mm6, mm0
3038 // test pa <= pb
3039 movq mm7, mm4
3040 psubw mm6, mm0
3041 pcmpgtw mm7, mm5 // pa > pb?
3042 movq mm0, mm7
3043 // use mm7 mask to merge pa & pb
3044 pand mm5, mm7
3045 // use mm0 mask copy to merge a & b
3046 pand mm2, mm0
3047 pandn mm7, mm4
3048 pandn mm0, mm1
3049 paddw mm7, mm5
3050 paddw mm0, mm2
3051 // test ((pa <= pb)? pa:pb) <= pc
3052 pcmpgtw mm7, mm6 // pab > pc?
3053 pxor mm1, mm1
3054 pand mm3, mm7
3055 pandn mm7, mm0
3056 pxor mm1, mm1 // (mm1 was already cleared above)
3057 paddw mm7, mm3
3058 pxor mm0, mm0
3059 // Step ebx to next set of 8 bytes and repeat loop til done
3060 add ebx, 8
3061 packuswb mm1, mm7 // predictor in bytes 4-7, bytes 0-3 are zero
3062 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3063 cmp ebx, MMXLength
3064 movq [edi + ebx - 8], mm1 // write back updated value
3065 // mm1 will be used as Raw(x-bpp) next loop
3066 jb dpth8lp
3067 } // end _asm block
3068 }
3069 break;
3070
3071 case 1: // bpp = 1
3072 case 2: // bpp = 2
3073 default: // every bpp without an MMX case above (including bpp > 8)
3074 {
 // No MMX path for these pixel sizes: decode everything from diff to
 // the end of the row with the scalar loop, then return.
3075 _asm {
3076 mov ebx, diff
3077 cmp ebx, FullLength
3078 jnb dpthdend
3079 mov edi, row
3080 mov esi, prev_row
3081 // Do Paeth decode for remaining bytes
3082 mov edx, ebx
3083 xor ecx, ecx // zero ecx before using cl & cx in loop below
3084 sub edx, bpp // Set edx = ebx - bpp
3085dpthdlp:
3086 xor eax, eax
3087 // pav = p - a = (a + b - c) - a = b - c
3088 mov al, [esi + ebx] // load Prior(x) into al
3089 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3090 sub eax, ecx // subtract Prior(x-bpp)
3091 mov patemp, eax // Save pav for later use
3092 xor eax, eax
3093 // pbv = p - b = (a + b - c) - b = a - c
3094 mov al, [edi + edx] // load Raw(x-bpp) into al
3095 sub eax, ecx // subtract Prior(x-bpp)
3096 mov ecx, eax
3097 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3098 add eax, patemp // pcv = pav + pbv
3099 // pc = abs(pcv)
3100 test eax, 0x80000000
3101 jz dpthdpca
3102 neg eax // reverse sign of neg values
3103dpthdpca:
3104 mov pctemp, eax // save pc for later use
3105 // pb = abs(pbv)
3106 test ecx, 0x80000000
3107 jz dpthdpba
3108 neg ecx // reverse sign of neg values
3109dpthdpba:
3110 mov pbtemp, ecx // save pb (pbtemp is not read again in this function)
3111 // pa = abs(pav)
3112 mov eax, patemp
3113 test eax, 0x80000000
3114 jz dpthdpaa
3115 neg eax // reverse sign of neg values
3116dpthdpaa:
3117 mov patemp, eax // save pa (the compares below use eax directly)
3118 // test if pa <= pb
3119 cmp eax, ecx
3120 jna dpthdabb
3121 // pa > pb; now test if pb <= pc
3122 cmp ecx, pctemp
3123 jna dpthdbbc
3124 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3125 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3126 jmp dpthdpaeth
3127dpthdbbc:
3128 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3129 mov cl, [esi + ebx] // load Prior(x) into cl
3130 jmp dpthdpaeth
3131dpthdabb:
3132 // pa <= pb; now test if pa <= pc
3133 cmp eax, pctemp
3134 jna dpthdabc
3135 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3136 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3137 jmp dpthdpaeth
3138dpthdabc:
3139 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3140 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3141dpthdpaeth:
3142 inc ebx
3143 inc edx
3144 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3145 add [edi + ebx - 1], cl
3146 cmp ebx, FullLength
3147 jb dpthdlp
3148dpthdend:
3149 } // end _asm block
3150 }
3151 return; // No need to go further with this one
3152 } // end switch ( bpp )
3153 _asm
3154 {
3155 // MMX acceleration complete now do clean-up
3156 // Check if any remaining bytes left to decode
3157 mov ebx, MMXLength
3158 cmp ebx, FullLength
3159 jnb dpthend
3160 mov edi, row
3161 mov esi, prev_row
3162 // Do Paeth decode for remaining bytes
 // (same scalar algorithm as the alignment loop, for offsets
 // MMXLength .. FullLength-1)
3163 mov edx, ebx
3164 xor ecx, ecx // zero ecx before using cl & cx in loop below
3165 sub edx, bpp // Set edx = ebx - bpp
3166dpthlp2:
3167 xor eax, eax
3168 // pav = p - a = (a + b - c) - a = b - c
3169 mov al, [esi + ebx] // load Prior(x) into al
3170 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3171 sub eax, ecx // subtract Prior(x-bpp)
3172 mov patemp, eax // Save pav for later use
3173 xor eax, eax
3174 // pbv = p - b = (a + b - c) - b = a - c
3175 mov al, [edi + edx] // load Raw(x-bpp) into al
3176 sub eax, ecx // subtract Prior(x-bpp)
3177 mov ecx, eax
3178 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3179 add eax, patemp // pcv = pav + pbv
3180 // pc = abs(pcv)
3181 test eax, 0x80000000
3182 jz dpthpca2
3183 neg eax // reverse sign of neg values
3184dpthpca2:
3185 mov pctemp, eax // save pc for later use
3186 // pb = abs(pbv)
3187 test ecx, 0x80000000
3188 jz dpthpba2
3189 neg ecx // reverse sign of neg values
3190dpthpba2:
3191 mov pbtemp, ecx // save pb (pbtemp is not read again in this function)
3192 // pa = abs(pav)
3193 mov eax, patemp
3194 test eax, 0x80000000
3195 jz dpthpaa2
3196 neg eax // reverse sign of neg values
3197dpthpaa2:
3198 mov patemp, eax // save pa (the compares below use eax directly)
3199 // test if pa <= pb
3200 cmp eax, ecx
3201 jna dpthabb2
3202 // pa > pb; now test if pb <= pc
3203 cmp ecx, pctemp
3204 jna dpthbbc2
3205 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3206 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3207 jmp dpthpaeth2
3208dpthbbc2:
3209 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3210 mov cl, [esi + ebx] // load Prior(x) into cl
3211 jmp dpthpaeth2
3212dpthabb2:
3213 // pa <= pb; now test if pa <= pc
3214 cmp eax, pctemp
3215 jna dpthabc2
3216 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3217 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3218 jmp dpthpaeth2
3219dpthabc2:
3220 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3221 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3222dpthpaeth2:
3223 inc ebx
3224 inc edx
3225 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3226 add [edi + ebx - 1], cl
3227 cmp ebx, FullLength
3228 jb dpthlp2
3229dpthend:
3230 emms // End MMX instructions; prep for possible FP instrs.
3231 } // end _asm block
3232}
3233
3234// Optimized code for PNG Sub filter decoder
//
// Reverses the Sub filter on `row` in place: every byte from offset bpp
// onward has the already-decoded byte bpp positions to its left added to
// it (byte adds, so the sum wraps mod 256).  row_info supplies pixel_depth
// and rowbytes; nothing is returned.
//
// The work is split into three phases:
//   1. a byte-at-a-time loop from offset 0 up to offset diff, where
//      rp + diff is 8-byte aligned,
//   2. a bpp-specific MMX loop over 8-byte groups from diff up to MMXLength,
//   3. a byte-at-a-time loop from MMXLength up to FullLength, then emms.
//
// ActiveMask, ShiftBpp and ShiftRem are not declared in this function;
// presumably they are file-scope MMX operands defined earlier in the file
// (only their .use member is written here).
//
// NOTE(review): every MMX loop below is bottom-tested, so it processes at
// least one group (64 bytes in the bpp == 8 case) without first checking
// that MMXLength allows it.  The call site in png_read_filter_row only
// uses this routine when rowbytes >= 128 -- confirm before adding callers.
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -05003235void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003236png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3237{
3238 //int test;
3239 int bpp;
3240 png_uint_32 FullLength;
3241 png_uint_32 MMXLength;
3242 int diff;
3243
3244 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3245 FullLength = row_info->rowbytes - bpp; // # of bytes to filter
     // Phase 1: decode byte by byte until rp + ebx is 8-byte aligned, then
     // derive MMXLength, the offset at which whole 8-byte groups end.
3246 _asm {
3247 mov edi, row
3248 mov esi, edi // lp = row
3249 add edi, bpp // rp = row + bpp
3250 xor eax, eax
3251 // get # of bytes to alignment
3252 mov diff, edi // take start of row
3253 add diff, 0xf // add 7 + 8 to incr past
3254 // alignment boundary
3255 xor ebx, ebx
3256 and diff, 0xfffffff8 // mask to alignment boundary
3257 sub diff, edi // subtract from start ==> value
3258 // ebx at alignment
     // diff = ((rp + 15) & ~7) - rp, i.e. a value in 8..15: the offset of
     // the first 8-byte-aligned address that is at least 8 bytes past rp.
     // Being >= 8 keeps the MMX loops' priming load at [edi+ebx-8] at or
     // above rp.
3259 jz dsubgo
3260 // fix alignment
3261dsublp1:
3262 mov al, [esi+ebx]
3263 add [edi+ebx], al
3264 inc ebx
3265 cmp ebx, diff
3266 jb dsublp1
3267dsubgo:
     // MMXLength = FullLength - ((FullLength - ebx) & 7)
3268 mov ecx, FullLength
3269 mov edx, ecx
3270 sub edx, ebx // subtract alignment fix
3271 and edx, 0x00000007 // calc bytes over mult of 8
3272 sub ecx, edx // drop over bytes from length
3273 mov MMXLength, ecx
3274 } // end _asm block
3275
3276 // Now do the math for the rest of the row
     // Phase 2: each case starts at offset diff and runs until ebx reaches
     // MMXLength.  Within one 8-byte group the pixels depend on each other,
     // so the groups of bpp bytes are added one after another.
3277 switch ( bpp )
3278 {
3279 case 3:
3280 {
     // 3 bytes per pixel: an 8-byte group is handled as 3 + 3 + 2 bytes.
3281 ActiveMask.use = 0x0000ffffff000000;
3282 ShiftBpp.use = 24; // == 3 * 8
3283 ShiftRem.use = 40; // == 64 - 24
3284 _asm {
3285 mov edi, row
3286 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3287 mov esi, edi // lp = row
3288 add edi, bpp // rp = row + bpp
3289 movq mm6, mm7
3290 mov ebx, diff
3291 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3292 // byte group
3293 // PRIME the pump (load the first Raw(x-bpp) data set
3294 movq mm1, [edi+ebx-8]
3295dsub3lp:
3296 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3297 // no need for mask; shift clears inactive bytes
3298 // Add 1st active group
3299 movq mm0, [edi+ebx]
3300 paddb mm0, mm1
3301 // Add 2nd active group
3302 movq mm1, mm0 // mov updated Raws to mm1
3303 psllq mm1, ShiftBpp // shift data to position correctly
3304 pand mm1, mm7 // mask to use only 2nd active group
3305 paddb mm0, mm1
3306 // Add 3rd active group
3307 movq mm1, mm0 // mov updated Raws to mm1
3308 psllq mm1, ShiftBpp // shift data to position correctly
3309 pand mm1, mm6 // mask to use only 3rd active group
3310 add ebx, 8
3311 paddb mm0, mm1
3312 cmp ebx, MMXLength
3313 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3314 // Prep for doing 1st add at top of loop
3315 movq mm1, mm0
3316 jb dsub3lp
3317 } // end _asm block
3318 }
3319 break;
3320
3321 case 1:
3322 {
3323 // Placed here just in case this is a duplicate of the
3324 // non-MMX code for the SUB filter in png_read_filter_row above
3325 //
3326 // png_bytep rp;
3327 // png_bytep lp;
3328 // png_uint_32 i;
3329 // bpp = (row_info->pixel_depth + 7) >> 3;
3330 // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3331 // i < row_info->rowbytes; i++, rp++, lp++)
3332 // {
3333 // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3334 // }
     // Plain byte loop from offset diff all the way to FullLength; no MMX
     // instruction is used for 1 byte per pixel.
3335 _asm {
3336 mov ebx, diff
3337 mov edi, row
3338 cmp ebx, FullLength
3339 jnb dsub1end
3340 mov esi, edi // lp = row
3341 xor eax, eax
3342 add edi, bpp // rp = row + bpp
3343dsub1lp:
3344 mov al, [esi+ebx]
3345 add [edi+ebx], al
3346 inc ebx
3347 cmp ebx, FullLength
3348 jb dsub1lp
3349dsub1end:
3350 } // end _asm block
3351 }
     // Returns directly: the row is already complete, and the final block
     // (with its emms) is skipped.  No MMX instruction ran on this path.
3352 return;
3353
3354 case 6:
3355 case 7:
3356 case 4:
3357 case 5:
3358 {
     // 4..7 bytes per pixel: an 8-byte group needs only two adds, the
     // second covering the bytes left after the first bpp bytes.
3359 ShiftBpp.use = bpp << 3;
3360 ShiftRem.use = 64 - ShiftBpp.use;
3361 _asm {
3362 mov edi, row
3363 mov ebx, diff
3364 mov esi, edi // lp = row
3365 add edi, bpp // rp = row + bpp
3366 // PRIME the pump (load the first Raw(x-bpp) data set
3367 movq mm1, [edi+ebx-8]
3368dsub4lp:
3369 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3370 // no need for mask; shift clears inactive bytes
3371 movq mm0, [edi+ebx]
3372 paddb mm0, mm1
3373 // Add 2nd active group
3374 movq mm1, mm0 // mov updated Raws to mm1
3375 psllq mm1, ShiftBpp // shift data to position correctly
3376 // there is no need for any mask
3377 // since shift clears inactive bits/bytes
3378 add ebx, 8
3379 paddb mm0, mm1
3380 cmp ebx, MMXLength
3381 movq [edi+ebx-8], mm0
3382 movq mm1, mm0 // Prep for doing 1st add at top of loop
3383 jb dsub4lp
3384 } // end _asm block
3385 }
3386 break;
3387
3388 case 2:
3389 {
     // 2 bytes per pixel: four dependent 2-byte adds per 8-byte group.
3390 ActiveMask.use = 0x00000000ffff0000;
3391 ShiftBpp.use = 16; // == 2 * 8
3392 ShiftRem.use = 48; // == 64 - 16
3393 _asm {
3394 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3395 mov ebx, diff
3396 movq mm6, mm7
3397 mov edi, row
3398 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3399 // byte group
3400 mov esi, edi // lp = row
3401 movq mm5, mm6
3402 add edi, bpp // rp = row + bpp
3403 psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
3404 // byte group
3405 // PRIME the pump (load the first Raw(x-bpp) data set
3406 movq mm1, [edi+ebx-8]
3407dsub2lp:
3408 // Add 1st active group
3409 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3410 // no need for mask; shift clears inactive
3411 // bytes
3412 movq mm0, [edi+ebx]
3413 paddb mm0, mm1
3414 // Add 2nd active group
3415 movq mm1, mm0 // mov updated Raws to mm1
3416 psllq mm1, ShiftBpp // shift data to position correctly
3417 pand mm1, mm7 // mask to use only 2nd active group
3418 paddb mm0, mm1
3419 // Add 3rd active group
3420 movq mm1, mm0 // mov updated Raws to mm1
3421 psllq mm1, ShiftBpp // shift data to position correctly
3422 pand mm1, mm6 // mask to use only 3rd active group
3423 paddb mm0, mm1
3424 // Add 4th active group
3425 movq mm1, mm0 // mov updated Raws to mm1
3426 psllq mm1, ShiftBpp // shift data to position correctly
3427 pand mm1, mm5 // mask to use only 4th active group
3428 add ebx, 8
3429 paddb mm0, mm1
3430 cmp ebx, MMXLength
3431 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3432 movq mm1, mm0 // Prep for doing 1st add at top of loop
3433 jb dsub2lp
3434 } // end _asm block
3435 }
3436 break;
3437 case 8:
3438 {
     // 8 bytes per pixel: each 8-byte group is one pixel, so the previous
     // group's result is simply added to the next group.
3439 _asm {
3440 mov edi, row
3441 mov ebx, diff
3442 mov esi, edi // lp = row
3443 add edi, bpp // rp = row + bpp
3444 mov ecx, MMXLength
3445 movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3446 // Raw(x-bpp) data set
3447 and ecx, 0x0000003f // calc bytes over mult of 64
     // NOTE(review): ecx now holds MMXLength mod 64 (a value below 64), yet
     // it is used as the bound of the unrolled loop.  After "add ebx, 64"
     // ebx is at least 64, so "jb dsub8lp" is never taken: the unrolled
     // body runs exactly once and dsub8lpA handles the rest 8 bytes at a
     // time.  Possibly not what was intended -- confirm.
3448dsub8lp:
3449 movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3450 paddb mm0, mm7
3451 movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3452 movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3453 // Now mm0 will be used as Raw(x-bpp) for
3454 // the 2nd group of 8 bytes. This will be
3455 // repeated for each group of 8 bytes with
3456 // the 8th group being used as the Raw(x-bpp)
3457 // for the 1st group of the next loop.
3458 paddb mm1, mm0
3459 movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3460 movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3461 paddb mm2, mm1
3462 movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3463 movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3464 paddb mm3, mm2
3465 movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3466 movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3467 paddb mm4, mm3
3468 movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3469 movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3470 paddb mm5, mm4
3471 movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3472 movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3473 paddb mm6, mm5
3474 movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3475 movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3476 add ebx, 64
3477 paddb mm7, mm6
3478 cmp ebx, ecx
3479 movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3480 jb dsub8lp
3481 cmp ebx, MMXLength
3482 jnb dsub8lt8
3483dsub8lpA:
3484 movq mm0, [edi+ebx]
3485 add ebx, 8
3486 paddb mm0, mm7
3487 cmp ebx, MMXLength
3488 movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
3489 movq mm7, mm0 // Move calculated Raw(x) data to mm7 to
3490 // be the new Raw(x-bpp) for the next loop
3491 jb dsub8lpA
3492dsub8lt8:
3493 } // end _asm block
3494 }
3495 break;
3496
3497 default: // bpp greater than 8 bytes
3498 {
     // Raw(x-bpp) is read straight from memory at [esi+ebx]: with bpp >= 8
     // those 8 bytes lie entirely before the 8 bytes written at [edi+ebx].
3499 _asm {
3500 mov ebx, diff
3501 mov edi, row
3502 mov esi, edi // lp = row
3503 add edi, bpp // rp = row + bpp
3504dsubAlp:
3505 movq mm0, [edi+ebx]
3506 movq mm1, [esi+ebx]
3507 add ebx, 8
3508 paddb mm0, mm1
3509 cmp ebx, MMXLength
3510 movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
3511 // add ebx
3512 jb dsubAlp
3513 } // end _asm block
3514 }
3515 break;
3516
3517 } // end switch ( bpp )
3518
     // Phase 3: finish the bytes between MMXLength and FullLength one at a
     // time, then leave MMX state.
3519 _asm {
3520 mov ebx, MMXLength
3521 mov edi, row
3522 cmp ebx, FullLength
3523 jnb dsubend
3524 mov esi, edi // lp = row
3525 xor eax, eax
3526 add edi, bpp // rp = row + bpp
3527dsublp2:
3528 mov al, [esi+ebx]
3529 add [edi+ebx], al
3530 inc ebx
3531 cmp ebx, FullLength
3532 jb dsublp2
3533dsubend:
3534 emms // End MMX instructions; prep for possible FP instrs.
3535 } // end _asm block
3536}
3537
3538// Optimized code for PNG Up filter decoder
//
// Reverses the Up filter on `row` in place: each of the row_info->rowbytes
// bytes of row has the byte at the same offset in prev_row added to it
// (byte adds, so the sum wraps mod 256).  Nothing is returned.
//
// Register use throughout the asm block:
//   edi = row, esi = prev_row, ebx = current byte offset,
//   ecx = end offset of the loop currently running,
//   edx = count of bytes left over after that loop.
//
// Phases: a byte loop until row + ebx is 8-byte aligned (0..7 bytes; only
// row is aligned, prev_row is read at the same offsets), a 64-bytes-per-pass
// MMX loop, an 8-bytes-per-pass MMX loop, and a byte loop for the last
// fewer-than-8 bytes, followed by emms.
//
// NOTE(review): the 64-byte loop is bottom-tested, so it always runs once
// and needs at least 64 bytes after the alignment fix.  The call site in
// png_read_filter_row only uses this routine when rowbytes >= 128 --
// confirm before adding callers.
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -05003539void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003540png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3541 png_bytep prev_row)
3542{
3543 png_uint_32 len;
3544 len = row_info->rowbytes; // # of bytes to filter
3545 _asm {
3546 mov edi, row
3547 // get # of bytes to alignment
     // ecx = ((row + 7) & ~7) - row, i.e. 0..7 bytes up to the next
     // 8-byte boundary
3548 mov ecx, edi
3549 xor ebx, ebx
3550 add ecx, 0x7
3551 xor eax, eax
3552 and ecx, 0xfffffff8
3553 mov esi, prev_row
3554 sub ecx, edi
3555 jz dupgo
3556 // fix alignment
3557duplp1:
3558 mov al, [edi+ebx]
3559 add al, [esi+ebx]
3560 inc ebx
3561 cmp ebx, ecx
3562 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3563 jb duplp1
3564dupgo:
     // edx = (len - ebx) & 0x3f : bytes beyond the last whole 64-byte chunk
     // ecx = len - edx          : offset where the 64-byte loop stops
3565 mov ecx, len
3566 mov edx, ecx
3567 sub edx, ebx // subtract alignment fix
3568 and edx, 0x0000003f // calc bytes over mult of 64
3569 sub ecx, edx // drop over bytes from length
3570 // Unrolled loop - use all MMX registers and interleave to reduce
3571 // number of branch instructions (loops) and reduce partial stalls
3572duploop:
3573 movq mm1, [esi+ebx]
3574 movq mm0, [edi+ebx]
3575 movq mm3, [esi+ebx+8]
3576 paddb mm0, mm1
3577 movq mm2, [edi+ebx+8]
3578 movq [edi+ebx], mm0
3579 paddb mm2, mm3
3580 movq mm5, [esi+ebx+16]
3581 movq [edi+ebx+8], mm2
3582 movq mm4, [edi+ebx+16]
3583 movq mm7, [esi+ebx+24]
3584 paddb mm4, mm5
3585 movq mm6, [edi+ebx+24]
3586 movq [edi+ebx+16], mm4
3587 paddb mm6, mm7
3588 movq mm1, [esi+ebx+32]
3589 movq [edi+ebx+24], mm6
3590 movq mm0, [edi+ebx+32]
3591 movq mm3, [esi+ebx+40]
3592 paddb mm0, mm1
3593 movq mm2, [edi+ebx+40]
3594 movq [edi+ebx+32], mm0
3595 paddb mm2, mm3
3596 movq mm5, [esi+ebx+48]
3597 movq [edi+ebx+40], mm2
3598 movq mm4, [edi+ebx+48]
3599 movq mm7, [esi+ebx+56]
3600 paddb mm4, mm5
3601 movq mm6, [edi+ebx+56]
3602 movq [edi+ebx+48], mm4
3603 add ebx, 64
3604 paddb mm6, mm7
3605 cmp ebx, ecx
3606 movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3607 // -8 to offset add ebx
3608 jb duploop
3609
3610 cmp edx, 0 // Test for bytes over mult of 64
3611 jz dupend
3612
3613
3614 // 2 lines added by lcreeve@netins.net
3615 // (mail 11 Jul 98 in png-implement list)
     // With fewer than 8 bytes left, skip the 8-byte MMX loop (which is
     // bottom-tested and would otherwise process a full 8 bytes).
3616 cmp edx, 8 //test for less than 8 bytes
3617 jb duplt8
3618
3619
     // ecx = len minus the final (leftover & 7) bytes: end of the 8-byte loop
3620 add ecx, edx
3621 and edx, 0x00000007 // calc bytes over mult of 8
3622 sub ecx, edx // drop over bytes from length
3623 jz duplt8
3624 // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3625duplpA:
3626 movq mm1, [esi+ebx]
3627 movq mm0, [edi+ebx]
3628 add ebx, 8
3629 paddb mm0, mm1
3630 cmp ebx, ecx
3631 movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3632 jb duplpA
3633 cmp edx, 0 // Test for bytes over mult of 8
3634 jz dupend
3635duplt8:
3636 xor eax, eax
     // On either path into duplt8, ecx + edx equals len, so the byte loop
     // below runs to the end of the row.
3637 add ecx, edx // move over byte count into counter
3638 // Loop using x86 registers to update remaining bytes
3639duplp2:
3640 mov al, [edi + ebx]
3641 add al, [esi + ebx]
3642 inc ebx
3643 cmp ebx, ecx
3644 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3645 jb duplp2
3646dupend:
3647 // Conversion of filtered row completed
3648 emms // End MMX instructions; prep for possible FP instrs.
3649 } // end _asm block
3650}
3651
3652
3653// Optimized png_read_filter_row routines
Glenn Randers-Pehrson75294572000-05-06 14:09:57 -05003654void /* PRIVATE */
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003655png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3656 row, png_bytep prev_row, int filter)
3657{
3658#ifdef PNG_DEBUG
3659 char filnm[6];
3660#endif
Glenn Randers-Pehrson61c32d92000-02-04 23:40:16 -06003661#define UseMMX 1
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003662
3663 if (mmx_supported == 2)
3664 mmx_supported = mmxsupport();
3665
3666 if (!mmx_supported)
3667 {
3668 png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter);
3669 return ;
3670 }
3671
3672#ifdef PNG_DEBUG
3673 png_debug(1, "in png_read_filter_row\n");
Glenn Randers-Pehrson316f97a2000-07-08 13:19:41 -05003674# if (UseMMX == 1)
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003675 png_debug1(0,"%s, ", "MMX");
Glenn Randers-Pehrson316f97a2000-07-08 13:19:41 -05003676# else
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003677 png_debug1(0,"%s, ", "x86");
Glenn Randers-Pehrson316f97a2000-07-08 13:19:41 -05003678# endif
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003679 switch (filter)
3680 {
3681 case 0: sprintf(filnm, "None ");
3682 break;
3683 case 1: sprintf(filnm, "Sub ");
3684 break;
3685 case 2: sprintf(filnm, "Up ");
3686 break;
3687 case 3: sprintf(filnm, "Avg ");
3688 break;
3689 case 4: sprintf(filnm, "Paeth");
3690 break;
3691 default: sprintf(filnm, "Unknw");
3692 break;
3693 }
3694 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3695 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3696 (int)((row_info->pixel_depth + 7) >> 3));
3697 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3698#endif
3699
3700 switch (filter)
3701 {
3702 case PNG_FILTER_VALUE_NONE:
3703 break;
3704 case PNG_FILTER_VALUE_SUB:
3705 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003706#if (UseMMX == 1)
3707 if ((row_info->pixel_depth > 8) &&
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003708 (row_info->rowbytes >= 128) )
3709 {
3710 png_read_filter_row_mmx_sub(row_info, row);
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003711 }
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003712 else
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003713#endif
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003714 {
3715 png_uint_32 i;
3716 png_uint_32 istop = row_info->rowbytes;
3717 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3718 png_bytep rp = row + bpp;
3719 png_bytep lp = row;
3720
3721 for (i = bpp; i < istop; i++)
3722 {
3723 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3724 rp++;
3725 }
3726 } //end !UseMMX
3727 break;
3728 }
3729 case PNG_FILTER_VALUE_UP:
3730 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003731#if (UseMMX == 1)
3732 if ((row_info->pixel_depth > 8) &&
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003733 (row_info->rowbytes >= 128) )
3734 {
3735 png_read_filter_row_mmx_up(row_info, row, prev_row);
3736 } //end if UseMMX
3737 else
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003738#endif
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003739 {
3740 png_bytep rp;
3741 png_bytep pp;
3742 png_uint_32 i;
3743 for (i = 0, rp = row, pp = prev_row;
3744 i < row_info->rowbytes; i++, rp++, pp++)
3745 {
3746 *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
3747 }
3748 } //end !UseMMX
3749 break;
3750 }
3751 case PNG_FILTER_VALUE_AVG:
3752 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003753#if (UseMMX == 1)
3754 if ((row_info->pixel_depth > 8) &&
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003755 (row_info->rowbytes >= 128) )
3756 {
3757 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3758 } //end if UseMMX
3759 else
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003760#endif
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003761 {
3762 png_uint_32 i;
3763 png_bytep rp = row;
3764 png_bytep pp = prev_row;
3765 png_bytep lp = row;
3766 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3767 png_uint_32 istop = row_info->rowbytes - bpp;
3768
3769 for (i = 0; i < bpp; i++)
3770 {
3771 *rp = (png_byte)(((int)(*rp) +
3772 ((int)(*pp++) >> 1)) & 0xff);
3773 rp++;
3774 }
3775
3776 for (i = 0; i < istop; i++)
3777 {
3778 *rp = (png_byte)(((int)(*rp) +
3779 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3780 rp++;
3781 }
3782 } //end !UseMMX
3783 break;
3784 }
3785 case PNG_FILTER_VALUE_PAETH:
3786 {
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003787#if (UseMMX == 1)
3788 if ((row_info->pixel_depth > 8) &&
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003789 (row_info->rowbytes >= 128) )
3790 {
3791 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3792 } //end if UseMMX
3793 else
Glenn Randers-Pehrson860ab2b1999-10-14 07:43:10 -05003794#endif
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003795 {
3796 png_uint_32 i;
3797 png_bytep rp = row;
3798 png_bytep pp = prev_row;
3799 png_bytep lp = row;
3800 png_bytep cp = prev_row;
3801 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3802 png_uint_32 istop=row_info->rowbytes - bpp;
3803
3804 for (i = 0; i < bpp; i++)
3805 {
3806 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3807 rp++;
3808 }
3809
3810 for (i = 0; i < istop; i++) // use leftover rp,pp
3811 {
3812 int a, b, c, pa, pb, pc, p;
3813
3814 a = *lp++;
3815 b = *pp++;
3816 c = *cp++;
3817
3818 p = b - c;
3819 pc = a - c;
3820
3821#ifdef PNG_USE_ABS
3822 pa = abs(p);
3823 pb = abs(pc);
3824 pc = abs(p + pc);
3825#else
3826 pa = p < 0 ? -p : p;
3827 pb = pc < 0 ? -pc : pc;
3828 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3829#endif
3830
3831 /*
3832 if (pa <= pb && pa <= pc)
3833 p = a;
3834 else if (pb <= pc)
3835 p = b;
3836 else
3837 p = c;
3838 */
3839
3840 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3841
3842 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3843 rp++;
3844 }
3845 } //end !UseMMX
3846 break;
3847 }
3848 default:
Glenn Randers-Pehrsonec61c232000-05-16 06:17:36 -05003849 png_warning(png_ptr, "Ignoring bad adaptive filter type");
3850 *row=0;
Glenn Randers-Pehrson18c415f1999-10-10 17:50:00 -05003851 break;
3852 }
3853}
3854#endif