; blob: fe99a20b343130cbe88ab30eebed2d6eb43f7b26
; blame: MIYASAKA Masaru, a2e6a9d, 2006-02-04 00:00:00 +0000
;
2; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
3;
4; x86 SIMD extension for IJG JPEG library
5; Copyright (C) 1999-2006, MIYASAKA Masaru.
6; For conditions of distribution and use, see copyright notice in jsimdext.inc
7;
8; This file should be assembled with NASM (Netwide Assembler),
9; can *not* be assembled with Microsoft's MASM or any compatible
10; assembler (including Borland's Turbo Assembler).
11; NASM is available from http://nasm.sourceforge.net/ or
12; http://sourceforge.net/project/showfiles.php?group_id=6208
13;
14; Last Modified : January 12, 2005
15;
16; [TAB8]
17
18%include "jsimdext.inc"
19%include "jdct.inc"
20
; Everything from here to the two closing %endif lines at the end of the
; file is assembled only when both DCT_FLOAT_SUPPORTED and
; JFDCT_FLT_SSE_MMX_SUPPORTED are defined (neither macro is defined in
; this file; presumably they come from the included headers).
21%ifdef DCT_FLOAT_SUPPORTED
22%ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
23
24; This module is specialized to the case DCTSIZE = 8.
25;
; The loops below read 8 samples per row and run DCTSIZE/2 and DCTSIZE2/16
; passes, so any other DCTSIZE is rejected at assembly time with %error.
26%if DCTSIZE != 8
27%error "Sorry, this code only copes with 8x8 DCTs."
28%endif
29
30; --------------------------------------------------------------------------
; Emit the routines into the section named by SEG_TEXT (macro not defined
; in this file) and assemble them as 32-bit code.
31 SECTION SEG_TEXT
32 BITS 32
;
; jpeg_convsamp_flt_sse
;
; Load one block of samples into the floating-point workspace, applying
; the unsigned->signed conversion on the way.
;
; GLOBAL(void)
; jpeg_convsamp_flt_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
;                        FAST_FLOAT * workspace);
;
; Arguments are read from the stack through ebp (see the %defines below).
;
; Register roles inside the loop:
;   esi = current position in the row-pointer array (sample_data)
;   eax = start_col, used as the column index into every row
;   edi = current output position in workspace
;   ecx = number of row pairs still to convert (DCTSIZE/2 .. 1)
;   ebx / edx = sample pointers of the two rows being converted
;   mm7 = 0x80 in every byte (PB_CENTERJSAMPLE)
;
; ebp, ebx, esi and edi are saved and restored.  eax, ecx and edx are used
; as scratch and are not saved.  mm0-mm7 and xmm0-xmm7 are overwritten;
; the MMX state is released with emms before returning.
;
; The results are stored with movaps, so the addressed workspace slots
; must be 16-byte aligned.
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        align   16
        global  EXTN(jpeg_convsamp_flt_sse)

EXTN(jpeg_convsamp_flt_sse):
        push    ebp
        mov     ebp, esp
        push    ebx
        push    esi
        push    edi
        ; ecx and edx are deliberately not saved: they need not be preserved.

        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *) output
        mov     eax, JDIMENSION [start_col]
        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     ecx, DCTSIZE/2                  ; two rows per iteration

        ; Build the centering constant without touching memory:
        ; 0xFFFF -> 0xFF80 (-128 as a word) -> saturating pack -> 0x80 bytes.
        pcmpeqw  mm7, mm7
        psllw    mm7, 7
        packsswb mm7, mm7               ; mm7 = PB_CENTERJSAMPLE (0x808080..)

        alignx  16, 7
.row_pair:
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *) first row
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *) second row

        ; Fetch eight byte-sized samples from each row and recenter them:
        ; subtracting 0x80 turns the unsigned bytes into signed bytes.
        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
        psubb   mm0, mm7                ; mm0=(01234567)
        psubb   mm1, mm7                ; mm1=(89ABCDEF)

        ; ---- first row: samples 0..7 ------------------------------------
        ; Two unpack steps park each sample in the top byte of a dword
        ; ("*" marks don't-care bytes taken from the destination register);
        ; the arithmetic shift right then sign-extends it and discards the
        ; don't-care bytes.
        punpcklbw mm2, mm0              ; mm2=(*0*1*2*3)
        punpckhbw mm0, mm0              ; mm0=(*4*5*6*7)

        punpcklwd mm4, mm2              ; mm4=(***0***1)
        punpckhwd mm2, mm2              ; mm2=(***2***3)
        punpcklwd mm5, mm0              ; mm5=(***4***5)
        punpckhwd mm0, mm0              ; mm0=(***6***7)

        psrad   mm4, (DWORD_BIT-BYTE_BIT)       ; mm4=(01)
        psrad   mm2, (DWORD_BIT-BYTE_BIT)       ; mm2=(23)
        psrad   mm5, (DWORD_BIT-BYTE_BIT)       ; mm5=(45)
        psrad   mm0, (DWORD_BIT-BYTE_BIT)       ; mm0=(67)

        cvtpi2ps xmm0, mm4              ; xmm0=(01**)
        cvtpi2ps xmm1, mm2              ; xmm1=(23**)
        cvtpi2ps xmm2, mm5              ; xmm2=(45**)
        cvtpi2ps xmm3, mm0              ; xmm3=(67**)

        movlhps xmm0, xmm1              ; xmm0=(0123)
        movlhps xmm2, xmm3              ; xmm2=(4567)

        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2

        ; ---- second row: samples 8..F -----------------------------------
        ; Same widening sequence; mm4 is free again because its first-row
        ; contents were already consumed by the cvtpi2ps above.
        punpcklbw mm3, mm1              ; mm3=(*8*9*A*B)
        punpckhbw mm1, mm1              ; mm1=(*C*D*E*F)

        punpcklwd mm6, mm3              ; mm6=(***8***9)
        punpckhwd mm3, mm3              ; mm3=(***A***B)
        punpcklwd mm4, mm1              ; mm4=(***C***D)
        punpckhwd mm1, mm1              ; mm1=(***E***F)

        psrad   mm6, (DWORD_BIT-BYTE_BIT)       ; mm6=(89)
        psrad   mm3, (DWORD_BIT-BYTE_BIT)       ; mm3=(AB)
        psrad   mm4, (DWORD_BIT-BYTE_BIT)       ; mm4=(CD)
        psrad   mm1, (DWORD_BIT-BYTE_BIT)       ; mm1=(EF)

        cvtpi2ps xmm4, mm6              ; xmm4=(89**)
        cvtpi2ps xmm5, mm3              ; xmm5=(AB**)
        cvtpi2ps xmm6, mm4              ; xmm6=(CD**)
        cvtpi2ps xmm7, mm1              ; xmm7=(EF**)

        movlhps xmm4, xmm5              ; xmm4=(89AB)
        movlhps xmm6, xmm7              ; xmm6=(CDEF)

        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

        ; Step to the next two row pointers and the next two output rows.
        add     esi, byte 2*SIZEOF_JSAMPROW
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        dec     ecx
        jnz     near .row_pair

        emms                            ; empty MMX state

        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

133
134
; --------------------------------------------------------------------------
;
; jpeg_quantize_flt_sse
;
; Quantize/descale the coefficients, and store into coef_block.
;
; GLOBAL(void)
; jpeg_quantize_flt_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                        FAST_FLOAT * workspace);
;
; Arguments are read from the stack through ebp (see the %defines below).
;
; Each workspace value is multiplied by the matching divisors entry,
; converted to a 32-bit integer with cvtps2pi and narrowed to a signed
; 16-bit value with a saturating pack.
; NOTE(review): the table is multiplied, not divided by, so it presumably
; holds reciprocal scale factors -- confirm against the code that fills it.
;
; Register roles inside the loop:
;   esi = current position in workspace
;   edx = current position in divisors
;   edi = current position in coef_block
;   eax = number of 16-coefficient groups still to do (DCTSIZE2/16 .. 1)
;
; ebp, esi and edi are saved and restored.  eax and edx are used as
; scratch and are not saved; ebx and ecx are not used.  mm0-mm7 and
; xmm0-xmm7 are overwritten; the MMX state is released with emms before
; returning.
;
; workspace and divisors are accessed with movaps/mulps memory operands,
; so the addressed slots must be 16-byte aligned.
;

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT * divisors
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        align   16
        global  EXTN(jpeg_quantize_flt_sse)

EXTN(jpeg_quantize_flt_sse):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi
        ; edx is deliberately not saved: it need not be preserved.

        mov     edi, JCOEFPTR [coef_block]
        mov     edx, POINTER [divisors]
        mov     esi, POINTER [workspace]
        mov     eax, DCTSIZE2/16                ; 16 coefficients per pass

        alignx  16, 7
.quant16:
        ; Scale four vectors of four floats each.
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

        ; cvtps2pi converts only the low two floats of its source, so
        ; copy the high two floats of each vector into the low half of
        ; a spare register first.
        movhlps xmm4, xmm0
        movhlps xmm5, xmm1
        movhlps xmm6, xmm2
        movhlps xmm7, xmm3

        ; Float -> int32; rounding follows the current MXCSR mode.
        cvtps2pi mm0, xmm0              ; low  pair of vector 0
        cvtps2pi mm1, xmm1              ; low  pair of vector 1
        cvtps2pi mm2, xmm2              ; low  pair of vector 2
        cvtps2pi mm3, xmm3              ; low  pair of vector 3
        cvtps2pi mm4, xmm4              ; high pair of vector 0
        cvtps2pi mm5, xmm5              ; high pair of vector 1
        cvtps2pi mm6, xmm6              ; high pair of vector 2
        cvtps2pi mm7, xmm7              ; high pair of vector 3

        ; Join low and high pairs: four int32 -> four signed-saturated
        ; int16 per register.
        packssdw mm0, mm4
        packssdw mm1, mm5
        packssdw mm2, mm6
        packssdw mm3, mm7

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

        ; Advance all three pointers by the 16 elements just handled.
        add     esi, byte 16*SIZEOF_FAST_FLOAT
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     edi, byte 16*SIZEOF_JCOEF
        dec     eax
        jnz     short .quant16

        emms                            ; empty MMX state

        pop     edi
        pop     esi
        pop     ebp
        ret

216
217%endif ; JFDCT_FLT_SSE_MMX_SUPPORTED
218%endif ; DCT_FLOAT_SUPPORTED