blob: 796723ac3092c299323a9de1b7f4a40d2ebb93fb [file] [log] [blame]
;
; jquant.asm - sample data conversion and quantization (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT *workspace);
;
; ABI:      32-bit, arguments on the stack (read via ebp+8/+12/+16 below).
; In:       sample_data = array of row pointers
;           start_col   = column offset applied to every row
;           workspace   = output buffer, written with movaps (which faults
;                         on addresses that are not 16-byte aligned)
; Out:      nothing in registers; workspace is filled with floats.
; Saved:    ebp, ebx, esi, edi (pushed/popped below).
; Clobbers: eax, ecx, edx, mm0-mm7, xmm0-xmm7, flags.
;           The MMX state is cleared with emms before returning.
;
; Each loop iteration reads two row pointers, loads one MMWORD of samples
; from each row, and stores four XMMWORDs of floats; the loop runs
; DCTSIZE/2 times.
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT *workspace

        align   16
        global  EXTN(jsimd_convsamp_float_sse)

EXTN(jsimd_convsamp_float_sse):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        ; Build the centering constant in-register instead of loading it:
        ; 0xFFFF -> (<<7) 0xFF80 -> signed-saturating pack -> 0x80 per byte.
        pcmpeqw   mm7,mm7
        psllw     mm7,7
        packsswb  mm7,mm7               ; mm7 = PB_CENTERJSAMPLE (0x808080..)

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
        mov     ecx, DCTSIZE/2                  ; ecx = row pairs remaining
        alignx  16,7
.convloop:
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

        ; Subtracting 0x80 from each byte re-centers unsigned samples
        ; as signed bytes.
        psubb   mm0,mm7                         ; mm0=(01234567)
        psubb   mm1,mm7                         ; mm1=(89ABCDEF)

        ; Widen byte -> dword by unpacking twice so that each sample lands
        ; in the TOP byte of its dword.  The '*' lanes are whatever the
        ; destination register held before; they are discarded by the
        ; arithmetic right shift below, which also sign-extends the sample.
        punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
        punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
        punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
        punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)

        punpcklwd mm4,mm2                       ; mm4=(***0***1)
        punpckhwd mm2,mm2                       ; mm2=(***2***3)
        punpcklwd mm5,mm0                       ; mm5=(***4***5)
        punpckhwd mm0,mm0                       ; mm0=(***6***7)

        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(01)
        psrad     mm2,(DWORD_BIT-BYTE_BIT)      ; mm2=(23)
        cvtpi2ps  xmm0,mm4                      ; xmm0=(01**)
        cvtpi2ps  xmm1,mm2                      ; xmm1=(23**)
        psrad     mm5,(DWORD_BIT-BYTE_BIT)      ; mm5=(45)
        psrad     mm0,(DWORD_BIT-BYTE_BIT)      ; mm0=(67)
        cvtpi2ps  xmm2,mm5                      ; xmm2=(45**)
        cvtpi2ps  xmm3,mm0                      ; xmm3=(67**)

        punpcklwd mm6,mm3                       ; mm6=(***8***9)
        punpckhwd mm3,mm3                       ; mm3=(***A***B)
        punpcklwd mm4,mm1                       ; mm4=(***C***D)
        punpckhwd mm1,mm1                       ; mm1=(***E***F)

        psrad     mm6,(DWORD_BIT-BYTE_BIT)      ; mm6=(89)
        psrad     mm3,(DWORD_BIT-BYTE_BIT)      ; mm3=(AB)
        cvtpi2ps  xmm4,mm6                      ; xmm4=(89**)
        cvtpi2ps  xmm5,mm3                      ; xmm5=(AB**)
        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(CD)
        psrad     mm1,(DWORD_BIT-BYTE_BIT)      ; mm1=(EF)
        cvtpi2ps  xmm6,mm4                      ; xmm6=(CD**)
        cvtpi2ps  xmm7,mm1                      ; xmm7=(EF**)

        ; cvtpi2ps only fills the low two lanes; merge pairs of registers
        ; into full four-float vectors.
        movlhps   xmm0,xmm1                     ; xmm0=(0123)
        movlhps   xmm2,xmm3                     ; xmm2=(4567)
        movlhps   xmm4,xmm5                     ; xmm4=(89AB)
        movlhps   xmm6,xmm7                     ; xmm6=(CDEF)

        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

        add     esi, byte 2*SIZEOF_JSAMPROW             ; next two row pointers
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; next two output rows
        dec     ecx
        jnz     near .convloop

        emms            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret


; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                           FAST_FLOAT *workspace);
;
; ABI:      32-bit, arguments on the stack (read via ebp+8/+12/+16 below).
; In:       coef_block = output buffer, written one MMWORD at a time
;           divisors   = per-coefficient factors; they are applied with
;                        mulps, so presumably they hold reciprocals of the
;                        quantization steps (confirm against the caller)
;           workspace  = input floats
;           workspace and divisors are accessed with movaps / mulps memory
;           operands, which fault on addresses that are not 16-byte aligned.
; Out:      nothing in registers; coef_block is filled.
; Saved:    ebp, esi, edi (pushed/popped below).
; Clobbers: eax, edx, mm0-mm7, xmm0-xmm7, flags.
;           The MMX state is cleared with emms before returning.
;
; Each loop iteration handles 16 values (four XMMWORDs in, four MMWORDs
; out); the loop runs DCTSIZE2/16 times.  Float -> int32 conversion uses
; cvtps2pi (rounding per the current MXCSR mode), and int32 -> int16
; narrowing uses packssdw (signed saturation).
;

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT *divisors
%define workspace       ebp+16          ; FAST_FLOAT *workspace

        align   16
        global  EXTN(jsimd_quantize_float_sse)

EXTN(jsimd_quantize_float_sse):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; unused
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     eax, DCTSIZE2/16                ; eax = 16-value groups remaining
        alignx  16,7
.quantloop:
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

        ; cvtps2pi converts only the low two floats, so copy each vector's
        ; high half into the low half of a scratch register first.
        movhlps xmm4,xmm0
        movhlps xmm5,xmm1

        cvtps2pi mm0,xmm0
        cvtps2pi mm1,xmm1
        cvtps2pi mm4,xmm4
        cvtps2pi mm5,xmm5

        movhlps xmm6,xmm2
        movhlps xmm7,xmm3

        cvtps2pi mm2,xmm2
        cvtps2pi mm3,xmm3
        cvtps2pi mm6,xmm6
        cvtps2pi mm7,xmm7

        ; Narrow int32 -> int16 with signed saturation, recombining the
        ; low-half and high-half results of each source vector.
        packssdw mm0,mm4
        packssdw mm1,mm5
        packssdw mm2,mm6
        packssdw mm3,mm7

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

        add     esi, byte 16*SIZEOF_FAST_FLOAT
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     edi, byte 16*SIZEOF_JCOEF
        dec     eax
        jnz     short .quantloop

        emms            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; unused
;       pop     ebx             ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16