; blob: 11202dbf68d916ef01abb4d758788c2338e01d24
;
; jcsample.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT                ; code section (SEG_TEXT comes from the include)
        BITS    32                      ; everything below is 32-bit x86 code
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (first argument at
; [ebp+8]); see the %defines below.  Nothing is returned.
;
; Operation:
;   1. expand_right_edge: with output_cols = width_blocks * 8, if
;      output_cols * 2 > image_width, each of the max_v_samp_factor input
;      rows is padded in place: the sample at column image_width-1 is
;      replicated into columns image_width .. output_cols*2-1.
;   2. h2v1_downsample: for each of the v_samp_factor rows, every output
;      sample is (in[2*i] + in[2*i+1] + bias) >> 1, where bias alternates
;      0, 1, 0, 1, ... across output columns.
;
; Registers:
;   preserved : ebp, esi, edi (pushed/popped here)
;   clobbered : eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags
;               (cld clears DF when the right edge is expanded)
;
; Memory notes:
;   * Rows are accessed with movdqa, so every input and output row pointer
;     must be 16-byte aligned.
;   * output_cols is a multiple of 8.  When 8 columns remain, the tail path
;     (.columnloop_r8) loads 16 input bytes but still stores a full 16
;     output bytes, the upper 8 of which are zero.
;     NOTE(review): this presumes each output row has room for those 8
;     extra bytes -- confirm against the buffer allocation.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
        jz      near .return            ; ZF from shl: no output columns

        mov     edx, JDIMENSION [img_width(ebp)]

        ; -- expand_right_edge

        push    ecx                     ; keep output_cols for the downsample pass
        shl     ecx,1                   ; output_cols * 2
        sub     ecx,edx                 ; ecx = number of pad samples per row
        jle     short .expand_end       ; input already wide enough

        mov     eax, INT [max_v_samp(ebp)]
        test    eax,eax
        jle     short .expand_end       ; no rows to pad

        cld                             ; rep stosb must move forward
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        alignx  16,7
.expandloop:
        push    eax                     ; row counter (al is reused as fill value)
        push    ecx                     ; pad count (rep stosb consumes ecx)

        mov     edi, JSAMPROW [esi]
        add     edi,edx                 ; edi -> first column past image_width
        mov     al, JSAMPLE [edi-1]     ; last real sample of this row

        rep stosb                       ; replicate it ecx times

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        dec     eax
        jg      short .expandloop

.expand_end:
        pop     ecx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
        test    eax,eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern
        movd    xmm7,edx
        pcmpeqw xmm6,xmm6               ; xmm6 = all ones
        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
        alignx  16,7
.rowloop:
        push    ecx                     ; output_cols, restored for the next row
        push    edi                     ; output row-pointer cursor
        push    esi                     ; input row-pointer cursor

        mov     esi, JSAMPROW [esi]     ; inptr
        mov     edi, JSAMPROW [edi]     ; outptr

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .columnloop       ; a full vector of output columns remains
        alignx  16,7

.columnloop_r8:
        ; Tail: fewer output columns than one vector remain.  Load a single
        ; input vector, zero the second, and force ecx so the loop ends
        ; after this iteration.
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm1,xmm1
        mov     ecx, SIZEOF_XMMWORD
        jmp     short .downsample
        alignx  16,7

.columnloop:
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2,xmm0
        movdqa  xmm3,xmm1

        pand    xmm0,xmm6               ; even-indexed samples, one per word
        psrlw   xmm2,BYTE_BIT           ; odd-indexed samples, one per word
        pand    xmm1,xmm6
        psrlw   xmm3,BYTE_BIT

        paddw   xmm0,xmm2               ; sum of each horizontal pair
        paddw   xmm1,xmm3
        paddw   xmm0,xmm7               ; + alternating bias
        paddw   xmm1,xmm7
        psrlw   xmm0,1                  ; / 2
        psrlw   xmm1,1

        packuswb xmm0,xmm1              ; words -> bytes (xmm0 low half, xmm1 high half)

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    ecx,ecx
        jnz     short .columnloop_r8    ; leftover columns -> tail path

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     eax                             ; rowctr
        jg      near .rowloop

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (first argument at
; [ebp+8]); see the %defines below.  Nothing is returned.
;
; Operation:
;   1. expand_right_edge: with output_cols = width_blocks * 8, if
;      output_cols * 2 > image_width, each of the max_v_samp_factor input
;      rows is padded in place: the sample at column image_width-1 is
;      replicated into columns image_width .. output_cols*2-1.
;   2. h2v2_downsample: each of the v_samp_factor output rows is built from
;      two consecutive input rows (r0, r1).  Every output sample is
;      (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias) >> 2, where bias
;      alternates 1, 2, 1, 2, ... across output columns.
;
; Registers:
;   preserved : ebp, esi, edi (pushed/popped here)
;   clobbered : eax, ecx, edx, xmm0-xmm7, flags
;               (cld clears DF when the right edge is expanded)
;
; Memory notes:
;   * Rows are accessed with movdqa, so every input and output row pointer
;     must be 16-byte aligned.
;   * output_cols is a multiple of 8.  When 8 columns remain, the tail path
;     (.columnloop_r8) loads 16 bytes from each input row but still stores
;     a full 16 output bytes, the upper 8 of which are zero.
;     NOTE(review): this presumes each output row has room for those 8
;     extra bytes -- confirm against the buffer allocation.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
        jz      near .return            ; ZF from shl: no output columns

        mov     edx, JDIMENSION [img_width(ebp)]

        ; -- expand_right_edge

        push    ecx                     ; keep output_cols for the downsample pass
        shl     ecx,1                   ; output_cols * 2
        sub     ecx,edx                 ; ecx = number of pad samples per row
        jle     short .expand_end       ; input already wide enough

        mov     eax, INT [max_v_samp(ebp)]
        test    eax,eax
        jle     short .expand_end       ; no rows to pad

        cld                             ; rep stosb must move forward
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        alignx  16,7
.expandloop:
        push    eax                     ; row counter (al is reused as fill value)
        push    ecx                     ; pad count (rep stosb consumes ecx)

        mov     edi, JSAMPROW [esi]
        add     edi,edx                 ; edi -> first column past image_width
        mov     al, JSAMPLE [edi-1]     ; last real sample of this row

        rep stosb                       ; replicate it ecx times

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        dec     eax
        jg      short .expandloop

.expand_end:
        pop     ecx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
        test    eax,eax
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern
        movd    xmm7,edx
        pcmpeqw xmm6,xmm6               ; xmm6 = all ones
        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
        alignx  16,7
.rowloop:
        push    ecx                     ; output_cols, restored for the next row
        push    edi                     ; output row-pointer cursor
        push    esi                     ; input row-pointer cursor

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     edi, JSAMPROW [edi]                     ; outptr

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .columnloop       ; a full vector of output columns remains
        alignx  16,7

.columnloop_r8:
        ; Tail: fewer output columns than one vector remain.  Load a single
        ; vector from each input row, zero the second pair, and force ecx
        ; so the loop ends after this iteration.
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm2,xmm2
        pxor    xmm3,xmm3
        mov     ecx, SIZEOF_XMMWORD
        jmp     short .downsample
        alignx  16,7

.columnloop:
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
        ; First 16 input columns: horizontal pair sums for row 0 (xmm0)
        ; and row 1 (xmm1).
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        pand    xmm0,xmm6               ; even-indexed samples, one per word
        psrlw   xmm4,BYTE_BIT           ; odd-indexed samples, one per word
        pand    xmm1,xmm6
        psrlw   xmm5,BYTE_BIT
        paddw   xmm0,xmm4
        paddw   xmm1,xmm5

        ; Next 16 input columns: same for row 0 (xmm2) and row 1 (xmm3).
        movdqa  xmm4,xmm2
        movdqa  xmm5,xmm3
        pand    xmm2,xmm6
        psrlw   xmm4,BYTE_BIT
        pand    xmm3,xmm6
        psrlw   xmm5,BYTE_BIT
        paddw   xmm2,xmm4
        paddw   xmm3,xmm5

        paddw   xmm0,xmm1               ; add the two rows -> 2x2 sums
        paddw   xmm2,xmm3
        paddw   xmm0,xmm7               ; + alternating bias
        paddw   xmm2,xmm7
        psrlw   xmm0,2                  ; / 4
        psrlw   xmm2,2

        packuswb xmm0,xmm2              ; words -> bytes (xmm0 low half, xmm2 high half)

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
        add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    ecx,ecx
        jnz     near .columnloop_r8     ; leftover columns -> tail path

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     eax                             ; rowctr
        jg      near .rowloop

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16