blob: 7693285c90089a14226340d50445c84cb591f4a9 [file] [log] [blame]
;
; jcsample.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
19
20%include "jsimdext.inc"
21
; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; What the code below does:
;   1. output_cols = width_blocks * 8; returns immediately if that is zero.
;   2. expand_right_edge: when output_cols * 2 > image_width, each of the
;      first max_v_samp_factor input rows is padded in place, from column
;      image_width up to column output_cols * 2, with copies of the sample
;      at column image_width - 1.
;   3. For each of v_samp_factor rows, every output sample is
;         (in[2*i] + in[2*i + 1] + bias) >> 1
;      where bias alternates 0, 1, 0, 1, ... across output columns.
;
; Requirements that follow from the instructions used:
;   - Rows are read and written with movdqa, so every input and output row
;     pointer must be 16-byte aligned.
;   - Output is always stored one whole XMMWORD at a time, including the
;     last partial group, so output rows need room for that full store.
;   - The padding step reads the sample at image_width - 1.
;     NOTE(review): presumably callers never pass image_width == 0 when
;     padding is needed -- confirm against the caller.
;
; All four integer arguments are 32 bits wide.  Only the low halves of their
; registers are read (r10d..r13d), because the x86-64 calling conventions do
; not define the upper 32 bits of a 32-bit argument register.
;

; Register contents after collect_args (macro from jsimdext.inc):
; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; set up before collect_args; presumably the
                                        ; macro uses it to reach stack-passed
                                        ; arguments -- see jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d               ; width_blocks, zero-extended to 64 bits
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to produce

        mov     edx, r10d               ; rdx = image_width, zero-extended

        ; -- expand_right_edge

        push    rcx                     ; keep output_cols for the downsample pass
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = padding bytes needed per row
        jle     short .expand_end       ; rows are already wide enough

        mov     eax, r11d               ; max_v_samp_factor: use only the defined
                                        ; low 32 bits of the int argument
        test    eax, eax
        jle     short .expand_end       ; zero or negative row count: no padding

        cld                             ; rep stosb must run forward
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; row counter: al is overwritten below
        push    rcx                     ; pad count: consumed by rep stosb

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx                ; rdi -> first column past the image
        mov     al, JSAMPLE [rdi-1]     ; al = last real sample of this row

        rep stosb                       ; write rcx copies of it

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d               ; rowctr
        test    eax, eax
        jle     near .return

        ; xmm6/xmm7 are overwritten from here on.
        ; NOTE(review): these are callee-saved on Win64; presumably
        ; collect_args/uncollect_args preserve them -- confirm in jsimdext.inc.
        mov     edx, 0x00010000         ; bias pattern (only edx is read by movd)
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx                     ; the column loop consumes these three
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than one XMMWORD of output columns remains: read a single
        ; input XMMWORD and treat the second one as zero.
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        mov     rcx, SIZEOF_XMMWORD     ; so the subtraction below leaves zero
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1

        pand    xmm0, xmm6              ; even-column samples as words
        psrlw   xmm2, BYTE_BIT          ; odd-column samples as words
        pand    xmm1, xmm6
        psrlw   xmm3, BYTE_BIT

        paddw   xmm0, xmm2              ; even + odd
        paddw   xmm1, xmm3
        paddw   xmm0, xmm7              ; + alternating bias
        paddw   xmm1, xmm7
        psrlw   xmm0, 1                 ; / 2
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; words back to bytes

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    rcx, rcx
        jnz     short .columnloop_r8    ; partial group left over

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
DRCcdc8ac32009-06-25 20:38:31 +0000167
; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; What the code below does:
;   1. output_cols = width_blocks * 8; returns immediately if that is zero.
;   2. expand_right_edge: when output_cols * 2 > image_width, each of the
;      first max_v_samp_factor input rows is padded in place, from column
;      image_width up to column output_cols * 2, with copies of the sample
;      at column image_width - 1.
;   3. For each of v_samp_factor output rows, two consecutive input rows
;      (row0, row1) are consumed and every output sample is
;         (row0[2*i] + row0[2*i + 1] + row1[2*i] + row1[2*i + 1] + bias) >> 2
;      where bias alternates 1, 2, 1, 2, ... across output columns.
;
; Requirements that follow from the instructions used:
;   - Rows are read and written with movdqa, so every input and output row
;     pointer must be 16-byte aligned.
;   - Output is always stored one whole XMMWORD at a time, including the
;     last partial group, so output rows need room for that full store.
;   - The padding step reads the sample at image_width - 1.
;     NOTE(review): presumably callers never pass image_width == 0 when
;     padding is needed -- confirm against the caller.
;
; All four integer arguments are 32 bits wide.  Only the low halves of their
; registers are read (r10d..r13d), because the x86-64 calling conventions do
; not define the upper 32 bits of a 32-bit argument register.
;

; Register contents after collect_args (macro from jsimdext.inc):
; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; set up before collect_args; presumably the
                                        ; macro uses it to reach stack-passed
                                        ; arguments -- see jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d               ; width_blocks, zero-extended to 64 bits
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to produce

        mov     edx, r10d               ; rdx = image_width, zero-extended

        ; -- expand_right_edge

        push    rcx                     ; keep output_cols for the downsample pass
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = padding bytes needed per row
        jle     short .expand_end       ; rows are already wide enough

        mov     eax, r11d               ; max_v_samp_factor: use only the defined
                                        ; low 32 bits of the int argument
        test    eax, eax
        jle     short .expand_end       ; zero or negative row count: no padding

        cld                             ; rep stosb must run forward
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; row counter: al is overwritten below
        push    rcx                     ; pad count: consumed by rep stosb

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx                ; rdi -> first column past the image
        mov     al, JSAMPLE [rdi-1]     ; al = last real sample of this row

        rep stosb                       ; write rcx copies of it

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, r12d               ; rowctr
        test    eax, eax                ; signed 32-bit check: zero, or a count
        jle     near .return            ; with bit 31 set, means no rows

        ; xmm6/xmm7 are overwritten from here on.
        ; NOTE(review): these are callee-saved on Win64; presumably
        ; collect_args/uncollect_args preserve them -- confirm in jsimdext.inc.
        mov     edx, 0x00020001         ; bias pattern (only edx is read by movd)
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx                     ; the column loop consumes these three
        push    rdi
        push    rsi

        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     rdi, JSAMPROW [rdi]                     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than one XMMWORD of output columns remains: read a single
        ; XMMWORD from each input row and treat the second pair as zero.
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        mov     rcx, SIZEOF_XMMWORD     ; so the subtraction below leaves zero
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        ; First XMMWORD of each row: horizontal pair sums as words.
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1
        pand    xmm0, xmm6              ; even-column samples
        psrlw   xmm4, BYTE_BIT          ; odd-column samples
        pand    xmm1, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm0, xmm4              ; row0: even + odd
        paddw   xmm1, xmm5              ; row1: even + odd

        ; Second XMMWORD of each row, same treatment.
        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3
        pand    xmm2, xmm6
        psrlw   xmm4, BYTE_BIT
        pand    xmm3, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm2, xmm4
        paddw   xmm3, xmm5

        paddw   xmm0, xmm1              ; row0 + row1
        paddw   xmm2, xmm3
        paddw   xmm0, xmm7              ; + alternating bias
        paddw   xmm2, xmm7
        psrlw   xmm0, 2                 ; / 4
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; words back to bytes

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx, rcx
        jnz     near .columnloop_r8     ; partial group left over

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data (two rows consumed)
        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16