blob: cf7a776baf4c719dd2fabf5f3c52560000fa7e9a [file] [log] [blame]
DRCcdc8ac32009-06-25 20:38:31 +00001;
2; jcsamss2.asm - downsampling (64-bit SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright 2009 D. R. Commander
6;
7; Based on
8; x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; [TAB8]
19
20%include "jsimdext.inc"
21
; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
;
; The four scalar arguments are 32-bit C types.  Only the low 32 bits of
; r10-r13 are read (the 32-bit moves zero-extend), because the x86-64
; calling conventions leave the upper half of a register that carries a
; 32-bit argument undefined.
;
; Step 1 (expand_right_edge): every one of the max_v_samp_factor input rows
; is padded on the right, by replicating its last real sample, until it is
; 2 * width_blocks * 8 columns wide.
; Step 2 (h2v1_downsample): for each of the v_samp_factor rows,
;   out[k] = (in[2k] + in[2k+1] + bias) >> 1, bias alternating 0, 1, 0, 1...
; A full XMMWORD is always stored, even for a trailing group of 8 columns.
;
; Row data is accessed with movdqa, i.e. as 16-byte aligned memory.
; Scratch: rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm6, xmm7, flags.
; NOTE(review): rsi, rdi, xmm6 and xmm7 are callee-saved under the Microsoft
; x64 ABI; presumably collect_args/uncollect_args (jsimdext.inc) preserve
; them -- confirm against the macro definitions.
;

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    rbp
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d               ; width_blocks, zero-extended
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to do for zero blocks

        mov     edx, r10d               ; rdx = image_width, zero-extended

        ; -- expand_right_edge

        push    rcx                     ; keep output_cols for the next step
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = columns of padding needed
        jle     short .expand_end

        mov     eax, r11d               ; rows to pad = max_v_samp_factor
        test    eax, eax
        jle     short .expand_end

        cld
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is reused as the fill byte
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx                ; rdi = first column past the image
        mov     al, JSAMPLE [rdi-1]     ; al = last real sample of this row

        rep stosb                       ; replicate it across the padding

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d               ; rowctr = v_samp_factor
        test    eax, eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than SIZEOF_XMMWORD output columns remain: read one input
        ; vector only and treat the second one as zero.
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        mov     rcx, SIZEOF_XMMWORD     ; so that the loop ends after this pass
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1

        pand    xmm0, xmm6              ; even-numbered samples, as words
        psrlw   xmm2, BYTE_BIT          ; odd-numbered samples, as words
        pand    xmm1, xmm6
        psrlw   xmm3, BYTE_BIT

        paddw   xmm0, xmm2              ; in[2k] + in[2k+1]
        paddw   xmm1, xmm3
        paddw   xmm0, xmm7              ; + bias
        paddw   xmm1, xmm7
        psrlw   xmm0, 1                 ; / 2
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; back to 16 byte samples

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    rcx, rcx
        jnz     short .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
166
; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
;
; The four scalar arguments are 32-bit C types.  Only the low 32 bits of
; r10-r13 are read (the 32-bit moves zero-extend), because the x86-64
; calling conventions leave the upper half of a register that carries a
; 32-bit argument undefined.
;
; Step 1 (expand_right_edge): every one of the max_v_samp_factor input rows
; is padded on the right, by replicating its last real sample, until it is
; 2 * width_blocks * 8 columns wide.
; Step 2 (h2v2_downsample): each of the v_samp_factor output rows is built
; from two consecutive input rows in0/in1:
;   out[k] = (in0[2k] + in0[2k+1] + in1[2k] + in1[2k+1] + bias) >> 2,
; bias alternating 1, 2, 1, 2...
; A full XMMWORD is always stored, even for a trailing group of 8 columns.
;
; Row data is accessed with movdqa, i.e. as 16-byte aligned memory.
; Scratch: rax, rcx, rdx, rsi, rdi, xmm0-xmm7, flags.
; NOTE(review): rsi, rdi, xmm6 and xmm7 are callee-saved under the Microsoft
; x64 ABI; presumably collect_args/uncollect_args (jsimdext.inc) preserve
; them -- confirm against the macro definitions.
;

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    rbp
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d               ; width_blocks, zero-extended
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to do for zero blocks

        mov     edx, r10d               ; rdx = image_width, zero-extended

        ; -- expand_right_edge

        push    rcx                     ; keep output_cols for the next step
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = columns of padding needed
        jle     short .expand_end

        mov     eax, r11d               ; rows to pad = max_v_samp_factor
        test    eax, eax
        jle     short .expand_end

        cld
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is reused as the fill byte
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx                ; rdi = first column past the image
        mov     al, JSAMPLE [rdi-1]     ; al = last real sample of this row

        rep stosb                       ; replicate it across the padding

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, r12d               ; rowctr = v_samp_factor
        test    eax, eax
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx
        push    rdi
        push    rsi

        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     rdi, JSAMPROW [rdi]                     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than SIZEOF_XMMWORD output columns remain: read one input
        ; vector per row only and treat the second pair as zero.
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        mov     rcx, SIZEOF_XMMWORD     ; so that the loop ends after this pass
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        ; Horizontal pair sums of the first vector of each row.
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1
        pand    xmm0, xmm6              ; even-numbered samples, as words
        psrlw   xmm4, BYTE_BIT          ; odd-numbered samples, as words
        pand    xmm1, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm0, xmm4
        paddw   xmm1, xmm5

        ; Horizontal pair sums of the second vector of each row.
        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3
        pand    xmm2, xmm6
        psrlw   xmm4, BYTE_BIT
        pand    xmm3, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm2, xmm4
        paddw   xmm3, xmm5

        paddw   xmm0, xmm1              ; add the two rows together
        paddw   xmm2, xmm3
        paddw   xmm0, xmm7              ; + bias
        paddw   xmm2, xmm7
        psrlw   xmm0, 2                 ; / 4
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; back to 16 byte samples

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx, rcx
        jnz     near .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret