;
; jdsamss2.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
; Word constants used by the fancy (triangle filter) upsamplers in this file.
; Every PW_* label marks 16 bytes holding eight copies of one 16-bit value.
; The table is exported under the jconst_fancy_upsample_sse2 symbol.
;
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

PW_ONE:         times 8 dw 1            ; rounding term, h2v1 even outputs
PW_TWO:         times 8 dw 2            ; rounding term, h2v1 odd outputs
PW_THREE:       times 8 dw 3            ; weight of the nearer sample
PW_SEVEN:       times 8 dw 7            ; rounding term, h2v2 odd outputs
PW_EIGHT:       times 8 dw 8            ; rounding term, h2v2 even outputs

        alignz  16

; --------------------------------------------------------------------------
; Everything below is 64-bit code.
;
        SECTION SEG_TEXT
        BITS    64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; For every input sample s[i] of a row, two output samples are produced:
;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
; The left neighbour of the first sample is the first sample itself, and the
; right neighbour of the last processed sample is that sample itself.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY * output_data_ptr);
;
; Notes:
;  - Rows are read and written with movdqa in whole 16-byte / 32-byte blocks,
;    so row pointers must be 16-byte aligned and the rows must be padded to
;    a multiple of the block size.
;  - When downsampled_width is not a multiple of SIZEOF_XMMWORD, one dummy
;    sample (a copy of the last real sample) is WRITTEN into the input row at
;    index downsampled_width.
;  - Returns immediately if downsampled_width or max_v_samp_factor is zero.
;  - NOTE(review): collect_args lives in jsimdext.inc and is not visible here.
;    Assuming it copies the raw argument registers into r10-r13, the upper
;    halves of the two 32-bit arguments are unspecified by the calling
;    convention, so only their low 32 bits are consumed below.
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
        push    rbp
        mov     rbp, rsp
        collect_args

        mov     eax, r11d               ; colctr (zero-extends the 32-bit arg)
        test    rax, rax
        jz      near .return

        movsxd  rcx, r10d               ; rowctr (sign-extends the int arg)
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
        push    rax                     ; colctr
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    xmm0, xmm0              ; xmm0=(all 0's)
        pcmpeqb xmm7, xmm7
        psrldq  xmm7, (SIZEOF_XMMWORD-1)        ; mask: lowest byte only
        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm7=( 0 -- ... --)

        ; round colctr up to a whole number of 16-sample blocks
        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; last block: the right neighbour of sample 15 is sample 15 itself
        pcmpeqb xmm6, xmm6
        pslldq  xmm6, (SIZEOF_XMMWORD-1)        ; mask: highest byte only
        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        jmp     short .upsample

.columnloop:
        ; inner block: the right neighbour is sample 0 of the next block
        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        pslldq  xmm6, (SIZEOF_XMMWORD-1)

.upsample:
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, xmm1
        movdqa  xmm3, xmm1              ; xmm1=( 0  1  2 ... 13 14 15)
        pslldq  xmm2, 1                 ; xmm2=(--  0  1 ... 12 13 14)
        psrldq  xmm3, 1                 ; xmm3=( 1  2  3 ... 14 15 --)

        por     xmm2, xmm7              ; xmm2=(-1  0  1 ... 12 13 14)
        por     xmm3, xmm6              ; xmm3=( 1  2  3 ... 14 15 16)

        ; carry sample 15 over as the "-1" sample of the next block
        movdqa  xmm7, xmm1
        psrldq  xmm7, (SIZEOF_XMMWORD-1)        ; xmm7=(15 -- -- ... -- -- --)

        ; widen bytes to words
        movdqa  xmm4, xmm1
        punpcklbw xmm1, xmm0            ; xmm1=( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4, xmm0            ; xmm4=( 8  9 10 11 12 13 14 15)
        movdqa  xmm5, xmm2
        punpcklbw xmm2, xmm0            ; xmm2=(-1  0  1  2  3  4  5  6)
        punpckhbw xmm5, xmm0            ; xmm5=( 7  8  9 10 11 12 13 14)
        movdqa  xmm6, xmm3
        punpcklbw xmm3, xmm0            ; xmm3=( 1  2  3  4  5  6  7  8)
        punpckhbw xmm6, xmm0            ; xmm6=( 9 10 11 12 13 14 15 16)

        ; RIP-relative constant loads keep the code position independent
        pmullw  xmm1, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]
        paddw   xmm2, [rel PW_ONE]
        paddw   xmm5, [rel PW_ONE]
        paddw   xmm3, [rel PW_TWO]
        paddw   xmm6, [rel PW_TWO]

        paddw   xmm2, xmm1
        paddw   xmm5, xmm4
        psrlw   xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
        paddw   xmm3, xmm1
        paddw   xmm6, xmm4
        psrlw   xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

        ; interleave: odd outputs go into the high byte of each word
        psllw   xmm3, BYTE_BIT
        psllw   xmm6, BYTE_BIT
        por     xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
        por     xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

        sub     rax, byte SIZEOF_XMMWORD
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop
        test    rax, rax
        jnz     near .columnloop_last

        pop     rsi
        pop     rdi
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; For each input row, two output rows are produced.  First a vertical pass
; forms two 16-bit intermediate rows
;   Int0[i] = 3*row[0][i] + row[-1][i]      (upper output row)
;   Int1[i] = 3*row[0][i] + row[+1][i]      (lower output row)
; which are parked temporarily in the output rows themselves.  A horizontal
; pass then replaces them with the final samples:
;   out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;   out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
; The left neighbour of the first column is the first column itself, and the
; right neighbour of the last processed column is that column itself.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY * output_data_ptr);
;
; Notes:
;  - The row pointers at input_data[-1] and input_data[+1] are dereferenced,
;    so the caller must provide valid rows above and below each input row.
;  - All row accesses use movdqa on whole 16-byte blocks: row pointers must
;    be 16-byte aligned and rows padded to a multiple of the block size.
;  - When downsampled_width is not a multiple of SIZEOF_XMMWORD, one dummy
;    sample (a copy of the last real sample) is WRITTEN into each of the
;    three input rows at index downsampled_width.
;  - rowctr is decremented by 2 per input row (two output rows each).
;  - NOTE(review): collect_args lives in jsimdext.inc and is not visible here.
;    Assuming it copies the raw argument registers into r10-r13, the upper
;    halves of the two 32-bit arguments are unspecified by the calling
;    convention, so only their low 32 bits are consumed below.
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

; xmmword wk[WK_NUM], located just below the 16-byte aligned rbp:
;   wk(0)/wk(1) = left-neighbour word for the upper/lower row
;   wk(2)/wk(3) = right-neighbour word for the upper/lower row
%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_XMMWORD
%define WK_NUM  4

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
        push    rbp
        mov     rax, rsp                ; rax = rsp pointing at the saved rbp
        sub     rsp, byte 8             ; room for the 8-byte saved rsp below
                                        ; (4 is too small if rsp is unaligned)
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp], rax              ; remember the unaligned rsp
        mov     rbp, rsp                ; rbp = aligned frame base
        lea     rsp, [wk(0)]            ; reserve wk[] below rbp
        push    rbx
        collect_args

        mov     eax, r11d               ; colctr (zero-extends the 32-bit arg)
        test    rax, rax
        jz      near .return

        movsxd  rcx, r10d               ; rowctr (sign-extends the int arg)
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
        push    rax                     ; colctr
        push    rcx
        push    rdi
        push    rsi

        mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1

        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        push    rdx                     ; dl is used as scratch below
        mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     rdx
.skip:
        ; -- process the first column block

        movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
        movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
        movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]

        pxor    xmm3, xmm3              ; xmm3=(all 0's)
        movdqa  xmm4, xmm0
        punpcklbw xmm0, xmm3            ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4, xmm3            ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa  xmm5, xmm1
        punpcklbw xmm1, xmm3            ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5, xmm3            ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa  xmm6, xmm2
        punpcklbw xmm2, xmm3            ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6, xmm3            ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]

        pcmpeqb xmm7, xmm7
        psrldq  xmm7, (SIZEOF_XMMWORD-2)        ; mask: lowest word only

        paddw   xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

        pand    xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
        pand    xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

        movdqa  XMMWORD [wk(0)], xmm1
        movdqa  XMMWORD [wk(1)], xmm2

        ; round colctr up to a whole number of 16-sample blocks
        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; -- process the last column block

        pcmpeqb xmm1, xmm1
        pslldq  xmm1, (SIZEOF_XMMWORD-2)        ; mask: highest word only
        movdqa  xmm2, xmm1

        ; Full 64-bit base registers are required here: addressing through
        ; edx/edi would truncate the output row pointers to 32 bits.
        pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

        jmp     near .upsample

.columnloop:
        ; -- process the next column block

        movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
        movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]

        pxor    xmm3, xmm3              ; xmm3=(all 0's)
        movdqa  xmm4, xmm0
        punpcklbw xmm0, xmm3            ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4, xmm3            ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa  xmm5, xmm1
        punpcklbw xmm1, xmm3            ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5, xmm3            ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa  xmm6, xmm2
        punpcklbw xmm2, xmm3            ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6, xmm3            ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]

        paddw   xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

        pslldq  xmm1, (SIZEOF_XMMWORD-2)        ; xmm1=(-- -- -- -- -- -- --  0)
        pslldq  xmm2, (SIZEOF_XMMWORD-2)        ; xmm2=(-- -- -- -- -- -- --  0)

        movdqa  XMMWORD [wk(2)], xmm1
        movdqa  XMMWORD [wk(3)], xmm2

.upsample:
        ; -- process the upper row

        movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

        movdqa  xmm0, xmm7              ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm4, xmm3              ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm0, 2                 ; xmm0=( 1  2  3  4  5  6  7 --)
        pslldq  xmm4, (SIZEOF_XMMWORD-2)        ; xmm4=(-- -- -- -- -- -- --  8)
        movdqa  xmm5, xmm7
        movdqa  xmm6, xmm3
        psrldq  xmm5, (SIZEOF_XMMWORD-2)        ; xmm5=( 7 -- -- -- -- -- -- --)
        pslldq  xmm6, 2                 ; xmm6=(--  8  9 10 11 12 13 14)

        por     xmm0, xmm4              ; xmm0=( 1  2  3  4  5  6  7  8)
        por     xmm5, xmm6              ; xmm5=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1, xmm7
        movdqa  xmm2, xmm3
        pslldq  xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm2, 2                 ; xmm2=( 9 10 11 12 13 14 15 --)
        movdqa  xmm4, xmm3
        psrldq  xmm4, (SIZEOF_XMMWORD-2)        ; xmm4=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(0)], xmm4   ; left neighbour for the next block

        pmullw  xmm7, [rel PW_THREE]
        pmullw  xmm3, [rel PW_THREE]
        paddw   xmm1, [rel PW_EIGHT]
        paddw   xmm5, [rel PW_EIGHT]
        paddw   xmm0, [rel PW_SEVEN]
        paddw   xmm2, [rel PW_SEVEN]

        paddw   xmm1, xmm7
        paddw   xmm5, xmm3
        psrlw   xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
        paddw   xmm0, xmm7
        paddw   xmm2, xmm3
        psrlw   xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

        psllw   xmm0, BYTE_BIT
        psllw   xmm2, BYTE_BIT
        por     xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
        por     xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

        ; -- process the lower row

        movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
        movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  xmm7, xmm6              ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm3, xmm4              ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm7, 2                 ; xmm7=( 1  2  3  4  5  6  7 --)
        pslldq  xmm3, (SIZEOF_XMMWORD-2)        ; xmm3=(-- -- -- -- -- -- --  8)
        movdqa  xmm0, xmm6
        movdqa  xmm2, xmm4
        psrldq  xmm0, (SIZEOF_XMMWORD-2)        ; xmm0=( 7 -- -- -- -- -- -- --)
        pslldq  xmm2, 2                 ; xmm2=(--  8  9 10 11 12 13 14)

        por     xmm7, xmm3              ; xmm7=( 1  2  3  4  5  6  7  8)
        por     xmm0, xmm2              ; xmm0=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1, xmm6
        movdqa  xmm5, xmm4
        pslldq  xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm5, 2                 ; xmm5=( 9 10 11 12 13 14 15 --)
        movdqa  xmm3, xmm4
        psrldq  xmm3, (SIZEOF_XMMWORD-2)        ; xmm3=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(1)], xmm3   ; left neighbour for the next block

        pmullw  xmm6, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]
        paddw   xmm1, [rel PW_EIGHT]
        paddw   xmm0, [rel PW_EIGHT]
        paddw   xmm7, [rel PW_SEVEN]
        paddw   xmm5, [rel PW_SEVEN]

        paddw   xmm1, xmm6
        paddw   xmm0, xmm4
        psrlw   xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
        paddw   xmm7, xmm6
        paddw   xmm5, xmm4
        psrlw   xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

        psllw   xmm7, BYTE_BIT
        psllw   xmm5, BYTE_BIT
        por     xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
        por     xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

        sub     rax, byte SIZEOF_XMMWORD
        add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
        add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
        add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop
        test    rax, rax
        jnz     near .columnloop_last

        pop     rsi
        pop     rdi
        pop     rcx
        pop     rax

        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     rcx, byte 2                     ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbx
        mov     rsp, rbp                ; rsp <- aligned frame base
        pop     rsp                     ; rsp <- value saved in the prologue
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; Every input sample is simply written twice to the output row.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY * output_data_ptr);
;
; Notes:
;  - output_width is rounded UP to a multiple of 2*SIZEOF_XMMWORD, and that
;    many bytes are written to every output row (half as many are read from
;    every input row).  Rows must therefore be padded accordingly and, as
;    movdqa is used, 16-byte aligned.
;  - Returns immediately if the rounded width or max_v_samp_factor is zero.
;  - NOTE(review): collect_args lives in jsimdext.inc and is not visible here.
;    Assuming it copies the raw argument registers into r10-r13, the upper
;    halves of the two 32-bit arguments are unspecified by the calling
;    convention, so only their low 32 bits are consumed below.
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
        push    rbp
        mov     rbp, rsp
        collect_args

        mov     edx, r11d               ; zero-extends the 32-bit width
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; rdx = padded output width
        jz      near .return

        movsxd  rcx, r10d               ; rowctr (sign-extends the int arg)
        test    rcx, rcx
        jz      short .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr
        mov     rax, rdx                ; colctr (output bytes left in row)
.columnloop:

        ; first half: 16 input samples -> 32 output samples
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa  xmm1, xmm0
        punpcklbw xmm0, xmm0            ; duplicate samples 0..7
        punpckhbw xmm1, xmm1            ; duplicate samples 8..15

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        ; second half: the loop body is unrolled once
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa  xmm3, xmm2
        punpcklbw xmm2, xmm2
        punpckhbw xmm3, xmm3

        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      short .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; Every input sample is written twice horizontally, and the resulting row is
; stored to two consecutive output rows.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY * output_data_ptr);
;
; Notes:
;  - output_width is rounded UP to a multiple of 2*SIZEOF_XMMWORD, and that
;    many bytes are written to every output row (half as many are read from
;    every input row).  Rows must therefore be padded accordingly and, as
;    movdqa is used, 16-byte aligned.
;  - rowctr is decremented by 2 per input row (two output rows each).
;  - Returns immediately if the rounded width or max_v_samp_factor is zero.
;  - NOTE(review): collect_args lives in jsimdext.inc and is not visible here.
;    Assuming it copies the raw argument registers into r10-r13, the upper
;    halves of the two 32-bit arguments are unspecified by the calling
;    convention, so only their low 32 bits are consumed below.
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
        push    rbp
        mov     rbp, rsp
        push    rbx                     ; rbx holds outptr0 below
        collect_args

        mov     edx, r11d               ; zero-extends the 32-bit width
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; rdx = padded output width
        jz      near .return

        movsxd  rcx, r10d               ; rowctr (sign-extends the int arg)
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]                     ; inptr
        mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov     rax, rdx                ; colctr (output bytes left in row)
.columnloop:

        ; first half: 16 input samples -> 32 output samples, in both rows
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa  xmm1, xmm0
        punpcklbw xmm0, xmm0            ; duplicate samples 0..7
        punpckhbw xmm1, xmm1            ; duplicate samples 8..15

        movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        ; second half: the loop body is unrolled once
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa  xmm3, xmm2
        punpcklbw xmm2, xmm2
        punpckhbw xmm3, xmm3

        movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     rcx, byte 2                     ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbx
        pop     rbp
        ret