blob: 88564e43935b4348f9e591bf24a5a290d074e567 [file] [log] [blame]
;
; jdsample.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
18
Pierre Ossman3a65ef42009-03-16 13:34:18 +000019%include "jsimdext.inc"
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000020
; --------------------------------------------------------------------------
; Constant table used by the "fancy" (triangle-filter) upsamplers below.
; Each entry is four 16-bit words holding the same value, i.e. one MMWORD,
; so it can be used directly as a packed-word memory operand.
;
;   PW_THREE - weight applied to the nearer sample (pmullw)
;   PW_ONE   - rounding term for the even h2v1 outputs  (>> 2)
;   PW_TWO   - rounding term for the odd  h2v1 outputs  (>> 2)
;   PW_EIGHT - rounding term for the even h2v2 outputs  (>> 4)
;   PW_SEVEN - rounding term for the odd  h2v2 outputs  (>> 4)
;
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fancy_upsample_mmx)

EXTN(jconst_fancy_upsample_mmx):

PW_ONE          times 4 dw  1
PW_TWO          times 4 dw  2
PW_THREE        times 4 dw  3
PW_SEVEN        times 4 dw  7
PW_EIGHT        times 4 dw  8

        alignz  16
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +000036
; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY * output_data_ptr);
;
; For every input sample s[i] of a row, two output samples are produced:
;
;       out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
;       out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
;
; The left neighbour of the first sample of a row is that sample itself,
; and the right neighbour of the last sample of the final 8-sample block
; is that sample itself.
;
; Arguments are read from the stack relative to ebp (+8 .. +20).
; The routine returns without doing anything when downsampled_width or
; max_v_samp_factor is zero.
;
; Register roles:
;   eax = colctr (samples left in the row, rounded up to a multiple of 8)
;   ecx = rowctr
;   esi = input_data  / inptr  (inside the row)
;   edi = output_data / outptr (inside the row)
;   ebx = GOT address (base for GOTOFF constant references)
;   dl  = scratch;  mm0 = zero;  mm7 = left neighbour carried between blocks
; Saved/restored: ebp, ebx (through pushpic/poppic), esi, edi.
; eax, ecx, edx and mm0-mm7 are overwritten.
;
; NOTE(review): rows are read 8 bytes and written 16 bytes at a time, and
; one byte past the row end is written in the INPUT row when the width is
; not a multiple of 8, so callers presumably provide padded rows - confirm.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_mmx)

EXTN(jsimd_h2v1_fancy_upsample_mmx):
        push    ebp
        mov     ebp,esp
        pushpic ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        mov     eax, JDIMENSION [downsamp_width(ebp)]   ; colctr
        test    eax,eax
        jz      near .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        ; Keep the row-array pointers and the untouched width; the
        ; registers are reused as in-row pointers / counter below.
        push    eax                     ; colctr
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]     ; inptr
        mov     edi, JSAMPROW [edi]     ; outptr

        ; Width not a multiple of 8: duplicate the last real sample into
        ; the slot just after it.
        test    eax, SIZEOF_MMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    mm0,mm0                 ; mm0=(all 0's)
        ; mm7 = sample 0 in the lowest byte: left neighbour of sample 0
        pcmpeqb mm7,mm7
        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
        pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD]

        ; Round colctr up to a whole number of 8-sample blocks.
        add     eax, byte SIZEOF_MMWORD-1
        and     eax, byte -SIZEOF_MMWORD
        cmp     eax, byte SIZEOF_MMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; Final block: right neighbour of sample 7 is sample 7 itself
        ; (kept in the highest byte of mm6).
        pcmpeqb mm6,mm6
        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
        pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]
        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; Non-final block: right neighbour of sample 7 is sample 0 of the
        ; next block, moved into the highest byte of mm6.
        movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]
        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT

.upsample:
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2,mm1
        movq    mm3,mm1                 ; mm1=( 0 1 2 3 4 5 6 7)
        psllq   mm2,BYTE_BIT            ; mm2=( - 0 1 2 3 4 5 6)
        psrlq   mm3,BYTE_BIT            ; mm3=( 1 2 3 4 5 6 7 -)

        por     mm2,mm7                 ; mm2=(-1 0 1 2 3 4 5 6)
        por     mm3,mm6                 ; mm3=( 1 2 3 4 5 6 7 8)

        ; Sample 7 becomes the left neighbour for the next block.
        movq    mm7,mm1
        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)

        ; Widen bytes to words (interleave with zero).
        movq      mm4,mm1
        punpcklbw mm1,mm0               ; mm1=( 0 1 2 3)
        punpckhbw mm4,mm0               ; mm4=( 4 5 6 7)
        movq      mm5,mm2
        punpcklbw mm2,mm0               ; mm2=(-1 0 1 2)
        punpckhbw mm5,mm0               ; mm5=( 3 4 5 6)
        movq      mm6,mm3
        punpcklbw mm3,mm0               ; mm3=( 1 2 3 4)
        punpckhbw mm6,mm0               ; mm6=( 5 6 7 8)

        ; centre*3, left+1 (even outputs), right+2 (odd outputs)
        pmullw  mm1,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
        paddw   mm2,[GOTOFF(ebx,PW_ONE)]
        paddw   mm5,[GOTOFF(ebx,PW_ONE)]
        paddw   mm3,[GOTOFF(ebx,PW_TWO)]
        paddw   mm6,[GOTOFF(ebx,PW_TWO)]

        paddw   mm2,mm1
        paddw   mm5,mm4
        psrlw   mm2,2                   ; mm2=OutLE=( 0  2  4  6)
        psrlw   mm5,2                   ; mm5=OutHE=( 8 10 12 14)
        paddw   mm3,mm1
        paddw   mm6,mm4
        psrlw   mm3,2                   ; mm3=OutLO=( 1  3  5  7)
        psrlw   mm6,2                   ; mm6=OutHO=( 9 11 13 15)

        ; Interleave even/odd results: odd values go to the high byte
        ; of each word.
        psllw   mm3,BYTE_BIT
        psllw   mm6,BYTE_BIT
        por     mm2,mm3                 ; mm2=OutL=( 0  1  2  3  4  5  6  7)
        por     mm5,mm6                 ; mm5=OutH=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5

        sub     eax, byte SIZEOF_MMWORD
        add     esi, byte 1*SIZEOF_MMWORD       ; inptr
        add     edi, byte 2*SIZEOF_MMWORD       ; outptr
        cmp     eax, byte SIZEOF_MMWORD
        ja      near .columnloop                ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last           ; exactly one block left

        pop     esi
        pop     edi
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        poppic  ebx
        pop     ebp
        ret
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000195
; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY * output_data_ptr);
;
; Each pass of the row loop reads three input rows (the pointers at
; input_data[-1], [0] and [+1]) and writes two output rows.  The work is
; done in two steps per 8-sample block:
;
;   1. vertical, kept as 16-bit words and parked in the output rows:
;         Int0[i] = 3*row0[i] + row_above[i]
;         Int1[i] = 3*row0[i] + row_below[i]
;   2. horizontal, on those intermediates (Int = Int0 or Int1):
;         out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;         out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
;
; At the left edge Int[-1] is Int[0]; at the right edge of the final block
; Int[8] is Int[7].
;
; Stack frame: ebp is aligned down to SIZEOF_MMWORD and [ebp] holds the
; unaligned frame base; wk[0..WK_NUM-1] are MMWORD scratch slots just
; below ebp and gotptr sits just below wk(0).
;   wk(0)/wk(1) = left  neighbour word for the Int0/Int1 row
;   wk(2)/wk(3) = right neighbour word for the Int0/Int1 row
;
; Register roles inside the column loop:
;   eax = colctr, ecx = inptr1(above), ebx = inptr0, esi = inptr1(below),
;   edx = outptr0, edi = outptr1.  Because ebx is busy as inptr0, it is
;   saved with pushpic and reloaded from gotptr around the constant
;   references.
; Saved/restored: ebp, ebx, esi, edi.  eax, ecx, edx, mm0-mm7 are overwritten.
;
; The routine returns without doing anything when downsampled_width or
; max_v_samp_factor is zero.
;
; NOTE(review): as in the h2v1 variant, whole MMWORDs are read/written and
; one byte past the end of each INPUT row is written when the width is not
; a multiple of 8; callers presumably provide padded rows - confirm.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          4
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_mmx)

EXTN(jsimd_h2v2_fancy_upsample_mmx):
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax                       ; remember unaligned frame base
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM]
        pushpic eax             ; make a room for GOT address
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address
        movpic  POINTER [gotptr], ebx   ; save GOT address

        mov     edx,eax                         ; edx = original ebp
        mov     eax, JDIMENSION [downsamp_width(edx)]   ; colctr
        test    eax,eax
        jz      near .return

        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
        mov     edi, POINTER [output_data_ptr(edx)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        ; Keep width, row counter and row-array pointers; all four
        ; registers are reused inside the row.
        push    eax                                     ; colctr
        push    ecx
        push    edi
        push    esi

        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1

        ; Width not a multiple of 8: in each of the three input rows,
        ; duplicate the last real sample into the slot just after it.
        ; edx is saved because dl is used as the byte scratch.
        test    eax, SIZEOF_MMWORD-1
        jz      short .skip
        push    edx
        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     edx
.skip:
        ; -- process the first column block

        movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
        movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
        movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      mm3,mm3               ; mm3=(all 0's)
        movq      mm4,mm0
        punpcklbw mm0,mm3               ; mm0=row[ 0][0]( 0 1 2 3)
        punpckhbw mm4,mm3               ; mm4=row[ 0][0]( 4 5 6 7)
        movq      mm5,mm1
        punpcklbw mm1,mm3               ; mm1=row[-1][0]( 0 1 2 3)
        punpckhbw mm5,mm3               ; mm5=row[-1][0]( 4 5 6 7)
        movq      mm6,mm2
        punpcklbw mm2,mm3               ; mm2=row[+1][0]( 0 1 2 3)
        punpckhbw mm6,mm3               ; mm6=row[+1][0]( 4 5 6 7)

        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]

        ; mm7 = mask selecting the lowest word
        pcmpeqb mm7,mm7
        psrlq   mm7,(SIZEOF_MMWORD-2)*BYTE_BIT

        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)

        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6

        ; Left neighbour of element 0 is element 0 itself.
        pand    mm1,mm7                 ; mm1=( 0 - - -)
        pand    mm2,mm7                 ; mm2=( 0 - - -)

        movq    MMWORD [wk(0)], mm1
        movq    MMWORD [wk(1)], mm2

        poppic  ebx

        ; Round colctr up to a whole number of 8-sample blocks.
        add     eax, byte SIZEOF_MMWORD-1
        and     eax, byte -SIZEOF_MMWORD
        cmp     eax, byte SIZEOF_MMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; -- process the last column block

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        ; Right neighbour of element 7 is element 7 itself: keep only
        ; the highest word of the already-stored IntH values.
        pcmpeqb mm1,mm1
        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
        movq    mm2,mm1

        pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
        pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)

        movq    MMWORD [wk(2)], mm1
        movq    MMWORD [wk(3)], mm2

        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; -- process the next column block
        ; (vertical step for block n+1, so that its element 0 can serve
        ;  as the right neighbour of block n)

        movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
        movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      mm3,mm3               ; mm3=(all 0's)
        movq      mm4,mm0
        punpcklbw mm0,mm3               ; mm0=row[ 0][1]( 0 1 2 3)
        punpckhbw mm4,mm3               ; mm4=row[ 0][1]( 4 5 6 7)
        movq      mm5,mm1
        punpcklbw mm1,mm3               ; mm1=row[-1][1]( 0 1 2 3)
        punpckhbw mm5,mm3               ; mm5=row[-1][1]( 4 5 6 7)
        movq      mm6,mm2
        punpcklbw mm2,mm3               ; mm2=row[+1][1]( 0 1 2 3)
        punpckhbw mm6,mm3               ; mm6=row[+1][1]( 4 5 6 7)

        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]

        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)

        movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
        movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6

        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
        psllq   mm2,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)

        movq    MMWORD [wk(2)], mm1
        movq    MMWORD [wk(3)], mm2

.upsample:
        ; -- process the upper row

        movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
        movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)

        movq    mm0,mm7
        movq    mm4,mm3
        psrlq   mm0,2*BYTE_BIT                  ; mm0=( 1 2 3 -)
        psllq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
        movq    mm5,mm7
        movq    mm6,mm3
        psrlq   mm5,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
        psllq   mm6,2*BYTE_BIT                  ; mm6=( - 4 5 6)

        por     mm0,mm4                         ; mm0=( 1 2 3 4)
        por     mm5,mm6                         ; mm5=( 3 4 5 6)

        movq    mm1,mm7
        movq    mm2,mm3
        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
        psrlq   mm2,2*BYTE_BIT                  ; mm2=( 5 6 7 -)
        movq    mm4,mm3
        psrlq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)

        por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
        por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)

        movq    MMWORD [wk(0)], mm4             ; left neighbour for next block

        pmullw  mm7,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm3,[GOTOFF(ebx,PW_THREE)]
        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm5,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm0,[GOTOFF(ebx,PW_SEVEN)]
        paddw   mm2,[GOTOFF(ebx,PW_SEVEN)]

        paddw   mm1,mm7
        paddw   mm5,mm3
        psrlw   mm1,4                   ; mm1=Out0LE=( 0  2  4  6)
        psrlw   mm5,4                   ; mm5=Out0HE=( 8 10 12 14)
        paddw   mm0,mm7
        paddw   mm2,mm3
        psrlw   mm0,4                   ; mm0=Out0LO=( 1  3  5  7)
        psrlw   mm2,4                   ; mm2=Out0HO=( 9 11 13 15)

        psllw   mm0,BYTE_BIT
        psllw   mm2,BYTE_BIT
        por     mm1,mm0                 ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
        por     mm5,mm2                 ; mm5=Out0H=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5

        ; -- process the lower row

        movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
        movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)

        movq    mm7,mm6
        movq    mm3,mm4
        psrlq   mm7,2*BYTE_BIT                  ; mm7=( 1 2 3 -)
        psllq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
        movq    mm0,mm6
        movq    mm2,mm4
        psrlq   mm0,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
        psllq   mm2,2*BYTE_BIT                  ; mm2=( - 4 5 6)

        por     mm7,mm3                         ; mm7=( 1 2 3 4)
        por     mm0,mm2                         ; mm0=( 3 4 5 6)

        movq    mm1,mm6
        movq    mm5,mm4
        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
        psrlq   mm5,2*BYTE_BIT                  ; mm5=( 5 6 7 -)
        movq    mm3,mm4
        psrlq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)

        por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
        por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)

        movq    MMWORD [wk(1)], mm3             ; left neighbour for next block

        pmullw  mm6,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm0,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm7,[GOTOFF(ebx,PW_SEVEN)]
        paddw   mm5,[GOTOFF(ebx,PW_SEVEN)]

        paddw   mm1,mm6
        paddw   mm0,mm4
        psrlw   mm1,4                   ; mm1=Out1LE=( 0  2  4  6)
        psrlw   mm0,4                   ; mm0=Out1HE=( 8 10 12 14)
        paddw   mm7,mm6
        paddw   mm5,mm4
        psrlw   mm7,4                   ; mm7=Out1LO=( 1  3  5  7)
        psrlw   mm5,4                   ; mm5=Out1HO=( 9 11 13 15)

        psllw   mm7,BYTE_BIT
        psllw   mm5,BYTE_BIT
        por     mm1,mm7                 ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
        por     mm0,mm5                 ; mm0=Out1H=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0

        poppic  ebx

        sub     eax, byte SIZEOF_MMWORD
        add     ecx, byte 1*SIZEOF_MMWORD       ; inptr1(above)
        add     ebx, byte 1*SIZEOF_MMWORD       ; inptr0
        add     esi, byte 1*SIZEOF_MMWORD       ; inptr1(below)
        add     edx, byte 2*SIZEOF_MMWORD       ; outptr0
        add     edi, byte 2*SIZEOF_MMWORD       ; outptr1
        cmp     eax, byte SIZEOF_MMWORD
        ja      near .columnloop                ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last           ; exactly one block left

        pop     esi
        pop     edi
        pop     ecx
        pop     eax

        ; One input row consumed, two output rows produced.
        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000526
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Every input sample is simply written twice: 8 input bytes become 16
; output bytes by interleaving an MMWORD with itself.  output_width is
; rounded up to a multiple of 2*SIZEOF_MMWORD, so whole 16-byte output
; units are always written; the column loop is unrolled twice.
;
; The routine returns without doing anything when the rounded width or
; max_v_samp_factor is zero.
;
; Register roles:
;   edx = rounded output width, eax = colctr (output bytes left in row),
;   ecx = rowctr, esi = input_data / inptr, edi = output_data / outptr.
; Saved/restored: ebp, esi, edi.  eax, ecx, edx, mm0-mm3 are overwritten;
; ebx is not used.
;
; NOTE(review): because of the rounding, up to 2*SIZEOF_MMWORD-1 bytes past
; output_width may be written per row; callers presumably provide padded
; rows - confirm.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_mmx)

EXTN(jsimd_h2v1_upsample_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        ; edx = output_width rounded up to a multiple of 16
        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_MMWORD)-1
        and     edx, byte -(2*SIZEOF_MMWORD)
        jz      short .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      short .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]             ; inptr
        mov     edi, JSAMPROW [edi]             ; outptr
        mov     eax,edx                         ; colctr
        alignx  16,7
.columnloop:

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]

        ; Interleaving a register with itself duplicates every byte.
        movq      mm1,mm0
        punpcklbw mm0,mm0
        punpckhbw mm1,mm1

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        ; Second (unrolled) 8-sample block.
        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]

        movq      mm3,mm2
        punpcklbw mm2,mm2
        punpckhbw mm3,mm3

        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     edi, byte 4*SIZEOF_MMWORD       ; outptr
        jmp     short .columnloop
        alignx  16,7

.nextrow:
        pop     esi
        pop     edi

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      short .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000627
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Every input sample is written twice horizontally, and the resulting row
; is stored to two consecutive output rows.  output_width is rounded up to
; a multiple of 2*SIZEOF_MMWORD, so whole 16-byte output units are always
; written; the column loop is unrolled twice.  Each pass of the row loop
; consumes one input row and produces two output rows.
;
; The routine returns without doing anything when the rounded width or
; max_v_samp_factor is zero.
;
; Register roles:
;   edx = rounded output width, eax = colctr (output bytes left in row),
;   ecx = rowctr, esi = input_data / inptr,
;   ebx = outptr0, edi = output_data / outptr1.
; Saved/restored: ebp, ebx, esi, edi.  eax, ecx, edx, mm0-mm3 are overwritten.
;
; NOTE(review): because of the rounding, up to 2*SIZEOF_MMWORD-1 bytes past
; output_width may be written per row; callers presumably provide padded
; rows - confirm.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v2_upsample_mmx)

EXTN(jsimd_h2v2_upsample_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        ; edx = output_width rounded up to a multiple of 16
        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_MMWORD)-1
        and     edx, byte -(2*SIZEOF_MMWORD)
        jz      near .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      short .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]                     ; inptr
        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov     eax,edx                                 ; colctr
        alignx  16,7
.columnloop:

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]

        ; Interleaving a register with itself duplicates every byte.
        movq      mm1,mm0
        punpcklbw mm0,mm0
        punpckhbw mm1,mm1

        ; Same 16 bytes go to both output rows.
        movq    MMWORD [ebx+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [ebx+1*SIZEOF_MMWORD], mm1
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        ; Second (unrolled) 8-sample block.
        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]

        movq      mm3,mm2
        punpcklbw mm2,mm2
        punpckhbw mm3,mm3

        movq    MMWORD [ebx+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [ebx+3*SIZEOF_MMWORD], mm3
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
        add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
        jmp     short .columnloop
        alignx  16,7

.nextrow:
        pop     esi
        pop     edi

        ; One input row consumed, two output rows produced.
        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      short .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000734
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16