;
; jcsample.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack through ebp (first argument at ebp+8).
;
; Steps:
;   1. output_cols = width_blocks << 3; return at once if that is zero.
;   2. If output_cols*2 > image_width, pad each of the max_v_samp_factor
;      input rows on the right by repeating its last sample
;      (row[image_width-1]) for (output_cols*2 - image_width) bytes.
;   3. For each of v_samp_factor rows, every output sample is
;          (in[2*i] + in[2*i+1] + bias) >> 1
;      where bias alternates 0, 1, 0, 1, ... across output columns.
;
; Registers: esi, edi and ebp are saved and restored; ebx is not used;
;            eax, ecx and edx are overwritten.
; MMX:       mm0-mm3, mm6 and mm7 are used; emms is executed after the
;            row loop.  The early exits to .return are taken before any
;            MMX instruction has run.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

    align       16
    global      EXTN(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to produce

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for the main loop
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = number of pad bytes per row
    jle         short .expand_end       ; input already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must walk upwards
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is about to hold the fill sample
    push        ecx                     ; rep stosb counts ecx down to zero

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first byte past the real samples
    mov         al, JSAMPLE [edi-1]     ; last real sample of this row

    rep stosb                           ; replicate it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    movd        mm7, edx
    pcmpeqw     mm6, mm6
    punpckldq   mm7, mm7                ; mm7={0, 1, 0, 1}
    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx                     ; the column loop consumes these three
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr
    alignx      16, 7
.columnloop:

    ; Two MMWORDs of input give one MMWORD of output.
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm2, mm0
    movq        mm3, mm1

    ; Within each 16-bit word the low byte is the even sample and the
    ; high byte is the odd sample; separate them into word lanes.
    pand        mm0, mm6                ; even samples
    psrlw       mm2, BYTE_BIT           ; odd samples
    pand        mm1, mm6
    psrlw       mm3, BYTE_BIT

    paddw       mm0, mm2                ; even + odd
    paddw       mm1, mm3
    paddw       mm0, mm7                ; + alternating bias
    paddw       mm1, mm7
    psrlw       mm0, 1                  ; / 2
    psrlw       mm1, 1

    packuswb    mm0, mm1                ; word lanes back to bytes

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    sub         ecx, byte SIZEOF_MMWORD    ; outcol
    jnz         short .columnloop

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         eax                        ; rowctr
    jg          short .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack through ebp (first argument at ebp+8).
;
; Steps:
;   1. output_cols = width_blocks << 3; return at once if that is zero.
;   2. If output_cols*2 > image_width, pad each of the max_v_samp_factor
;      input rows on the right by repeating its last sample
;      (row[image_width-1]) for (output_cols*2 - image_width) bytes.
;   3. For each of v_samp_factor output rows, two consecutive input rows
;      r0 and r1 are consumed and every output sample is
;          (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias) >> 2
;      where bias alternates 1, 2, 1, 2, ... across output columns.
;
; Registers: esi, edi and ebp are saved and restored; ebx is not used;
;            eax, ecx and edx are overwritten.
; MMX:       mm0-mm7 are used; emms is executed after the row loop.  The
;            early exits to .return are taken before any MMX instruction
;            has run.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

    align       16
    global      EXTN(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to produce

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for the main loop
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = number of pad bytes per row
    jle         short .expand_end       ; input already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must walk upwards
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is about to hold the fill sample
    push        ecx                     ; rep stosb counts ecx down to zero

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first byte past the real samples
    mov         al, JSAMPLE [edi-1]     ; last real sample of this row

    rep stosb                           ; replicate it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern
    movd        mm7, edx
    pcmpeqw     mm6, mm6
    punpckldq   mm7, mm7                ; mm7={1, 2, 1, 2}
    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx                     ; the column loop consumes these three
    push        edi
    push        esi

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         edi, JSAMPROW [edi]                    ; outptr
    alignx      16, 7
.columnloop:

    ; Two MMWORDs from each of the two input rows give one MMWORD of output.
    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]
    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]

    ; First MMWORD of each row: even + odd sample per word lane.
    movq        mm4, mm0
    movq        mm5, mm1
    pand        mm0, mm6                ; even samples, row 0
    psrlw       mm4, BYTE_BIT           ; odd samples, row 0
    pand        mm1, mm6                ; even samples, row 1
    psrlw       mm5, BYTE_BIT           ; odd samples, row 1
    paddw       mm0, mm4
    paddw       mm1, mm5

    ; Second MMWORD of each row, same treatment.
    movq        mm4, mm2
    movq        mm5, mm3
    pand        mm2, mm6
    psrlw       mm4, BYTE_BIT
    pand        mm3, mm6
    psrlw       mm5, BYTE_BIT
    paddw       mm2, mm4
    paddw       mm3, mm5

    paddw       mm0, mm1                ; row 0 + row 1
    paddw       mm2, mm3
    paddw       mm0, mm7                ; + alternating bias
    paddw       mm2, mm7
    psrlw       mm0, 2                  ; / 4
    psrlw       mm2, 2

    packuswb    mm0, mm2                ; word lanes back to bytes

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         edx, byte 2*SIZEOF_MMWORD  ; inptr0
    add         esi, byte 2*SIZEOF_MMWORD  ; inptr1
    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    sub         ecx, byte SIZEOF_MMWORD    ; outcol
    jnz         near .columnloop

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         eax                          ; rowctr
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       16