Blame - simd/i386/jdsample-mmx.asm - platform/external/libjpeg-turbo

blob: 1f810fabe1b47f054a26941189bd9db038dfc949 [file] [log] [blame]

Leon Scroggins III	3993b37	2018-07-16 10:43:45 -0400	[diff] [blame^]	1	;
				2	; jdsample.asm - upsampling (MMX)
				3	;
				4	; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
				5	; Copyright (C) 2016, D. R. Commander.
				6	;
				7	; Based on the x86 SIMD extension for IJG JPEG library
				8	; Copyright (C) 1999-2006, MIYASAKA Masaru.
				9	; For conditions of distribution and use, see copyright notice in jsimdext.inc
				10	;
				11	; This file should be assembled with NASM (Netwide Assembler); it
				12	; cannot be assembled with Microsoft's MASM or any compatible
				13	; assembler (including Borland's Turbo Assembler).
				14	; NASM is available from http://nasm.sourceforge.net/ or
				15	; http://sourceforge.net/project/showfiles.php?group_id=6208
				16	;
				17	; [TAB8]
				18
				19	%include "jsimdext.inc"
				20
				21	; --------------------------------------------------------------------------
				22	SECTION SEG_CONST ; constants read by the two "fancy" (triangle filter) upsamplers in this file
				23
				24	alignz 32
				25	GLOBAL_DATA(jconst_fancy_upsample_mmx)
				26
				27	EXTN(jconst_fancy_upsample_mmx):
				28
				29	PW_ONE times 4 dw 1 ; 4 x word 1: rounding term added for the even outputs of the h2v1 fancy filter
				30	PW_TWO times 4 dw 2 ; 4 x word 2: rounding term added for the odd outputs of the h2v1 fancy filter
				31	PW_THREE times 4 dw 3 ; 4 x word 3: weight applied to the nearer sample in both fancy filters
				32	PW_SEVEN times 4 dw 7 ; 4 x word 7: rounding term added for the odd outputs of the h2v2 fancy filter
				33	PW_EIGHT times 4 dw 8 ; 4 x word 8: rounding term added for the even outputs of the h2v2 fancy filter
				34
				35	alignz 32
				36
				37	; --------------------------------------------------------------------------
				38	SECTION SEG_TEXT
				39	BITS 32
				40	;
				41	; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
				42	;
				43	; The upsampling algorithm is linear interpolation between pixel centers,
				44	; also known as a "triangle filter". This is a good compromise between
				45	; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
				46	; of the way between input pixel centers.
				47	; Per input sample s[i]: out[2i] = (3*s[i] + s[i-1] + 1) >> 2 and out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2.
				48	; GLOBAL(void)
				49	; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
				50	; JDIMENSION downsampled_width,
				51	; JSAMPARRAY input_data,
				52	; JSAMPARRAY *output_data_ptr);
				53	; The first sample's left neighbour and the last sample's right neighbour are the sample itself.
				54
				55	%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
				56	%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
				57	%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
				58	%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
				59
				60	align 32
				61	GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
				62
				63	EXTN(jsimd_h2v1_fancy_upsample_mmx):
				64	push ebp ; conventional frame: the arguments start at ebp+8
				65	mov ebp, esp
				66	pushpic ebx ; pushpic/poppic/get_GOT/GOTOFF are jsimdext.inc macros (not shown here); presumably PIC-only ebx handling - confirm there
				67	; push ecx ; need not be preserved
				68	; push edx ; need not be preserved
				69	push esi
				70	push edi
				71
				72	get_GOT ebx ; get GOT address
				73
				74	mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
				75	test eax, eax
				76	jz near .return ; zero width: nothing to do (no MMX instruction has run yet, so emms is skipped)
				77
				78	mov ecx, INT [max_v_samp(ebp)] ; rowctr
				79	test ecx, ecx
				80	jz near .return ; zero rows: nothing to do
				81
				82	mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
				83	mov edi, POINTER [output_data_ptr(ebp)] ; edi = address of the caller's output row array pointer
				84	mov edi, JSAMPARRAY [edi] ; output_data
				85	alignx 16, 7
				86	.rowloop: ; one iteration per row; eax=width, ecx=rows left, esi/edi=current entries in the row pointer arrays
				87	push eax ; colctr
				88	push edi ; keep the position in output_data across the column loop
				89	push esi ; keep the position in input_data across the column loop
				90
				91	mov esi, JSAMPROW [esi] ; inptr
				92	mov edi, JSAMPROW [edi] ; outptr
				93
				94	test eax, SIZEOF_MMWORD-1 ; is the width a whole number of mmwords?
				95	jz short .skip ; yes: the last block is completely filled with real samples
				96	mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row
				97	mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample: in[width] = in[width-1] (writes one sample past the row's width)
				98	.skip:
				99	pxor mm0, mm0 ; mm0=(all 0's)
				100	pcmpeqb mm7, mm7 ; mm7=(all 1's)
				101	psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mask that keeps only the lowest byte
				102	pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] ; mm7=( 0 - - - - - - -): sample 0 acts as its own "-1" neighbour
				103
				104	add eax, byte SIZEOF_MMWORD-1 ; round colctr up ...
				105	and eax, byte -SIZEOF_MMWORD ; ... to a multiple of SIZEOF_MMWORD
				106	cmp eax, byte SIZEOF_MMWORD
				107	ja short .columnloop ; more than one block: the next block supplies the right neighbour
				108	alignx 16, 7
				109
				110	.columnloop_last: ; last block of the row: there is no following block to read
				111	pcmpeqb mm6, mm6 ; mm6=(all 1's)
				112	psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT ; mask that keeps only the highest byte
				113	pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] ; mm6=( - - - - - - - 7): byte 7 of this block stands in for sample "8"
				114	jmp short .upsample
				115	alignx 16, 7
				116
				117	.columnloop:
				118	movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] ; next block of input samples
				119	psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=( - - - - - - - 8): first sample of the next block in the highest byte
				120
				121	.upsample: ; on entry: mm7 = left neighbour (lowest byte), mm6 = right neighbour (highest byte), mm0 = 0
				122	movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
				123	movq mm2, mm1
				124	movq mm3, mm1 ; mm1=( 0 1 2 3 4 5 6 7)
				125	psllq mm2, BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
				126	psrlq mm3, BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
				127
				128	por mm2, mm7 ; mm2=(-1 0 1 2 3 4 5 6)
				129	por mm3, mm6 ; mm3=( 1 2 3 4 5 6 7 8)
				130
				131	movq mm7, mm1 ; carry sample 7 forward as the "-1" neighbour of the next block
				132	psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)
				133
				134	movq mm4, mm1 ; widen all three byte vectors to words (zero-extended via mm0)
				135	punpcklbw mm1, mm0 ; mm1=( 0 1 2 3)
				136	punpckhbw mm4, mm0 ; mm4=( 4 5 6 7)
				137	movq mm5, mm2
				138	punpcklbw mm2, mm0 ; mm2=(-1 0 1 2)
				139	punpckhbw mm5, mm0 ; mm5=( 3 4 5 6)
				140	movq mm6, mm3
				141	punpcklbw mm3, mm0 ; mm3=( 1 2 3 4)
				142	punpckhbw mm6, mm0 ; mm6=( 5 6 7 8)
				143
				144	pmullw mm1, [GOTOFF(ebx,PW_THREE)] ; mm1=3*( 0 1 2 3)
				145	pmullw mm4, [GOTOFF(ebx,PW_THREE)] ; mm4=3*( 4 5 6 7)
				146	paddw mm2, [GOTOFF(ebx,PW_ONE)] ; left neighbours + rounding term 1
				147	paddw mm5, [GOTOFF(ebx,PW_ONE)]
				148	paddw mm3, [GOTOFF(ebx,PW_TWO)] ; right neighbours + rounding term 2
				149	paddw mm6, [GOTOFF(ebx,PW_TWO)]
				150
				151	paddw mm2, mm1 ; 3*s[i] + s[i-1] + 1
				152	paddw mm5, mm4
				153	psrlw mm2, 2 ; mm2=OutLE=( 0 2 4 6)
				154	psrlw mm5, 2 ; mm5=OutHE=( 8 10 12 14)
				155	paddw mm3, mm1 ; 3*s[i] + s[i+1] + 2
				156	paddw mm6, mm4
				157	psrlw mm3, 2 ; mm3=OutLO=( 1 3 5 7)
				158	psrlw mm6, 2 ; mm6=OutHO=( 9 11 13 15)
				159
				160	psllw mm3, BYTE_BIT ; move the odd outputs into the high byte of each word
				161	psllw mm6, BYTE_BIT
				162	por mm2, mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
				163	por mm5, mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
				164
				165	movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 ; store two mmwords of output for one mmword of input
				166	movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
				167
				168	sub eax, byte SIZEOF_MMWORD ; one input block consumed
				169	add esi, byte 1*SIZEOF_MMWORD ; inptr
				170	add edi, byte 2*SIZEOF_MMWORD ; outptr
				171	cmp eax, byte SIZEOF_MMWORD
				172	ja near .columnloop ; at least two blocks remain
				173	test eax, eax
				174	jnz near .columnloop_last ; exactly one block remains
				175
				176	pop esi ; restore the row pointer array positions and the width
				177	pop edi
				178	pop eax
				179
				180	add esi, byte SIZEOF_JSAMPROW ; input_data
				181	add edi, byte SIZEOF_JSAMPROW ; output_data
				182	dec ecx ; rowctr
				183	jg near .rowloop ; loop while rows remain (signed compare)
				184
				185	emms ; empty MMX state
				186
				187	.return:
				188	pop edi
				189	pop esi
				190	; pop edx ; need not be preserved
				191	; pop ecx ; need not be preserved
				192	poppic ebx
				193	pop ebp
				194	ret
				195
				196	; --------------------------------------------------------------------------
				197	;
				198	; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
				199	; Again a triangle filter; see comments for h2v1 case, above.
				200	; Vertical pass (16-bit): Int0 = 3*row[0] + row[-1] for the upper output row, Int1 = 3*row[0] + row[+1] for the lower one.
				201	; GLOBAL(void)
				202	; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
				203	; JDIMENSION downsampled_width,
				204	; JSAMPARRAY input_data,
				205	; JSAMPARRAY *output_data_ptr);
				206	; Horizontal pass on Int0/Int1: even out = (3*Int[i] + Int[i-1] + 8) >> 4, odd out = (3*Int[i] + Int[i+1] + 7) >> 4.
				207
				208	%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
				209	%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
				210	%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
				211	%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
				212
				213	%define original_ebp ebp + 0 ; slot at the aligned ebp where the prologue stores the unaligned frame base
				214	%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
				215	%define WK_NUM 4 ; wk(0)/wk(1): left-neighbour columns, wk(2)/wk(3): right-neighbour columns (upper/lower row)
				216	%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
				217
				218	align 32
				219	GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
				220
				221	EXTN(jsimd_h2v2_fancy_upsample_mmx):
				222	push ebp
				223	mov eax, esp ; eax = frame base (esp right after the push, so the arguments start at eax+8); called "original ebp" below
				224	sub esp, byte 4 ; room for the saved frame base
				225	and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
				226	mov [esp], eax ; store the frame base in the aligned slot (original_ebp)
				227	mov ebp, esp ; ebp = aligned ebp
				228	lea esp, [wk(0)] ; esp = &wk(0): reserves WK_NUM mmwords below the aligned ebp
				229	pushpic eax ; make room for the GOT address (the gotptr slot); pushpic is a jsimdext.inc macro, not shown here
				230	push ebx
				231	; push ecx ; need not be preserved
				232	; push edx ; need not be preserved
				233	push esi
				234	push edi
				235
				236	get_GOT ebx ; get GOT address
				237	movpic POINTER [gotptr], ebx ; save GOT address
				238
				239	mov edx, eax ; edx = original ebp
				240	mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
				241	test eax, eax
				242	jz near .return ; zero width: nothing to do (no MMX instruction has run yet, so emms is skipped)
				243
				244	mov ecx, INT [max_v_samp(edx)] ; rowctr
				245	test ecx, ecx
				246	jz near .return ; zero rows: nothing to do
				247
				248	mov esi, JSAMPARRAY [input_data(edx)] ; input_data
				249	mov edi, POINTER [output_data_ptr(edx)] ; edi = address of the caller's output row array pointer
				250	mov edi, JSAMPARRAY [edi] ; output_data
				251	alignx 16, 7
				252	.rowloop: ; one iteration per input row; produces two output rows
				253	push eax ; colctr
				254	push ecx ; rowctr (ecx is reused as a row pointer below)
				255	push edi ; position in output_data
				256	push esi ; position in input_data
				257
				258	mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
				259	mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
				260	mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
				261	mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
				262	mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
				263
				264	test eax, SIZEOF_MMWORD-1 ; is the width a whole number of mmwords?
				265	jz short .skip ; yes: no padding sample needed
				266	push edx ; dl is used as a scratch byte, so park outptr0
				267	mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] ; row above: copy the last real sample ...
				268	mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl ; ... one position past the end
				269	mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] ; current row: same
				270	mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
				271	mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; row below: same
				272	mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
				273	pop edx ; edx = outptr0 again
				274	.skip:
				275	; -- process the first column block
				276
				277	movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
				278	movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
				279	movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
				280
				281	pushpic ebx ; park inptr0 while ebx is used for GOTOFF addressing (presumably PIC builds only - see jsimdext.inc)
				282	movpic ebx, POINTER [gotptr] ; load GOT address
				283
				284	pxor mm3, mm3 ; mm3=(all 0's)
				285	movq mm4, mm0
				286	punpcklbw mm0, mm3 ; mm0=row[ 0][0]( 0 1 2 3)
				287	punpckhbw mm4, mm3 ; mm4=row[ 0][0]( 4 5 6 7)
				288	movq mm5, mm1
				289	punpcklbw mm1, mm3 ; mm1=row[-1][0]( 0 1 2 3)
				290	punpckhbw mm5, mm3 ; mm5=row[-1][0]( 4 5 6 7)
				291	movq mm6, mm2
				292	punpcklbw mm2, mm3 ; mm2=row[+1][0]( 0 1 2 3)
				293	punpckhbw mm6, mm3 ; mm6=row[+1][0]( 4 5 6 7)
				294
				295	pmullw mm0, [GOTOFF(ebx,PW_THREE)] ; mm0=3*row[ 0][0]( 0 1 2 3)
				296	pmullw mm4, [GOTOFF(ebx,PW_THREE)] ; mm4=3*row[ 0][0]( 4 5 6 7)
				297
				298	pcmpeqb mm7, mm7 ; mm7=(all 1's)
				299	psrlq mm7, (SIZEOF_MMWORD-2)*BYTE_BIT ; mask that keeps only the lowest word
				300
				301	paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
				302	paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
				303	paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
				304	paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
				305
				306	movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
				307	movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
				308	movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 ; (the output rows double as scratch for the 16-bit intermediates)
				309	movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
				310
				311	pand mm1, mm7 ; mm1=( 0 - - -)
				312	pand mm2, mm7 ; mm2=( 0 - - -)
				313
				314	movq MMWORD [wk(0)], mm1 ; upper row: column 0 acts as its own "-1" neighbour
				315	movq MMWORD [wk(1)], mm2 ; lower row: same
				316
				317	poppic ebx ; ebx = inptr0 again
				318
				319	add eax, byte SIZEOF_MMWORD-1 ; round colctr up ...
				320	and eax, byte -SIZEOF_MMWORD ; ... to a multiple of SIZEOF_MMWORD
				321	cmp eax, byte SIZEOF_MMWORD
				322	ja short .columnloop ; more than one block: the next block supplies the right neighbour
				323	alignx 16, 7
				324
				325	.columnloop_last:
				326	; -- process the last column block
				327
				328	pushpic ebx ; park inptr0; matched by the poppic after the lower row is stored
				329	movpic ebx, POINTER [gotptr] ; load GOT address
				330
				331	pcmpeqb mm1, mm1 ; mm1=(all 1's)
				332	psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mask that keeps only the highest word
				333	movq mm2, mm1
				334
				335	pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
				336	pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
				337
				338	movq MMWORD [wk(2)], mm1 ; upper row: column 7 of this block stands in for column "8"
				339	movq MMWORD [wk(3)], mm2 ; lower row: same
				340
				341	jmp short .upsample
				342	alignx 16, 7
				343
				344	.columnloop:
				345	; -- process the next column block
				346
				347	movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
				348	movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
				349	movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
				350
				351	pushpic ebx ; park inptr0; matched by the poppic after the lower row is stored
				352	movpic ebx, POINTER [gotptr] ; load GOT address
				353
				354	pxor mm3, mm3 ; mm3=(all 0's)
				355	movq mm4, mm0
				356	punpcklbw mm0, mm3 ; mm0=row[ 0][1]( 0 1 2 3)
				357	punpckhbw mm4, mm3 ; mm4=row[ 0][1]( 4 5 6 7)
				358	movq mm5, mm1
				359	punpcklbw mm1, mm3 ; mm1=row[-1][1]( 0 1 2 3)
				360	punpckhbw mm5, mm3 ; mm5=row[-1][1]( 4 5 6 7)
				361	movq mm6, mm2
				362	punpcklbw mm2, mm3 ; mm2=row[+1][1]( 0 1 2 3)
				363	punpckhbw mm6, mm3 ; mm6=row[+1][1]( 4 5 6 7)
				364
				365	pmullw mm0, [GOTOFF(ebx,PW_THREE)] ; mm0=3*row[ 0][1]( 0 1 2 3)
				366	pmullw mm4, [GOTOFF(ebx,PW_THREE)] ; mm4=3*row[ 0][1]( 4 5 6 7)
				367
				368	paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
				369	paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
				370	paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
				371	paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
				372
				373	movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
				374	movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
				375	movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 ; (stored where the next block's output will go)
				376	movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
				377
				378	psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
				379	psllq mm2, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
				380
				381	movq MMWORD [wk(2)], mm1 ; upper row: column 0 of the next block is column "8" of the current one
				382	movq MMWORD [wk(3)], mm2 ; lower row: same
				383
				384	.upsample: ; both entry paths arrive with ebx = GOT address and inptr0 parked by pushpic
				385	; -- process the upper row
				386
				387	movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
				388	movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
				389
				390	movq mm0, mm7
				391	movq mm4, mm3
				392	psrlq mm0, 2*BYTE_BIT ; mm0=( 1 2 3 -)
				393	psllq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
				394	movq mm5, mm7
				395	movq mm6, mm3
				396	psrlq mm5, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
				397	psllq mm6, 2*BYTE_BIT ; mm6=( - 4 5 6)
				398
				399	por mm0, mm4 ; mm0=( 1 2 3 4)
				400	por mm5, mm6 ; mm5=( 3 4 5 6)
				401
				402	movq mm1, mm7
				403	movq mm2, mm3
				404	psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
				405	psrlq mm2, 2*BYTE_BIT ; mm2=( 5 6 7 -)
				406	movq mm4, mm3
				407	psrlq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
				408
				409	por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
				410	por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
				411
				412	movq MMWORD [wk(0)], mm4 ; carry column 7 forward as the "-1" neighbour of the next block
				413
				414	pmullw mm7, [GOTOFF(ebx,PW_THREE)] ; mm7=3*Int0L
				415	pmullw mm3, [GOTOFF(ebx,PW_THREE)] ; mm3=3*Int0H
				416	paddw mm1, [GOTOFF(ebx,PW_EIGHT)] ; left neighbours + rounding term 8
				417	paddw mm5, [GOTOFF(ebx,PW_EIGHT)]
				418	paddw mm0, [GOTOFF(ebx,PW_SEVEN)] ; right neighbours + rounding term 7
				419	paddw mm2, [GOTOFF(ebx,PW_SEVEN)]
				420
				421	paddw mm1, mm7 ; 3*Int0[i] + Int0[i-1] + 8
				422	paddw mm5, mm3
				423	psrlw mm1, 4 ; mm1=Out0LE=( 0 2 4 6)
				424	psrlw mm5, 4 ; mm5=Out0HE=( 8 10 12 14)
				425	paddw mm0, mm7 ; 3*Int0[i] + Int0[i+1] + 7
				426	paddw mm2, mm3
				427	psrlw mm0, 4 ; mm0=Out0LO=( 1 3 5 7)
				428	psrlw mm2, 4 ; mm2=Out0HO=( 9 11 13 15)
				429
				430	psllw mm0, BYTE_BIT ; move the odd outputs into the high byte of each word
				431	psllw mm2, BYTE_BIT
				432	por mm1, mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
				433	por mm5, mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
				434
				435	movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; overwrite the saved intermediates with the final upper-row samples
				436	movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
				437
				438	; -- process the lower row
				439
				440	movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
				441	movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
				442
				443	movq mm7, mm6
				444	movq mm3, mm4
				445	psrlq mm7, 2*BYTE_BIT ; mm7=( 1 2 3 -)
				446	psllq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
				447	movq mm0, mm6
				448	movq mm2, mm4
				449	psrlq mm0, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
				450	psllq mm2, 2*BYTE_BIT ; mm2=( - 4 5 6)
				451
				452	por mm7, mm3 ; mm7=( 1 2 3 4)
				453	por mm0, mm2 ; mm0=( 3 4 5 6)
				454
				455	movq mm1, mm6
				456	movq mm5, mm4
				457	psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
				458	psrlq mm5, 2*BYTE_BIT ; mm5=( 5 6 7 -)
				459	movq mm3, mm4
				460	psrlq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
				461
				462	por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
				463	por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
				464
				465	movq MMWORD [wk(1)], mm3 ; carry column 7 forward as the "-1" neighbour of the next block
				466
				467	pmullw mm6, [GOTOFF(ebx,PW_THREE)] ; mm6=3*Int1L
				468	pmullw mm4, [GOTOFF(ebx,PW_THREE)] ; mm4=3*Int1H
				469	paddw mm1, [GOTOFF(ebx,PW_EIGHT)] ; left neighbours + rounding term 8
				470	paddw mm0, [GOTOFF(ebx,PW_EIGHT)]
				471	paddw mm7, [GOTOFF(ebx,PW_SEVEN)] ; right neighbours + rounding term 7
				472	paddw mm5, [GOTOFF(ebx,PW_SEVEN)]
				473
				474	paddw mm1, mm6 ; 3*Int1[i] + Int1[i-1] + 8
				475	paddw mm0, mm4
				476	psrlw mm1, 4 ; mm1=Out1LE=( 0 2 4 6)
				477	psrlw mm0, 4 ; mm0=Out1HE=( 8 10 12 14)
				478	paddw mm7, mm6 ; 3*Int1[i] + Int1[i+1] + 7
				479	paddw mm5, mm4
				480	psrlw mm7, 4 ; mm7=Out1LO=( 1 3 5 7)
				481	psrlw mm5, 4 ; mm5=Out1HO=( 9 11 13 15)
				482
				483	psllw mm7, BYTE_BIT ; move the odd outputs into the high byte of each word
				484	psllw mm5, BYTE_BIT
				485	por mm1, mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
				486	por mm0, mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
				487
				488	movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 ; overwrite the saved intermediates with the final lower-row samples
				489	movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
				490
				491	poppic ebx ; ebx = inptr0 again
				492
				493	sub eax, byte SIZEOF_MMWORD ; one input block consumed
				494	add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
				495	add ebx, byte 1*SIZEOF_MMWORD ; inptr0
				496	add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
				497	add edx, byte 2*SIZEOF_MMWORD ; outptr0
				498	add edi, byte 2*SIZEOF_MMWORD ; outptr1
				499	cmp eax, byte SIZEOF_MMWORD
				500	ja near .columnloop ; at least two blocks remain
				501	test eax, eax
				502	jnz near .columnloop_last ; exactly one block remains
				503
				504	pop esi ; restore the row pointer array positions, rowctr and colctr
				505	pop edi
				506	pop ecx
				507	pop eax
				508
				509	add esi, byte 1*SIZEOF_JSAMPROW ; input_data
				510	add edi, byte 2*SIZEOF_JSAMPROW ; output_data
				511	sub ecx, byte 2 ; rowctr
				512	jg near .rowloop ; loop while rows remain (signed compare)
				513
				514	emms ; empty MMX state
				515
				516	.return:
				517	pop edi
				518	pop esi
				519	; pop edx ; need not be preserved
				520	; pop ecx ; need not be preserved
				521	pop ebx
				522	mov esp, ebp ; esp <- aligned ebp
				523	pop esp ; esp <- original ebp
				524	pop ebp ; restore the caller's ebp saved by the first push
				525	ret
				526
				527	; --------------------------------------------------------------------------
				528	;
				529	; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
				530	; It's still a box filter: every input sample is simply stored twice in a row.
				531	;
				532	; GLOBAL(void)
				533	; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
				534	; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
				535	; Output is stored in whole groups of 2*SIZEOF_MMWORD samples, so up to 2*SIZEOF_MMWORD-1 samples past output_width may be written per row.
				536
				537	%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
				538	%define output_width(b) (b) + 12 ; JDIMENSION output_width
				539	%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
				540	%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
				541
				542	align 32
				543	GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
				544
				545	EXTN(jsimd_h2v1_upsample_mmx):
				546	push ebp ; conventional frame: the arguments start at ebp+8
				547	mov ebp, esp
				548	; push ebx ; unused
				549	; push ecx ; need not be preserved
				550	; push edx ; need not be preserved
				551	push esi
				552	push edi
				553
				554	mov edx, JDIMENSION [output_width(ebp)] ; edx = output width in samples
				555	add edx, byte (2*SIZEOF_MMWORD)-1 ; round up ...
				556	and edx, byte -(2*SIZEOF_MMWORD) ; ... to a multiple of 2*SIZEOF_MMWORD; sets ZF if the result is 0
				557	jz short .return ; nothing to produce (no MMX instruction has run yet, so emms is skipped)
				558
				559	mov ecx, INT [max_v_samp(ebp)] ; rowctr
				560	test ecx, ecx
				561	jz short .return ; zero rows: nothing to do
				562
				563	mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
				564	mov edi, POINTER [output_data_ptr(ebp)] ; edi = address of the caller's output row array pointer
				565	mov edi, JSAMPARRAY [edi] ; output_data
				566	alignx 16, 7
				567	.rowloop: ; one iteration per row; edx = padded output width, ecx = rows left
				568	push edi ; keep the position in output_data across the column loop
				569	push esi ; keep the position in input_data across the column loop
				570
				571	mov esi, JSAMPROW [esi] ; inptr
				572	mov edi, JSAMPROW [edi] ; outptr
				573	mov eax, edx ; colctr
				574	alignx 16, 7
				575	.columnloop: ; unrolled twice: each half turns one input mmword into two output mmwords
				576
				577	movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; mm0 = one mmword of input samples
				578
				579	movq mm1, mm0
				580	punpcklbw mm0, mm0 ; interleave the low half with itself: each sample appears twice
				581	punpckhbw mm1, mm1 ; same for the high half
				582
				583	movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
				584	movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
				585
				586	sub eax, byte 2*SIZEOF_MMWORD ; two output mmwords produced
				587	jz short .nextrow ; row finished after the first half
				588
				589	movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; second half: next mmword of input samples
				590
				591	movq mm3, mm2
				592	punpcklbw mm2, mm2 ; duplicate each sample, as above
				593	punpckhbw mm3, mm3
				594
				595	movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
				596	movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
				597
				598	sub eax, byte 2*SIZEOF_MMWORD ; two more output mmwords produced
				599	jz short .nextrow ; row finished after the second half
				600
				601	add esi, byte 2*SIZEOF_MMWORD ; inptr
				602	add edi, byte 4*SIZEOF_MMWORD ; outptr
				603	jmp short .columnloop
				604	alignx 16, 7
				605
				606	.nextrow:
				607	pop esi ; restore the positions in the row pointer arrays
				608	pop edi
				609
				610	add esi, byte SIZEOF_JSAMPROW ; input_data
				611	add edi, byte SIZEOF_JSAMPROW ; output_data
				612	dec ecx ; rowctr
				613	jg short .rowloop ; loop while rows remain (signed compare)
				614
				615	emms ; empty MMX state
				616
				617	.return:
				618	pop edi
				619	pop esi
				620	; pop edx ; need not be preserved
				621	; pop ecx ; need not be preserved
				622	; pop ebx ; unused
				623	pop ebp
				624	ret
				625
				626	; --------------------------------------------------------------------------
				627	;
				628	; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
				629	; It's still a box filter: every input sample is stored twice in each of two output rows.
				630	;
				631	; GLOBAL(void)
				632	; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
				633	; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
				634	; Output is stored in whole groups of 2*SIZEOF_MMWORD samples, so up to 2*SIZEOF_MMWORD-1 samples past output_width may be written per row.
				635
				636	%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
				637	%define output_width(b) (b) + 12 ; JDIMENSION output_width
				638	%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
				639	%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
				640
				641	align 32
				642	GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
				643
				644	EXTN(jsimd_h2v2_upsample_mmx):
				645	push ebp ; conventional frame: the arguments start at ebp+8
				646	mov ebp, esp
				647	push ebx ; ebx is used below as the first output row pointer
				648	; push ecx ; need not be preserved
				649	; push edx ; need not be preserved
				650	push esi
				651	push edi
				652
				653	mov edx, JDIMENSION [output_width(ebp)] ; edx = output width in samples
				654	add edx, byte (2*SIZEOF_MMWORD)-1 ; round up ...
				655	and edx, byte -(2*SIZEOF_MMWORD) ; ... to a multiple of 2*SIZEOF_MMWORD; sets ZF if the result is 0
				656	jz near .return ; nothing to produce (no MMX instruction has run yet, so emms is skipped)
				657
				658	mov ecx, INT [max_v_samp(ebp)] ; rowctr
				659	test ecx, ecx
				660	jz short .return ; zero rows: nothing to do
				661
				662	mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
				663	mov edi, POINTER [output_data_ptr(ebp)] ; edi = address of the caller's output row array pointer
				664	mov edi, JSAMPARRAY [edi] ; output_data
				665	alignx 16, 7
				666	.rowloop: ; one iteration per input row; fills two output rows
				667	push edi ; keep the position in output_data across the column loop
				668	push esi ; keep the position in input_data across the column loop
				669
				670	mov esi, JSAMPROW [esi] ; inptr
				671	mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
				672	mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
				673	mov eax, edx ; colctr
				674	alignx 16, 7
				675	.columnloop: ; unrolled twice: each half turns one input mmword into two output mmwords per output row
				676
				677	movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; mm0 = one mmword of input samples
				678
				679	movq mm1, mm0
				680	punpcklbw mm0, mm0 ; interleave the low half with itself: each sample appears twice
				681	punpckhbw mm1, mm1 ; same for the high half
				682
				683	movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 ; identical data goes to both output rows
				684	movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
				685	movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
				686	movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
				687
				688	sub eax, byte 2*SIZEOF_MMWORD ; two output mmwords per row produced
				689	jz short .nextrow ; row pair finished after the first half
				690
				691	movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; second half: next mmword of input samples
				692
				693	movq mm3, mm2
				694	punpcklbw mm2, mm2 ; duplicate each sample, as above
				695	punpckhbw mm3, mm3
				696
				697	movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2 ; identical data goes to both output rows
				698	movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
				699	movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
				700	movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
				701
				702	sub eax, byte 2*SIZEOF_MMWORD ; two more output mmwords per row produced
				703	jz short .nextrow ; row pair finished after the second half
				704
				705	add esi, byte 2*SIZEOF_MMWORD ; inptr
				706	add ebx, byte 4*SIZEOF_MMWORD ; outptr0
				707	add edi, byte 4*SIZEOF_MMWORD ; outptr1
				708	jmp short .columnloop
				709	alignx 16, 7
				710
				711	.nextrow:
				712	pop esi ; restore the positions in the row pointer arrays
				713	pop edi
				714
				715	add esi, byte 1*SIZEOF_JSAMPROW ; input_data
				716	add edi, byte 2*SIZEOF_JSAMPROW ; output_data
				717	sub ecx, byte 2 ; rowctr
				718	jg short .rowloop ; loop while rows remain (signed compare)
				719
				720	emms ; empty MMX state
				721
				722	.return:
				723	pop edi
				724	pop esi
				725	; pop edx ; need not be preserved
				726	; pop ecx ; need not be preserved
				727	pop ebx
				728	pop ebp
				729	ret
				730
				731	; For some reason, the OS X linker does not honor the request to align the
				732	; segment unless we do this.
				733	align 32