Blame - simd/jdsamss2-64.asm - platform/external/libjpeg-turbo

blob: 1f7b1f5946741afe386270ffde3a952e582b5f45 [file] [log] [blame]

DRC	cdc8ac3	2009-06-25 20:38:31 +0000	[diff] [blame^]	1	;
				2	; jdsamss2.asm - upsampling (64-bit SSE2)
				3	;
				4	; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
				5	; Copyright 2009 D. R. Commander
				6	;
				7	; Based on
				8	; x86 SIMD extension for IJG JPEG library
				9	; Copyright (C) 1999-2006, MIYASAKA Masaru.
				10	; For conditions of distribution and use, see copyright notice in jsimdext.inc
				11	;
				12	; This file should be assembled with NASM (Netwide Assembler),
				13	; can not be assembled with Microsoft's MASM or any compatible
				14	; assembler (including Borland's Turbo Assembler).
				15	; NASM is available from http://nasm.sourceforge.net/ or
				16	; http://sourceforge.net/project/showfiles.php?group_id=6208
				17	;
				18	; [TAB8]
				19
				20	%include "jsimdext.inc"
				21
				22	; --------------------------------------------------------------------------
				SECTION SEG_CONST

				; Constant pool used by the fancy (triangle filter) upsamplers in this file.
				; Every entry is eight identical 16-bit words, i.e. exactly one XMMWORD, so
				; each label can be used directly as the memory operand of a packed-word
				; instruction (pmullw / paddw).
				; alignz comes from jsimdext.inc -- presumably "align, padding with zeros".

				        alignz  16
				        global  EXTN(jconst_fancy_upsample_sse2)

				EXTN(jconst_fancy_upsample_sse2):

				PW_ONE:         times 8 dw 1    ; h2v1 fancy: bias added before >>2 (even outputs)
				PW_TWO:         times 8 dw 2    ; h2v1 fancy: bias added before >>2 (odd outputs)
				PW_THREE:       times 8 dw 3    ; weight of the nearer sample (both fancy routines)
				PW_SEVEN:       times 8 dw 7    ; h2v2 fancy: bias added before >>4 (odd outputs)
				PW_EIGHT:       times 8 dw 8    ; h2v2 fancy: bias added before >>4 (even outputs)

				        alignz  16
				37
				38	; --------------------------------------------------------------------------
				SECTION SEG_TEXT
				BITS    64
				;
				; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
				;
				; The upsampling algorithm is linear interpolation between pixel centers,
				; also known as a "triangle filter".  This is a good compromise between
				; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
				; of the way between input pixel centers.
				;
				; For every input sample s[i] two output samples are produced:
				;     out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
				;     out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
				; The left neighbour of the first sample is the first sample itself and the
				; right neighbour of the last sample is the last sample itself.
				;
				; GLOBAL(void)
				; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
				;                                 JDIMENSION downsampled_width,
				;                                 JSAMPARRAY input_data,
				;                                 JSAMPARRAY * output_data_ptr);
				;
				; Register roles after collect_args (macro from jsimdext.inc):
				;   r10 = int max_v_samp_factor      (number of rows to process)
				;   r11 = JDIMENSION downsampled_width
				;   r12 = JSAMPARRAY input_data
				;   r13 = JSAMPARRAY * output_data_ptr
				;
				; Requirements visible from the code:
				;   * rows are accessed with movdqa, so every row pointer must be 16-byte
				;     aligned;
				;   * rows are processed in whole XMMWORD blocks, and when the width is not a
				;     multiple of SIZEOF_XMMWORD one byte past the last input sample is
				;     overwritten, so the row buffers must be padded accordingly.
				; Scratch registers: rax, rcx, rdx (dl), rsi, rdi, xmm0-xmm7.
				;

				        align   16
				        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)

				EXTN(jsimd_h2v1_fancy_upsample_sse2):
				        push    rbp
				        mov     rbp, rsp
				        collect_args

				        ; The width is a 32-bit C argument; the upper half of the 64-bit
				        ; argument register is not guaranteed to be zero, so take the
				        ; 32-bit view (writing eax zero-extends into rax).
				        mov     eax, r11d               ; colctr
				        test    rax, rax
				        jz      near .return

				        ; Same reasoning for the (signed) row count: sign-extend it.
				        movsxd  rcx, r10d               ; rowctr
				        test    rcx, rcx
				        jz      near .return

				        mov     rsi, r12                ; input_data
				        mov     rdi, r13
				        mov     rdi, JSAMPARRAY [rdi]   ; output_data
				.rowloop:
				        push    rax                     ; colctr
				        push    rdi
				        push    rsi

				        mov     rsi, JSAMPROW [rsi]     ; inptr
				        mov     rdi, JSAMPROW [rdi]     ; outptr

				        ; If the width is not a whole number of XMMWORDs, replicate the last
				        ; real sample one position to the right so that it has a valid right
				        ; neighbour inside the final block.
				        test    rax, SIZEOF_XMMWORD-1
				        jz      short .skip
				        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
				        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
				.skip:
				        pxor    xmm0, xmm0              ; xmm0=(all 0's)
				        pcmpeqb xmm7, xmm7
				        psrldq  xmm7, (SIZEOF_XMMWORD-1)        ; mask for the lowest byte
				        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm7 = s[0] as "s[-1]"

				        ; Round the column counter up to a whole number of XMMWORD blocks.
				        add     rax, byte SIZEOF_XMMWORD-1
				        and     rax, byte -SIZEOF_XMMWORD
				        cmp     rax, byte SIZEOF_XMMWORD
				        ja      short .columnloop

				.columnloop_last:
				        ; Final block: the right neighbour of sample 15 is sample 15 itself.
				        pcmpeqb xmm6, xmm6
				        pslldq  xmm6, (SIZEOF_XMMWORD-1)        ; mask for the highest byte
				        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
				        jmp     short .upsample

				.columnloop:
				        ; Not the final block: the right neighbour of sample 15 is the first
				        ; sample of the next block, moved into the highest byte position.
				        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
				        pslldq  xmm6, (SIZEOF_XMMWORD-1)

				.upsample:
				        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
				        movdqa  xmm2, xmm1
				        movdqa  xmm3, xmm1              ; xmm1=( 0  1  2 ... 13 14 15)
				        pslldq  xmm2, 1                 ; xmm2=(--  0  1 ... 12 13 14)
				        psrldq  xmm3, 1                 ; xmm3=( 1  2  3 ... 14 15 --)

				        por     xmm2, xmm7              ; xmm2=(-1  0  1 ... 12 13 14)
				        por     xmm3, xmm6              ; xmm3=( 1  2  3 ... 14 15 16)

				        ; Keep sample 15 as the left neighbour for the next block.
				        movdqa  xmm7, xmm1
				        psrldq  xmm7, (SIZEOF_XMMWORD-1)        ; xmm7=(15 -- -- ... -- -- --)

				        ; Widen bytes to words (xmm0 is zero).
				        movdqa    xmm4, xmm1
				        punpcklbw xmm1, xmm0            ; xmm1=( 0  1  2  3  4  5  6  7)
				        punpckhbw xmm4, xmm0            ; xmm4=( 8  9 10 11 12 13 14 15)
				        movdqa    xmm5, xmm2
				        punpcklbw xmm2, xmm0            ; xmm2=(-1  0  1  2  3  4  5  6)
				        punpckhbw xmm5, xmm0            ; xmm5=( 7  8  9 10 11 12 13 14)
				        movdqa    xmm6, xmm3
				        punpcklbw xmm3, xmm0            ; xmm3=( 1  2  3  4  5  6  7  8)
				        punpckhbw xmm6, xmm0            ; xmm6=( 9 10 11 12 13 14 15 16)

				        ; RIP-relative references keep the code position-independent.
				        pmullw  xmm1, [rel PW_THREE]
				        pmullw  xmm4, [rel PW_THREE]
				        paddw   xmm2, [rel PW_ONE]
				        paddw   xmm5, [rel PW_ONE]
				        paddw   xmm3, [rel PW_TWO]
				        paddw   xmm6, [rel PW_TWO]

				        paddw   xmm2, xmm1
				        paddw   xmm5, xmm4
				        psrlw   xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
				        psrlw   xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
				        paddw   xmm3, xmm1
				        paddw   xmm6, xmm4
				        psrlw   xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
				        psrlw   xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

				        ; Interleave: odd results go to the high byte of each word, even
				        ; results stay in the low byte.
				        psllw   xmm3, BYTE_BIT
				        psllw   xmm6, BYTE_BIT
				        por     xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
				        por     xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

				        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
				        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

				        sub     rax, byte SIZEOF_XMMWORD
				        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
				        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
				        cmp     rax, byte SIZEOF_XMMWORD
				        ja      near .columnloop        ; more than one block left
				        test    rax, rax
				        jnz     near .columnloop_last   ; exactly one block left

				        pop     rsi
				        pop     rdi
				        pop     rax

				        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
				        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
				        dec     rcx                     ; rowctr
				        jg      near .rowloop

				.return:
				        uncollect_args
				        pop     rbp
				        ret
				181
				; --------------------------------------------------------------------------
				;
				; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
				; Again a triangle filter; see comments for h2v1 case, above.
				;
				; Each input row produces two output rows.  First a vertical pass builds
				; 16-bit intermediates
				;     Int0[i] = 3*row[0][i] + row[-1][i]      (upper output row)
				;     Int1[i] = 3*row[0][i] + row[+1][i]      (lower output row)
				; which are parked temporarily in the output rows themselves.  A horizontal
				; pass then overwrites them with the final samples:
				;     out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
				;     out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
				; with the first/last intermediate replicated at the row ends.
				;
				; GLOBAL(void)
				; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
				;                                 JDIMENSION downsampled_width,
				;                                 JSAMPARRAY input_data,
				;                                 JSAMPARRAY * output_data_ptr);
				;
				; Register roles after collect_args (macro from jsimdext.inc):
				;   r10 = int max_v_samp_factor      (number of output rows)
				;   r11 = JDIMENSION downsampled_width
				;   r12 = JSAMPARRAY input_data
				;   r13 = JSAMPARRAY * output_data_ptr
				;
				; Requirements visible from the code:
				;   * input_data[-1] and input_data[+1] relative to each processed row are
				;     dereferenced, so the caller must supply the rows above and below;
				;   * all row pointers must be 16-byte aligned (movdqa) and the buffers
				;     padded to whole XMMWORD blocks; one byte past the last sample of each
				;     of the three input rows is overwritten when the width is not a
				;     multiple of SIZEOF_XMMWORD.
				; Scratch registers: rax, rcx, rdx, rsi, rdi, xmm0-xmm7 (rbx is saved).
				;

				%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
				%define WK_NUM  4

				        align   16
				        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)

				EXTN(jsimd_h2v2_fancy_upsample_sse2):
				        ; Build a 16-byte aligned frame so that wk[] can be accessed with
				        ; movdqa.  The pre-alignment rsp is stored at [aligned rbp].
				        push    rbp
				        mov     rax, rsp                ; rax = rsp after the push (-> saved rbp)
				        sub     rsp, byte 4             ; with an 8-byte aligned rsp the AND below
				        and     rsp, byte (-SIZEOF_XMMWORD)     ; leaves >= 8 bytes; align to 128 bits
				        mov     [rsp], rax
				        mov     rbp, rsp                ; rbp = aligned frame base
				        lea     rsp, [wk(0)]            ; reserve wk[0..WK_NUM-1]
				        push    rbx
				        collect_args

				        ; 32-bit C arguments: the upper halves of the argument registers are
				        ; not guaranteed to be clean, so extend explicitly.
				        mov     eax, r11d               ; colctr (zero-extended)
				        test    rax, rax
				        jz      near .return

				        movsxd  rcx, r10d               ; rowctr (sign-extended)
				        test    rcx, rcx
				        jz      near .return

				        mov     rsi, r12                ; input_data
				        mov     rdi, r13
				        mov     rdi, JSAMPARRAY [rdi]   ; output_data
				.rowloop:
				        push    rax                     ; colctr
				        push    rcx
				        push    rdi
				        push    rsi

				        mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
				        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
				        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
				        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
				        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1

				        ; If the width is not a whole number of XMMWORDs, replicate the last
				        ; real sample of each input row one position to the right.
				        test    rax, SIZEOF_XMMWORD-1
				        jz      short .skip
				        push    rdx                     ; dl is used as the byte temporary
				        mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
				        mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
				        mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
				        mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
				        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
				        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
				        pop     rdx
				.skip:
				        ; -- process the first column block (vertical pass)

				        movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]     ; xmm0=row[ 0][0]
				        movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]     ; xmm1=row[-1][0]
				        movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]     ; xmm2=row[+1][0]

				        pxor      xmm3, xmm3            ; xmm3=(all 0's)
				        movdqa    xmm4, xmm0
				        punpcklbw xmm0, xmm3            ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
				        punpckhbw xmm4, xmm3            ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
				        movdqa    xmm5, xmm1
				        punpcklbw xmm1, xmm3            ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
				        punpckhbw xmm5, xmm3            ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
				        movdqa    xmm6, xmm2
				        punpcklbw xmm2, xmm3            ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
				        punpckhbw xmm6, xmm3            ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

				        pmullw  xmm0, [rel PW_THREE]
				        pmullw  xmm4, [rel PW_THREE]

				        pcmpeqb xmm7, xmm7
				        psrldq  xmm7, (SIZEOF_XMMWORD-2)        ; mask for the lowest word

				        paddw   xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
				        paddw   xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
				        paddw   xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
				        paddw   xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

				        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1     ; temporarily save
				        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5     ; the intermediate data
				        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
				        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

				        ; Left edge: Int[0] doubles as Int[-1].
				        pand    xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
				        pand    xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

				        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0) = left neighbour, upper row
				        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1) = left neighbour, lower row

				        ; Round the column counter up to a whole number of XMMWORD blocks.
				        add     rax, byte SIZEOF_XMMWORD-1
				        and     rax, byte -SIZEOF_XMMWORD
				        cmp     rax, byte SIZEOF_XMMWORD
				        ja      short .columnloop

				.columnloop_last:
				        ; -- process the last column block
				        ; Right edge: Int[15] of this block doubles as Int[16].

				        pcmpeqb xmm1, xmm1
				        pslldq  xmm1, (SIZEOF_XMMWORD-2)        ; mask for the highest word
				        movdqa  xmm2, xmm1

				        ; Full 64-bit base registers are required here: a 32-bit base
				        ; (edx/edi) would truncate the output row pointers.
				        pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
				        pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

				        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
				        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

				        jmp     near .upsample

				.columnloop:
				        ; -- process the next column block (vertical pass, one block ahead)

				        movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]     ; xmm0=row[ 0][1]
				        movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]     ; xmm1=row[-1][1]
				        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]     ; xmm2=row[+1][1]

				        pxor      xmm3, xmm3            ; xmm3=(all 0's)
				        movdqa    xmm4, xmm0
				        punpcklbw xmm0, xmm3            ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
				        punpckhbw xmm4, xmm3            ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
				        movdqa    xmm5, xmm1
				        punpcklbw xmm1, xmm3            ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
				        punpckhbw xmm5, xmm3            ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
				        movdqa    xmm6, xmm2
				        punpcklbw xmm2, xmm3            ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
				        punpckhbw xmm6, xmm3            ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

				        pmullw  xmm0, [rel PW_THREE]
				        pmullw  xmm4, [rel PW_THREE]

				        paddw   xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
				        paddw   xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
				        paddw   xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
				        paddw   xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

				        movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1     ; temporarily save
				        movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5     ; the intermediate data
				        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
				        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

				        ; The first intermediate of the next block is the right neighbour
				        ; of the current block's last one.
				        pslldq  xmm1, (SIZEOF_XMMWORD-2)        ; xmm1=(-- -- -- -- -- -- --  0)
				        pslldq  xmm2, (SIZEOF_XMMWORD-2)        ; xmm2=(-- -- -- -- -- -- --  0)

				        movdqa  XMMWORD [wk(2)], xmm1   ; wk(2) = right neighbour, upper row
				        movdqa  XMMWORD [wk(3)], xmm2   ; wk(3) = right neighbour, lower row

				.upsample:
				        ; -- process the upper row (horizontal pass)

				        movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
				        movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

				        movdqa  xmm0, xmm7              ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
				        movdqa  xmm4, xmm3              ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
				        psrldq  xmm0, 2                 ; xmm0=( 1  2  3  4  5  6  7 --)
				        pslldq  xmm4, (SIZEOF_XMMWORD-2)        ; xmm4=(-- -- -- -- -- -- --  8)
				        movdqa  xmm5, xmm7
				        movdqa  xmm6, xmm3
				        psrldq  xmm5, (SIZEOF_XMMWORD-2)        ; xmm5=( 7 -- -- -- -- -- -- --)
				        pslldq  xmm6, 2                 ; xmm6=(--  8  9 10 11 12 13 14)

				        por     xmm0, xmm4              ; xmm0=( 1  2  3  4  5  6  7  8)
				        por     xmm5, xmm6              ; xmm5=( 7  8  9 10 11 12 13 14)

				        movdqa  xmm1, xmm7
				        movdqa  xmm2, xmm3
				        pslldq  xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
				        psrldq  xmm2, 2                 ; xmm2=( 9 10 11 12 13 14 15 --)
				        movdqa  xmm4, xmm3
				        psrldq  xmm4, (SIZEOF_XMMWORD-2)        ; xmm4=(15 -- -- -- -- -- -- --)

				        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
				        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

				        movdqa  XMMWORD [wk(0)], xmm4   ; left neighbour for the next block

				        pmullw  xmm7, [rel PW_THREE]
				        pmullw  xmm3, [rel PW_THREE]
				        paddw   xmm1, [rel PW_EIGHT]
				        paddw   xmm5, [rel PW_EIGHT]
				        paddw   xmm0, [rel PW_SEVEN]
				        paddw   xmm2, [rel PW_SEVEN]

				        paddw   xmm1, xmm7
				        paddw   xmm5, xmm3
				        psrlw   xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
				        psrlw   xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
				        paddw   xmm0, xmm7
				        paddw   xmm2, xmm3
				        psrlw   xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
				        psrlw   xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

				        psllw   xmm0, BYTE_BIT
				        psllw   xmm2, BYTE_BIT
				        por     xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
				        por     xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

				        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
				        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

				        ; -- process the lower row (horizontal pass)

				        movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
				        movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

				        movdqa  xmm7, xmm6              ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
				        movdqa  xmm3, xmm4              ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
				        psrldq  xmm7, 2                 ; xmm7=( 1  2  3  4  5  6  7 --)
				        pslldq  xmm3, (SIZEOF_XMMWORD-2)        ; xmm3=(-- -- -- -- -- -- --  8)
				        movdqa  xmm0, xmm6
				        movdqa  xmm2, xmm4
				        psrldq  xmm0, (SIZEOF_XMMWORD-2)        ; xmm0=( 7 -- -- -- -- -- -- --)
				        pslldq  xmm2, 2                 ; xmm2=(--  8  9 10 11 12 13 14)

				        por     xmm7, xmm3              ; xmm7=( 1  2  3  4  5  6  7  8)
				        por     xmm0, xmm2              ; xmm0=( 7  8  9 10 11 12 13 14)

				        movdqa  xmm1, xmm6
				        movdqa  xmm5, xmm4
				        pslldq  xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
				        psrldq  xmm5, 2                 ; xmm5=( 9 10 11 12 13 14 15 --)
				        movdqa  xmm3, xmm4
				        psrldq  xmm3, (SIZEOF_XMMWORD-2)        ; xmm3=(15 -- -- -- -- -- -- --)

				        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
				        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

				        movdqa  XMMWORD [wk(1)], xmm3   ; left neighbour for the next block

				        pmullw  xmm6, [rel PW_THREE]
				        pmullw  xmm4, [rel PW_THREE]
				        paddw   xmm1, [rel PW_EIGHT]
				        paddw   xmm0, [rel PW_EIGHT]
				        paddw   xmm7, [rel PW_SEVEN]
				        paddw   xmm5, [rel PW_SEVEN]

				        paddw   xmm1, xmm6
				        paddw   xmm0, xmm4
				        psrlw   xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
				        psrlw   xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
				        paddw   xmm7, xmm6
				        paddw   xmm5, xmm4
				        psrlw   xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
				        psrlw   xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

				        psllw   xmm7, BYTE_BIT
				        psllw   xmm5, BYTE_BIT
				        por     xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
				        por     xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

				        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
				        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

				        sub     rax, byte SIZEOF_XMMWORD
				        add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
				        add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
				        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
				        add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
				        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
				        cmp     rax, byte SIZEOF_XMMWORD
				        ja      near .columnloop        ; more than one block left
				        test    rax, rax
				        jnz     near .columnloop_last   ; exactly one block left

				        pop     rsi
				        pop     rdi
				        pop     rcx
				        pop     rax

				        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
				        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
				        sub     rcx, byte 2             ; rowctr (two output rows per pass)
				        jg      near .rowloop

				.return:
				        uncollect_args
				        pop     rbx
				        mov     rsp, rbp                ; rsp <- aligned frame base
				        pop     rsp                     ; rsp <- pre-alignment rsp saved there
				        pop     rbp
				        ret
				481
				; --------------------------------------------------------------------------
				;
				; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
				; It's still a box filter: every input sample is simply written twice.
				;
				; GLOBAL(void)
				; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
				;                           JDIMENSION output_width,
				;                           JSAMPARRAY input_data,
				;                           JSAMPARRAY * output_data_ptr);
				;
				; Register roles after collect_args (macro from jsimdext.inc):
				;   r10 = int max_v_samp_factor      (number of rows to process)
				;   r11 = JDIMENSION output_width
				;   r12 = JSAMPARRAY input_data
				;   r13 = JSAMPARRAY * output_data_ptr
				;
				; The column count is output_width rounded up to a multiple of
				; 2*SIZEOF_XMMWORD output bytes, and rows are accessed with movdqa, so the
				; row buffers must be 16-byte aligned and padded to that size.
				; Scratch registers: rax, rcx, rdx, rsi, rdi, xmm0-xmm3.
				;

				        align   16
				        global  EXTN(jsimd_h2v1_upsample_sse2)

				EXTN(jsimd_h2v1_upsample_sse2):
				        push    rbp
				        mov     rbp, rsp
				        collect_args

				        ; output_width is a 32-bit C argument; the upper half of the argument
				        ; register is not guaranteed to be zero, so take the 32-bit view
				        ; (writing edx zero-extends into rdx).
				        mov     edx, r11d
				        add     rdx, byte (2*SIZEOF_XMMWORD)-1
				        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; rdx = padded output width
				        jz      near .return

				        ; Same reasoning for the (signed) row count: sign-extend it.
				        movsxd  rcx, r10d               ; rowctr
				        test    rcx, rcx
				        jz      short .return

				        mov     rsi, r12                ; input_data
				        mov     rdi, r13
				        mov     rdi, JSAMPARRAY [rdi]   ; output_data
				.rowloop:
				        push    rdi
				        push    rsi

				        mov     rsi, JSAMPROW [rsi]     ; inptr
				        mov     rdi, JSAMPROW [rdi]     ; outptr
				        mov     rax, rdx                ; colctr (output bytes remaining)
				.columnloop:
				        ; The loop body is unrolled twice; each half turns one input
				        ; XMMWORD into two output XMMWORDs by interleaving it with itself.

				        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

				        movdqa    xmm1, xmm0
				        punpcklbw xmm0, xmm0            ; low  8 samples, each doubled
				        punpckhbw xmm1, xmm1            ; high 8 samples, each doubled

				        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
				        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

				        sub     rax, byte 2*SIZEOF_XMMWORD
				        jz      short .nextrow

				        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

				        movdqa    xmm3, xmm2
				        punpcklbw xmm2, xmm2
				        punpckhbw xmm3, xmm3

				        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
				        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

				        sub     rax, byte 2*SIZEOF_XMMWORD
				        jz      short .nextrow

				        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
				        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
				        jmp     short .columnloop

				.nextrow:
				        pop     rsi
				        pop     rdi

				        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
				        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
				        dec     rcx                     ; rowctr
				        jg      short .rowloop

				.return:
				        uncollect_args
				        pop     rbp
				        ret
				569
				; --------------------------------------------------------------------------
				;
				; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
				; It's still a box filter: every input sample is written twice horizontally
				; and the resulting row is stored into two consecutive output rows.
				;
				; GLOBAL(void)
				; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
				;                           JDIMENSION output_width,
				;                           JSAMPARRAY input_data,
				;                           JSAMPARRAY * output_data_ptr);
				;
				; Register roles after collect_args (macro from jsimdext.inc):
				;   r10 = int max_v_samp_factor      (number of output rows)
				;   r11 = JDIMENSION output_width
				;   r12 = JSAMPARRAY input_data
				;   r13 = JSAMPARRAY * output_data_ptr
				;
				; The column count is output_width rounded up to a multiple of
				; 2*SIZEOF_XMMWORD output bytes, and rows are accessed with movdqa, so the
				; row buffers must be 16-byte aligned and padded to that size.
				; Scratch registers: rax, rcx, rdx, rsi, rdi, xmm0-xmm3 (rbx is saved).
				;

				        align   16
				        global  EXTN(jsimd_h2v2_upsample_sse2)

				EXTN(jsimd_h2v2_upsample_sse2):
				        push    rbp
				        mov     rbp, rsp
				        push    rbx
				        collect_args

				        ; output_width is a 32-bit C argument; the upper half of the argument
				        ; register is not guaranteed to be zero, so take the 32-bit view
				        ; (writing edx zero-extends into rdx).
				        mov     edx, r11d
				        add     rdx, byte (2*SIZEOF_XMMWORD)-1
				        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; rdx = padded output width
				        jz      near .return

				        ; Same reasoning for the (signed) row count: sign-extend it.
				        movsxd  rcx, r10d               ; rowctr
				        test    rcx, rcx
				        jz      near .return

				        mov     rsi, r12                ; input_data
				        mov     rdi, r13
				        mov     rdi, JSAMPARRAY [rdi]   ; output_data
				.rowloop:
				        push    rdi
				        push    rsi

				        mov     rsi, JSAMPROW [rsi]                     ; inptr
				        mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
				        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
				        mov     rax, rdx                ; colctr (output bytes remaining)
				.columnloop:
				        ; The loop body is unrolled twice; each half turns one input
				        ; XMMWORD into two output XMMWORDs and stores them in both rows.

				        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

				        movdqa    xmm1, xmm0
				        punpcklbw xmm0, xmm0            ; low  8 samples, each doubled
				        punpckhbw xmm1, xmm1            ; high 8 samples, each doubled

				        movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
				        movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
				        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
				        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

				        sub     rax, byte 2*SIZEOF_XMMWORD
				        jz      short .nextrow

				        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

				        movdqa    xmm3, xmm2
				        punpcklbw xmm2, xmm2
				        punpckhbw xmm3, xmm3

				        movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
				        movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
				        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
				        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

				        sub     rax, byte 2*SIZEOF_XMMWORD
				        jz      short .nextrow

				        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
				        add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
				        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
				        jmp     short .columnloop

				.nextrow:
				        pop     rsi
				        pop     rdi

				        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
				        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
				        sub     rcx, byte 2             ; rowctr (two output rows per pass)
				        jg      near .rowloop

				.return:
				        uncollect_args
				        pop     rbx
				        pop     rbp
				        ret