Blame - simd/jcqnts2f-64.asm - platform/external/libjpeg-turbo

blob: 51d63076e91e1d4e615a2091baba4cd5dbea49f9 [file] [log] [blame]

DRC	cdc8ac3	2009-06-25 20:38:31 +0000	[diff] [blame]	1	;
				2	; jcqnts2f.asm - sample data conversion and quantization (64-bit SSE & SSE2)
				3	;
				4	; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
				5	; Copyright 2009 D. R. Commander
				6	;
				7	; Based on
				8	; x86 SIMD extension for IJG JPEG library
				9	; Copyright (C) 1999-2006, MIYASAKA Masaru.
				10	; For conditions of distribution and use, see copyright notice in jsimdext.inc
				11	;
				12	; This file should be assembled with NASM (Netwide Assembler); it
				13	; cannot be assembled with Microsoft's MASM or any compatible
				14	; assembler (including Borland's Turbo Assembler).
				15	; NASM is available from http://nasm.sourceforge.net/ or
				16	; http://sourceforge.net/project/showfiles.php?group_id=6208
				17	;
				18	; [TAB8]
				19
				20	%include "jsimdext.inc"
				21	%include "jdct.inc"
				22
				23	; --------------------------------------------------------------------------
				24	SECTION SEG_TEXT
				25	BITS 64
				26	;
				27	; Load data into workspace, applying unsigned->signed conversion
				28	;
				29	; GLOBAL(void)
				30	; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
				31	; FAST_FLOAT * workspace);
				32	;
				33
				34	; r10 = JSAMPARRAY sample_data
				35	; r11 = JDIMENSION start_col
				36	; r12 = FAST_FLOAT * workspace
				37
				38	align 16
				39	global EXTN(jsimd_convsamp_float_sse2)
				40
				41	EXTN(jsimd_convsamp_float_sse2):
				42	push rbp
				43	mov rbp,rsp
				44	push rbx
				45	collect_args
				46
				47	pcmpeqw xmm7,xmm7
				48	psllw xmm7,7
				49	packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
				50
				51	mov rsi, r10
				52	mov rax, r11
				53	mov rdi, r12
				54	mov rcx, DCTSIZE/2
				55	.convloop:
				56	mov rbx, JSAMPROW [rsi+0SIZEOF_JSAMPROW] ; (JSAMPLE )
				57	mov rdx, JSAMPROW [rsi+1SIZEOF_JSAMPROW] ; (JSAMPLE )
				58
				59	movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
				60	movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
				61
				62	psubb xmm0,xmm7 ; xmm0=(01234567)
				63	psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
				64
				65	punpcklbw xmm0,xmm0 ; xmm0=(01234567)
				66	punpcklbw xmm1,xmm1 ; xmm1=(89ABCDEF)
				67
				68	punpcklwd xmm2,xmm0 ; xmm2=(*012*3)
				69	punpckhwd xmm0,xmm0 ; xmm0=(*456*7)
				70	punpcklwd xmm3,xmm1 ; xmm3=(*89A*B)
				71	punpckhwd xmm1,xmm1 ; xmm1=(*CDE*F)
				72
				73	psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
				74	psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
				75	cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
				76	cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
				77	psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
				78	psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
				79	cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
				80	cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
				81
				82	movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
				83	movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
				84	movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
				85	movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
				86
				87	add rsi, byte 2*SIZEOF_JSAMPROW
				88	add rdi, byte 2DCTSIZESIZEOF_FAST_FLOAT
				89	dec rcx
				90	jnz short .convloop
				91
				92	uncollect_args
				93	pop rbx
				94	pop rbp
				95	ret
				96
				97
				98	; --------------------------------------------------------------------------
				99	;
				100	; Quantize/descale the coefficients, and store into coef_block
				101	;
				102	; GLOBAL(void)
				103	; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
				104	; FAST_FLOAT * workspace);
				105	;
				106
				107	; r10 = JCOEFPTR coef_block
				108	; r11 = FAST_FLOAT * divisors
				109	; r12 = FAST_FLOAT * workspace
				110
				111	align 16
				112	global EXTN(jsimd_quantize_float_sse2)
				113
				114	EXTN(jsimd_quantize_float_sse2):
				115	push rbp
				116	mov rbp,rsp
				117	collect_args
				118
				119	mov rsi, r12
				120	mov rdx, r11
				121	mov rdi, r10
				122	mov rax, DCTSIZE2/16
				123	.quantloop:
				124	movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
				125	movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
				126	mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
				127	mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
				128	movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
				129	movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
				130	mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
				131	mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
				132
				133	cvtps2dq xmm0,xmm0
				134	cvtps2dq xmm1,xmm1
				135	cvtps2dq xmm2,xmm2
				136	cvtps2dq xmm3,xmm3
				137
				138	packssdw xmm0,xmm1
				139	packssdw xmm2,xmm3
				140
				141	movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
				142	movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
				143
				144	add rsi, byte 16*SIZEOF_FAST_FLOAT
				145	add rdx, byte 16*SIZEOF_FAST_FLOAT
				146	add rdi, byte 16*SIZEOF_JCOEF
				147	dec rax
				148	jnz short .quantloop
				149
				150	uncollect_args
				151	pop rbp
				152	ret