Blame - jcqntsse.asm - platform/external/libjpeg-turbo

blob: fe99a20b343130cbe88ab30eebed2d6eb43f7b26 [file] [log] [blame]

MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame^]	1	;
				2	; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
				3	;
				4	; x86 SIMD extension for IJG JPEG library
				5	; Copyright (C) 1999-2006, MIYASAKA Masaru.
				6	; For conditions of distribution and use, see copyright notice in jsimdext.inc
				7	;
				8	; This file should be assembled with NASM (Netwide Assembler),
				9	; can not be assembled with Microsoft's MASM or any compatible
				10	; assembler (including Borland's Turbo Assembler).
				11	; NASM is available from http://nasm.sourceforge.net/ or
				12	; http://sourceforge.net/project/showfiles.php?group_id=6208
				13	;
				14	; Last Modified : January 12, 2005
				15	;
				16	; [TAB8]
				17
				18	%include "jsimdext.inc"
				19	%include "jdct.inc"
				20
				21	%ifdef DCT_FLOAT_SUPPORTED
				22	%ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
				23
				24	; This module is specialized to the case DCTSIZE = 8.
				25	;
				26	%if DCTSIZE != 8
				27	%error "Sorry, this code only copes with 8x8 DCTs."
				28	%endif
				29
				30	; --------------------------------------------------------------------------
				31	SECTION SEG_TEXT
				32	BITS 32
				33	;
				34	; Load data into workspace, applying unsigned->signed conversion
				35	;
				36	; GLOBAL(void)
				37	; jpeg_convsamp_flt_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
				38	; FAST_FLOAT * workspace);
				39	;
				40
				;
				; jpeg_convsamp_flt_sse
				;
				; Reads DCTSIZE samples from each of DCTSIZE rows (starting at column
				; start_col), converts them from unsigned to signed by subtracting the
				; center value 0x80, and stores them as floats in the workspace.
				;
				; ABI:   32-bit code; arguments are read from the stack through ebp
				; In:    [ebp+8]  = sample_data (array of row pointers)
				;        [ebp+12] = start_col   (column offset applied to every row)
				;        [ebp+16] = workspace   (written with movaps, so it must be
				;                                16-byte aligned)
				; Saved: ebp, ebx, esi, edi
				; Clobb: eax, ecx, edx, mm0-mm7, xmm0-xmm7, flags
				; Note:  the MMX state is emptied (emms) before returning.
				;
				%define sample_data	ebp+8		; JSAMPARRAY sample_data
				%define start_col	ebp+12		; JDIMENSION start_col
				%define workspace	ebp+16		; FAST_FLOAT * workspace

					align	16
					global	EXTN(jpeg_convsamp_flt_sse)

				EXTN(jpeg_convsamp_flt_sse):
					push	ebp
					mov	ebp,esp
					push	ebx
				;	push	ecx		; need not be preserved
				;	push	edx		; need not be preserved
					push	esi
					push	edi

					; Build the byte constant 0x80 in every lane without a memory
					; load: 0xFFFF -> 0xFF80 (-128) -> saturating pack to 0x80.
					pcmpeqw	mm7,mm7
					psllw	mm7,7
					packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)

					mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
					mov	eax, JDIMENSION [start_col]
					mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
					mov	ecx, DCTSIZE/2			; two rows per iteration
					alignx	16,7
				.convloop:
					mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
					mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)

					movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
					movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

					psubb	mm0,mm7			; mm0=(01234567)
					psubb	mm1,mm7			; mm1=(89ABCDEF)

					; The unpacks below place each sample in the HIGH byte of a
					; word and then in the HIGH word of a dword.  Whatever the
					; destination register held before ends up in the low bits
					; and is discarded by the arithmetic right shift (psrad),
					; which also sign-extends the sample.  That is why the
					; destination registers need no initialization.
					punpcklbw mm2,mm0		; mm2=(0123)
					punpckhbw mm0,mm0		; mm0=(4567)
					punpcklbw mm3,mm1		; mm3=(89AB)
					punpckhbw mm1,mm1		; mm1=(CDEF)

					punpcklwd mm4,mm2		; mm4=(*0*1)
					punpckhwd mm2,mm2		; mm2=(*2*3)
					punpcklwd mm5,mm0		; mm5=(*4*5)
					punpckhwd mm0,mm0		; mm0=(*6*7)

					psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
					psrad	mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
					cvtpi2ps xmm0,mm4			; xmm0=(01**)
					cvtpi2ps xmm1,mm2			; xmm1=(23**)
					psrad	mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
					psrad	mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
					cvtpi2ps xmm2,mm5			; xmm2=(45**)
					cvtpi2ps xmm3,mm0			; xmm3=(67**)

					punpcklwd mm6,mm3		; mm6=(*8*9)
					punpckhwd mm3,mm3		; mm3=(*A*B)
					punpcklwd mm4,mm1		; mm4=(*C*D)
					punpckhwd mm1,mm1		; mm1=(*E*F)

					psrad	mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
					psrad	mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
					cvtpi2ps xmm4,mm6			; xmm4=(89**)
					cvtpi2ps xmm5,mm3			; xmm5=(AB**)
					psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
					psrad	mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
					cvtpi2ps xmm6,mm4			; xmm6=(CD**)
					cvtpi2ps xmm7,mm1			; xmm7=(EF**)

					; Merge the float pairs into full 4-float vectors.
					movlhps	xmm0,xmm1		; xmm0=(0123)
					movlhps	xmm2,xmm3		; xmm2=(4567)
					movlhps	xmm4,xmm5		; xmm4=(89AB)
					movlhps	xmm6,xmm7		; xmm6=(CDEF)

					movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
					movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
					movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
					movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

					; Advance by the two rows just processed.
					add	esi, byte 2*SIZEOF_JSAMPROW
					add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
					dec	ecx
					jnz	near .convloop

					emms		; empty MMX state

					pop	edi
					pop	esi
				;	pop	edx		; need not be preserved
				;	pop	ecx		; need not be preserved
					pop	ebx
					pop	ebp
					ret
				133
				134
				135	; --------------------------------------------------------------------------
				136	;
				137	; Quantize/descale the coefficients, and store into coef_block
				138	;
				139	; GLOBAL(void)
				140	; jpeg_quantize_flt_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
				141	; FAST_FLOAT * workspace);
				142	;
				143
				;
				; jpeg_quantize_flt_sse
				;
				; Multiplies every workspace value by the matching entry of the
				; divisors table, converts the products to integers, packs them to
				; signed 16-bit with saturation and stores them in coef_block.
				; NOTE(review): the table is multiplied, not divided, so it presumably
				; holds reciprocals -- confirm against the caller.
				;
				; ABI:   32-bit code; arguments are read from the stack through ebp
				; In:    [ebp+8]  = coef_block (output)
				;        [ebp+12] = divisors   (read with aligned SSE memory operands)
				;        [ebp+16] = workspace  (read with movaps)
				; Saved: ebp, esi, edi
				; Clobb: eax, edx, mm0-mm7, xmm0-xmm7, flags
				; Note:  the MMX state is emptied (emms) before returning.
				;
				%define coef_block	ebp+8		; JCOEFPTR coef_block
				%define divisors	ebp+12		; FAST_FLOAT * divisors
				%define workspace	ebp+16		; FAST_FLOAT * workspace

					align	16
					global	EXTN(jpeg_quantize_flt_sse)

				EXTN(jpeg_quantize_flt_sse):
					push	ebp
					mov	ebp,esp
					push	esi			; ebx/ecx are not used here and
					push	edi			; edx need not be preserved

					mov	eax, DCTSIZE2/16	; 16 coefficients per pass
					mov	edi, JCOEFPTR [coef_block]
					mov	edx, POINTER [divisors]
					mov	esi, POINTER [workspace]
					alignx	16,7
				.quantloop:
					; ---- load and scale all four vectors of this pass --------
					movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
					mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
					movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
					mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
					movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
					mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
					movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
					mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

					; ---- float -> int32 -> int16, one vector at a time -------
					; cvtps2pi converts only the low two floats, so the high
					; pair is first copied down into a scratch register.
					movhlps	xmm4,xmm0
					cvtps2pi mm0,xmm0
					cvtps2pi mm4,xmm4
					packssdw mm0,mm4

					movhlps	xmm5,xmm1
					cvtps2pi mm1,xmm1
					cvtps2pi mm5,xmm5
					packssdw mm1,mm5

					movhlps	xmm6,xmm2
					cvtps2pi mm2,xmm2
					cvtps2pi mm6,xmm6
					packssdw mm2,mm6

					movhlps	xmm7,xmm3
					cvtps2pi mm3,xmm3
					cvtps2pi mm7,xmm7
					packssdw mm3,mm7

					; ---- write the 16 packed coefficients --------------------
					movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
					movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
					movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
					movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

					; ---- step all three pointers past the 16 elements --------
					add	esi, byte 16*SIZEOF_FAST_FLOAT
					add	edx, byte 16*SIZEOF_FAST_FLOAT
					add	edi, byte 16*SIZEOF_JCOEF
					dec	eax
					jnz	short .quantloop

					emms		; empty MMX state

					pop	edi
					pop	esi
					pop	ebp
					ret
				216
				217	%endif ; JFDCT_FLT_SSE_MMX_SUPPORTED
				218	%endif ; DCT_FLOAT_SUPPORTED