Blame - simd/jcqntmmx.asm - platform/external/libjpeg-turbo

blob: 08b08b79e2165277e130c89116534fef1e99346f [file] [log] [blame]

MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	1	;
				2	; jcqntmmx.asm - sample data conversion and quantization (MMX)
				3	;
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	4	; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
				5	;
				6	; Based on
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	7	; x86 SIMD extension for IJG JPEG library
				8	; Copyright (C) 1999-2006, MIYASAKA Masaru.
				9	; For conditions of distribution and use, see copyright notice in jsimdext.inc
				10	;
				11	; This file should be assembled with NASM (Netwide Assembler),
				12	; can not be assembled with Microsoft's MASM or any compatible
				13	; assembler (including Borland's Turbo Assembler).
				14	; NASM is available from http://nasm.sourceforge.net/ or
				15	; http://sourceforge.net/project/showfiles.php?group_id=6208
				16	;
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	17	; [TAB8]
				18
Pierre Ossman	3a65ef4	2009-03-16 13:34:18 +0000	[diff] [blame]	19	%include "jsimdext.inc"
				20	%include "jdct.inc"
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	21
				22	; --------------------------------------------------------------------------
				23	SECTION SEG_TEXT
				24	BITS 32
				25	;
				26	; Load data into workspace, applying unsigned->signed conversion
				27	;
				28	; GLOBAL(void)
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	29	; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
				30	; DCTELEM * workspace);
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	31	;
				32
				33	%define sample_data ebp+8 ; JSAMPARRAY sample_data
				34	%define start_col ebp+12 ; JDIMENSION start_col
				35	%define workspace ebp+16 ; DCTELEM * workspace
				36
				37	align 16
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	38	global EXTN(jsimd_convsamp_mmx)
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	39
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	40	EXTN(jsimd_convsamp_mmx): ; arguments are read from the stack through ebp (see the %defines above)
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	41	push ebp
				42	mov ebp,esp ; ebp = frame base; arguments live at [ebp+8], [ebp+12], [ebp+16]
				43	push ebx
				44	; push ecx ; need not be preserved
				45	; push edx ; need not be preserved
				46	push esi
				47	push edi
				48
				49	pxor mm6,mm6 ; mm6=(all 0's)
				50	pcmpeqw mm7,mm7 ; mm7=(all 1's)
				51	psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}, i.e. -128 in every word
				52
				53	mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
				54	mov eax, JDIMENSION [start_col]
				55	mov edi, POINTER [workspace] ; (DCTELEM *)
				56	mov ecx, DCTSIZE/4 ; loop counter; every pass converts 4 sample rows
				57	alignx 16,7
				58	.convloop:
				59	mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
				60	mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
				61
				62	movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)
				63	movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)
				64
				65	mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
				66	mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
				67
				68	movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)
				69	movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)
				70
				71	movq mm4,mm0
				72	punpcklbw mm0,mm6 ; mm0=(0123)
				73	punpckhbw mm4,mm6 ; mm4=(4567)
				74	movq mm5,mm1
				75	punpcklbw mm1,mm6 ; mm1=(89AB)
				76	punpckhbw mm5,mm6 ; mm5=(CDEF)
				77
				78	paddw mm0,mm7 ; add -128 to every word: unsigned sample -> signed value
				79	paddw mm4,mm7
				80	paddw mm1,mm7
				81	paddw mm5,mm7
				82
				83	movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
				84	movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
				85	movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
				86	movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
				87
				88	movq mm0,mm2 ; mm0/mm4 are free again after the stores above
				89	punpcklbw mm2,mm6 ; mm2=(GHIJ)
				90	punpckhbw mm0,mm6 ; mm0=(KLMN)
				91	movq mm4,mm3
				92	punpcklbw mm3,mm6 ; mm3=(OPQR)
				93	punpckhbw mm4,mm6 ; mm4=(STUV)
				94
				95	paddw mm2,mm7 ; same -128 bias for rows 2 and 3
				96	paddw mm0,mm7
				97	paddw mm3,mm7
				98	paddw mm4,mm7
				99
				100	movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
				101	movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
				102	movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
				103	movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
				104
				105	add esi, byte 4*SIZEOF_JSAMPROW ; step past the 4 row pointers just consumed
				106	add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; step the workspace pointer by 4*DCTSIZE elements
				107	dec ecx
				108	jnz short .convloop
				109
				110	emms ; empty MMX state
				111
				112	pop edi
				113	pop esi
				114	; pop edx ; need not be preserved
				115	; pop ecx ; need not be preserved
				116	pop ebx
				117	pop ebp
				118	ret
				119
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	120	; --------------------------------------------------------------------------
				121	;
				122	; Quantize/descale the coefficients, and store into coef_block
				123	;
				124	; This implementation is based on an algorithm described in
				125	; "How to optimize for the Pentium family of microprocessors"
				126	; (http://www.agner.org/assem/).
				127	;
				128	; GLOBAL(void)
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	129	; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
				130	; DCTELEM * workspace);
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	131	;
				132
				133	%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) ; divisors table, rows starting at DCTSIZE*0
				134	%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) ; divisors table, rows starting at DCTSIZE*1
				135	%define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) ; divisors table, rows starting at DCTSIZE*2
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	136	%define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM) ; rows starting at DCTSIZE*3; not referenced by the routine below
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	137
				138	%define coef_block ebp+8 ; JCOEFPTR coef_block
				139	%define divisors ebp+12 ; DCTELEM * divisors
				140	%define workspace ebp+16 ; DCTELEM * workspace
				141
				142	align 16
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	143	global EXTN(jsimd_quantize_mmx)
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	144
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	145	EXTN(jsimd_quantize_mmx): ; arguments are read from the stack through ebp (see the %defines above)
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	146	push ebp
				147	mov ebp,esp
				148	; push ebx ; unused
				149	; push ecx ; unused
				150	; push edx ; need not be preserved
				151	push esi
				152	push edi
				153
				154	mov esi, POINTER [workspace] ; esi = input coefficients
				155	mov edx, POINTER [divisors] ; edx = base for RECIPROCAL/CORRECTION/SCALE lookups
				156	mov edi, JCOEFPTR [coef_block] ; edi = output coefficients
				157	mov ah, 2 ; outer loop counter
				158	alignx 16,7
				159	.quantloop1:
				160	mov al, DCTSIZE2/8/2 ; inner loop counter; 2 * (DCTSIZE2/8/2) passes of 8 elements in total
				161	alignx 16,7
				162	.quantloop2:
				163	movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] ; load two MMWORDs of input words
				164	movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	165
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	166	movq mm0,mm2 ; mm0/mm1 = working values; mm2/mm3 become the sign masks
				167	movq mm1,mm3
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	168
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	169	psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise
				170	psraw mm3,(WORD_BIT-1) ; (mm2/mm3 are kept untouched until the sign is restored below)
				171
				172	pxor mm0,mm2 ; val = -val where val < 0: (val ^ mask) - mask
				173	pxor mm1,mm3
				174	psubw mm0,mm2
				175	psubw mm1,mm3
				176
				177	;
				178	; MMX is an annoyingly crappy instruction set. It has two
				179	; misfeatures that are causing problems here:
				180	;
				181	; - All multiplications are signed.
				182	;
				183	; - The second operand for the shifts is not treated as packed.
				184	;
				185	;
				186	; We work around the first problem by implementing this algorithm:
				187	;
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	188	; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
				189	; {
				190	; enum { SHORT_BIT = 16 };
				191	; signed short sx = (signed short) x;
				192	; signed short sy = (signed short) y;
				193	; signed long sz;
				194	;
				195	; sz = (long) sx * (long) sy; /* signed multiply */
				196	;
				197	; if (sx < 0) sz += (long) sy << SHORT_BIT;
				198	; if (sy < 0) sz += (long) sx << SHORT_BIT;
				199	;
				200	; return (unsigned long) sz;
				201	; }
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	202	;
				203	; (note that a negative sx adds _sy_ and vice versa)
				204	;
				205	; For the second problem, we replace the shift by a multiplication.
				206	; Unfortunately that means we have to deal with the signed issue again.
				207	;
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	208
				209	paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
				210	paddw mm1, MMWORD [CORRECTION(0,1,edx)]
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	211
				212	movq mm4,mm0 ; store current value for later
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	213	movq mm5,mm1
				214	pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal (signed multiply, high word kept)
				215	pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	216	paddw mm0,mm4 ; reciprocal is always negative (MSB=1),
				217	paddw mm1,mm5 ; so we always need to add the initial value
				218	; (input value is never negative as we
				219	; inverted it at the start of this routine)
				220
				221	; here it gets a bit tricky as both scale
				222	; and mm0/mm1 can be negative
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	223	movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
				224	movq mm7, MMWORD [SCALE(0,1,edx)]
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	225	movq mm4,mm0 ; keep the pre-scale value for both sign corrections
				226	movq mm5,mm1
				227	pmulhw mm0,mm6 ; signed high-word product; corrected to unsigned below
				228	pmulhw mm1,mm7
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	229
				230	psraw mm6,(WORD_BIT-1) ; determine if scale is negative
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	231	psraw mm7,(WORD_BIT-1)
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	232
				233	pand mm6,mm4 ; and add input if it is
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	234	pand mm7,mm5
				235	paddw mm0,mm6
				236	paddw mm1,mm7
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	237
				238	psraw mm4,(WORD_BIT-1) ; then check if negative input
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	239	psraw mm5,(WORD_BIT-1)
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	240
				241	pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	242	pand mm5, MMWORD [SCALE(0,1,edx)]
				243	paddw mm0,mm4
				244	paddw mm1,mm5
				245
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	246	pxor mm0,mm2 ; restore the input's sign: negate again where the mask in mm2/mm3 is -1
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	247	pxor mm1,mm3
				248	psubw mm0,mm2
				249	psubw mm1,mm3
Pierre Ossman	5eb84ff	2009-03-09 13:25:30 +0000	[diff] [blame]	250
MIYASAKA Masaru	a2e6a9d	2006-02-04 00:00:00 +0000	[diff] [blame]	251	movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 ; store the results to coef_block
				252	movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
				253
				254	add esi, byte 8*SIZEOF_DCTELEM ; advance input, divisors and output by 8 elements each
				255	add edx, byte 8*SIZEOF_DCTELEM
				256	add edi, byte 8*SIZEOF_JCOEF
				257	dec al
				258	jnz near .quantloop2
				259	dec ah
				260	jnz near .quantloop1 ; to avoid branch misprediction
				261
				262	emms ; empty MMX state
				263
				264	pop edi
				265	pop esi
				266	; pop edx ; need not be preserved
				267	; pop ecx ; unused
				268	; pop ebx ; unused
				269	pop ebp
				270	ret
				271
DRC	132b5fd	2009-10-08 09:04:56 +0000	[diff] [blame^]	272	; For some reason, the OS X linker does not honor the request to align the
				273	; segment unless we do this.
				274	align 16