/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies
   for 64-state (k=7) convolutional code
   Copyright 2003 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   int update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits);
   (returns 0 on success, -1 if vp is NULL)
*/
# SSE2 (128-bit integer SIMD) version
# Requires Pentium 4 or better
| 10 | |
| 11 | # These are offsets into struct v27, defined in viterbi27.h |
| 12 | .set DP,128 |
| 13 | .set OLDMETRICS,132 |
| 14 | .set NEWMETRICS,136 |
| 15 | .text |
| 16 | .global update_viterbi27_blk_sse2,Branchtab27_sse2 |
| 17 | .type update_viterbi27_blk_sse2,@function |
| 18 | .align 16 |
| 19 | |
| 20 | update_viterbi27_blk_sse2: |
| 21 | pushl %ebp |
| 22 | movl %esp,%ebp |
| 23 | pushl %esi |
| 24 | pushl %edi |
| 25 | pushl %edx |
| 26 | pushl %ebx |
| 27 | |
| 28 | movl 8(%ebp),%edx # edx = vp |
| 29 | testl %edx,%edx |
| 30 | jnz 0f |
| 31 | movl -1,%eax |
| 32 | jmp err |
| 33 | 0: movl OLDMETRICS(%edx),%esi # esi -> old metrics |
| 34 | movl NEWMETRICS(%edx),%edi # edi -> new metrics |
| 35 | movl DP(%edx),%edx # edx -> decisions |
| 36 | |
| 37 | 1: movl 16(%ebp),%eax # eax = nbits |
| 38 | decl %eax |
| 39 | jl 2f # passed zero, we're done |
| 40 | movl %eax,16(%ebp) |
| 41 | |
| 42 | xorl %eax,%eax |
| 43 | movl 12(%ebp),%ebx # ebx = syms |
| 44 | movb (%ebx),%al |
| 45 | movd %eax,%xmm6 # xmm6[0] = first symbol |
| 46 | movb 1(%ebx),%al |
| 47 | movd %eax,%xmm5 # xmm5[0] = second symbol |
| 48 | addl $2,%ebx |
| 49 | movl %ebx,12(%ebp) |
| 50 | |
| 51 | punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0] |
| 52 | punpcklbw %xmm5,%xmm5 |
| 53 | pshuflw $0,%xmm6,%xmm6 # copy low word to low 3 |
| 54 | pshuflw $0,%xmm5,%xmm5 |
| 55 | punpcklqdq %xmm6,%xmm6 # propagate to all 16 |
| 56 | punpcklqdq %xmm5,%xmm5 |
| 57 | # xmm6 now contains first symbol in each byte, xmm5 the second |
| 58 | |
| 59 | movdqa thirtyones,%xmm7 |
| 60 | |
| 61 | # each invocation of this macro does 16 butterflies in parallel |
| 62 | .MACRO butterfly GROUP |
| 63 | # compute branch metrics |
| 64 | movdqa Branchtab27_sse2+(16*\GROUP),%xmm4 |
| 65 | movdqa Branchtab27_sse2+32+(16*\GROUP),%xmm3 |
| 66 | pxor %xmm6,%xmm4 |
| 67 | pxor %xmm5,%xmm3 |
| 68 | |
| 69 | # compute 5-bit branch metric in xmm4 by adding the individual symbol metrics |
| 70 | # This is okay for this |
| 71 | # code because the worst-case metric spread (at high Eb/No) is only 120, |
| 72 | # well within the range of our unsigned 8-bit path metrics, and even within |
| 73 | # the range of signed 8-bit path metrics |
| 74 | pavgb %xmm3,%xmm4 |
| 75 | psrlw $3,%xmm4 |
| 76 | |
| 77 | pand %xmm7,%xmm4 |
| 78 | |
| 79 | movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, high bit = 0 |
| 80 | movdqa ((16*\GROUP)+32)(%esi),%xmm3 # Incoming path metric, high bit = 1 |
| 81 | movdqa %xmm0,%xmm2 |
| 82 | movdqa %xmm3,%xmm1 |
| 83 | paddusb %xmm4,%xmm0 # note use of saturating arithmetic |
| 84 | paddusb %xmm4,%xmm3 # this shouldn't be necessary, but why not? |
| 85 | |
| 86 | # negate branch metrics |
| 87 | pxor %xmm7,%xmm4 |
| 88 | paddusb %xmm4,%xmm1 |
| 89 | paddusb %xmm4,%xmm2 |
| 90 | |
| 91 | # Find survivors, leave in mm0,2 |
| 92 | pminub %xmm1,%xmm0 |
| 93 | pminub %xmm3,%xmm2 |
| 94 | # get decisions, leave in mm1,3 |
| 95 | pcmpeqb %xmm0,%xmm1 |
| 96 | pcmpeqb %xmm2,%xmm3 |
| 97 | |
| 98 | # interleave and store new branch metrics in mm0,2 |
| 99 | movdqa %xmm0,%xmm4 |
| 100 | punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics |
| 101 | punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics |
| 102 | movdqa %xmm0,(32*\GROUP+16)(%edi) |
| 103 | movdqa %xmm4,(32*\GROUP)(%edi) |
| 104 | |
| 105 | # interleave decisions & store |
| 106 | movdqa %xmm1,%xmm4 |
| 107 | punpckhbw %xmm3,%xmm1 |
| 108 | punpcklbw %xmm3,%xmm4 |
| 109 | # work around bug in gas due to Intel doc error |
| 110 | .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx |
| 111 | shll $16,%ebx |
| 112 | .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax |
| 113 | orl %eax,%ebx |
| 114 | movl %ebx,(4*\GROUP)(%edx) |
| 115 | .endm |
| 116 | |
| 117 | # invoke macro 2 times for a total of 32 butterflies |
| 118 | butterfly GROUP=0 |
| 119 | butterfly GROUP=1 |
| 120 | |
| 121 | addl $8,%edx # bump decision pointer |
| 122 | |
| 123 | # See if we have to normalize. This requires an explanation. We don't want |
| 124 | # our path metrics to exceed 255 on the *next* iteration. Since the |
| 125 | # largest branch metric is 30, that means we don't want any to exceed 225 |
| 126 | # on *this* iteration. Rather than look them all, we just pick an arbitrary one |
| 127 | # (the first) and see if it exceeds 225-120=105, where 120 is the experimentally- |
| 128 | # determined worst-case metric spread for this code and branch metrics in the range 0-30. |
| 129 | |
| 130 | # This is extremely conservative, and empirical testing at a variety of Eb/Nos might |
| 131 | # show that a higher threshold could be used without affecting BER performance |
| 132 | movl (%edi),%eax # extract first output metric |
| 133 | andl $255,%eax |
| 134 | cmp $105,%eax |
| 135 | jle done # No, no need to normalize |
| 136 | |
| 137 | # Normalize by finding smallest metric and subtracting it |
| 138 | # from all metrics. We can't just pick an arbitrary small constant because |
| 139 | # the minimum metric might be zero! |
| 140 | movdqa (%edi),%xmm0 |
| 141 | movdqa %xmm0,%xmm4 |
| 142 | movdqa 16(%edi),%xmm1 |
| 143 | pminub %xmm1,%xmm4 |
| 144 | movdqa 32(%edi),%xmm2 |
| 145 | pminub %xmm2,%xmm4 |
| 146 | movdqa 48(%edi),%xmm3 |
| 147 | pminub %xmm3,%xmm4 |
| 148 | |
| 149 | # crunch down to single lowest metric |
| 150 | movdqa %xmm4,%xmm5 |
| 151 | psrldq $8,%xmm5 # the count to psrldq is bytes, not bits! |
| 152 | pminub %xmm5,%xmm4 |
| 153 | movdqa %xmm4,%xmm5 |
| 154 | psrlq $32,%xmm5 |
| 155 | pminub %xmm5,%xmm4 |
| 156 | movdqa %xmm4,%xmm5 |
| 157 | psrlq $16,%xmm5 |
| 158 | pminub %xmm5,%xmm4 |
| 159 | movdqa %xmm4,%xmm5 |
| 160 | psrlq $8,%xmm5 |
| 161 | pminub %xmm5,%xmm4 # now in lowest byte of %xmm4 |
| 162 | |
| 163 | punpcklbw %xmm4,%xmm4 # lowest 2 bytes |
| 164 | pshuflw $0,%xmm4,%xmm4 # lowest 8 bytes |
| 165 | punpcklqdq %xmm4,%xmm4 # all 16 bytes |
| 166 | |
| 167 | # xmm4 now contains lowest metric in all 16 bytes |
| 168 | # subtract it from every output metric |
| 169 | psubusb %xmm4,%xmm0 |
| 170 | psubusb %xmm4,%xmm1 |
| 171 | psubusb %xmm4,%xmm2 |
| 172 | psubusb %xmm4,%xmm3 |
| 173 | movdqa %xmm0,(%edi) |
| 174 | movdqa %xmm1,16(%edi) |
| 175 | movdqa %xmm2,32(%edi) |
| 176 | movdqa %xmm3,48(%edi) |
| 177 | |
| 178 | done: |
| 179 | # swap metrics |
| 180 | movl %esi,%eax |
| 181 | movl %edi,%esi |
| 182 | movl %eax,%edi |
| 183 | jmp 1b |
| 184 | |
| 185 | 2: movl 8(%ebp),%ebx # ebx = vp |
| 186 | # stash metric pointers |
| 187 | movl %esi,OLDMETRICS(%ebx) |
| 188 | movl %edi,NEWMETRICS(%ebx) |
| 189 | movl %edx,DP(%ebx) # stash incremented value of vp->dp |
| 190 | xorl %eax,%eax |
| 191 | err: popl %ebx |
| 192 | popl %edx |
| 193 | popl %edi |
| 194 | popl %esi |
| 195 | popl %ebp |
| 196 | ret |
| 197 | |
| 198 | .data |
| 199 | .align 16 |
| 200 | |
| 201 | thirtyones: |
| 202 | .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 |