blob: 0fa17425f3af021ab311d9fcd16f401320a19929 [file] [log] [blame]
/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies
   for 256-state (k=9) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   void update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ;
*/

# SSE2 (128-bit integer SIMD) version
# Requires Pentium 4 or better

# Byte offsets of the struct v29 fields used by this file.  struct v29 is
# defined in viterbi29.h; these values must be kept in sync with that header.
	.set	DP,512			# vp->dp: pointer to the next decision word to write
	.set	OLDMETRICS,516		# pointer to the old (input) path metrics
	.set	NEWMETRICS,520		# pointer to the new (output) path metrics

#-----------------------------------------------------------------------
# update_viterbi29_blk_sse2(struct v29 *vp, unsigned char *syms, int nbits)
#
# Syntax: GNU as, AT&T operand order.  Target: 32-bit x86 with SSE2.
# Arguments are read from the stack relative to the frame pointer:
#	vp    at  8(%ebp)
#	syms  at 12(%ebp)
#	nbits at 16(%ebp)
#
# For each of nbits bits, two symbol bytes are consumed from syms, all
# 128 add-compare-select butterflies are run from the old metric array
# into the new one, 32 bytes of decision bits are appended at vp->dp,
# and the two metric pointers are exchanged.  If the first new metric
# exceeds 50, the smallest new metric is subtracted from every metric.
#
# On exit OLDMETRICS, NEWMETRICS and DP in *vp are updated.
# eax = 0 on normal exit, -1 if vp is NULL.  (The prototype quoted in
# the file header declares the function void.)
#
# The stack slots holding syms and nbits are used as loop variables and
# are overwritten.  ebp, esi, edi, edx and ebx are saved and restored;
# eax and xmm0-xmm7 are clobbered.
#
# movdqa is used on the metric arrays, on Branchtab29_sse2 and on
# thirtyones, so all of them must be 16-byte aligned.
#-----------------------------------------------------------------------

	.text

# butterfly GROUP
#
# Performs the 16 butterflies of group GROUP (0-7) in parallel.
# Expects:	esi -> old metrics, edi -> new metrics, edx -> decisions,
#		xmm6 = first symbol in every byte,
#		xmm5 = second symbol in every byte,
#		xmm7 = 31 in every byte.
# Writes:	32 new metrics at (32*GROUP)(%edi) and one 32-bit decision
#		word at (4*GROUP)(%edx).
# Clobbers:	eax, ebx, xmm0-xmm4, flags.
	.macro butterfly GROUP
	# Branch metric: XOR each symbol with its table entry, average the
	# two results, then reduce to five bits.  psrlw shifts 16-bit words,
	# so the mask also removes bits shifted in from the neighbouring byte.
	movdqa	Branchtab29_sse2+(16*\GROUP),%xmm4
	movdqa	Branchtab29_sse2+128+(16*\GROUP),%xmm3
	pxor	%xmm6,%xmm4
	pxor	%xmm5,%xmm3
	pavgb	%xmm3,%xmm4
	psrlw	$3,%xmm4
	pand	%xmm7,%xmm4		# xmm4 = branch metrics, each 0..31

	movdqa	(16*\GROUP)(%esi),%xmm0		# incoming path metric, high bit = 0
	movdqa	((16*\GROUP)+128)(%esi),%xmm3	# incoming path metric, high bit = 1
	movdqa	%xmm0,%xmm2
	movdqa	%xmm3,%xmm1
	paddusb	%xmm4,%xmm0		# saturating adds keep metrics within a byte
	paddusb	%xmm4,%xmm3

	# invert branch metrics (XOR with 31 gives 31 - metric)
	pxor	%xmm7,%xmm4

	paddusb	%xmm4,%xmm1
	paddusb	%xmm4,%xmm2

	# find survivors, leave in xmm0 and xmm2
	pminub	%xmm1,%xmm0
	pminub	%xmm3,%xmm2
	# decisions, left in xmm1 and xmm3: a byte is 0xff where the candidate
	# coming from the high-bit = 1 metric equals the survivor
	pcmpeqb	%xmm0,%xmm1
	pcmpeqb	%xmm2,%xmm3

	# interleave the two survivor vectors and store the new metrics
	movdqa	%xmm0,%xmm4
	punpckhbw %xmm2,%xmm0		# interleave second 16 new metrics
	punpcklbw %xmm2,%xmm4		# interleave first 16 new metrics
	movdqa	%xmm0,(32*\GROUP+16)(%edi)
	movdqa	%xmm4,(32*\GROUP)(%edi)

	# interleave decisions, pack one bit per byte, and store
	movdqa	%xmm1,%xmm4
	punpckhbw %xmm3,%xmm1
	punpcklbw %xmm3,%xmm4
	# pmovmskb is hand-encoded to work around a bug in gas caused by an
	# Intel documentation error
	.byte	0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
	shll	$16,%ebx
	.byte	0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
	orl	%eax,%ebx
	movl	%ebx,(4*\GROUP)(%edx)
	.endm

	.global	update_viterbi29_blk_sse2,Branchtab29_sse2
	.type	update_viterbi29_blk_sse2,@function
	.align	16

update_viterbi29_blk_sse2:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	pushl	%edx
	pushl	%ebx

	movl	8(%ebp),%edx		# edx = vp
	testl	%edx,%edx
	jnz	0f
	movl	$-1,%eax		# vp == NULL: return -1 (immediate, not a load)
	jmp	err

0:	movl	OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl	NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl	DP(%edx),%edx		# edx -> decisions
	movdqa	thirtyones,%xmm7	# xmm7 is never written below, so load it once

	# ---- per-bit loop ----
1:	movl	16(%ebp),%eax		# eax = nbits
	decl	%eax
	jl	2f			# passed zero, we're done
	movl	%eax,16(%ebp)

	movl	12(%ebp),%ebx		# ebx = syms
	movzbl	(%ebx),%eax
	movd	%eax,%xmm6		# low byte of xmm6 = first symbol
	movzbl	1(%ebx),%eax
	movd	%eax,%xmm5		# low byte of xmm5 = second symbol
	addl	$2,%ebx
	movl	%ebx,12(%ebp)		# syms += 2

	punpcklbw %xmm6,%xmm6		# byte 1 = byte 0
	punpcklbw %xmm5,%xmm5
	pshuflw	$0,%xmm6,%xmm6		# copy low word to all four low words
	pshuflw	$0,%xmm5,%xmm5
	punpcklqdq %xmm6,%xmm6		# propagate to all 16 bytes
	punpcklqdq %xmm5,%xmm5
	# xmm6 now contains first symbol in each byte, xmm5 the second

	# eight groups of 16 butterflies, for a total of 128
	butterfly GROUP=0
	butterfly GROUP=1
	butterfly GROUP=2
	butterfly GROUP=3
	butterfly GROUP=4
	butterfly GROUP=5
	butterfly GROUP=6
	butterfly GROUP=7

	addl	$32,%edx		# bump decision pointer (8 words of 4 bytes)

	# see if we have to normalize
	movzbl	(%edi),%eax		# extract first output metric
	cmpl	$50,%eax		# is it greater than 50?
	jle	done			# No, no need to normalize

	# Normalize by finding the smallest metric and subtracting it
	# from all metrics.  First take the byte-wise minimum over the
	# sixteen 16-byte rows of the new metric array.
	movdqa	(%edi),%xmm0
	.irp	off,16,32,48,64,80,96,112,128,144,160,176,192,208,224,240
	pminub	\off(%edi),%xmm0
	.endr

	# crunch down to single lowest metric (ends up in the low byte)
	movdqa	%xmm0,%xmm1
	psrldq	$8,%xmm0		# the count to psrldq is bytes, not bits!
	pminub	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
	psrlq	$32,%xmm0
	pminub	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
	psrlq	$16,%xmm0
	pminub	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
	psrlq	$8,%xmm0
	pminub	%xmm1,%xmm0

	punpcklbw %xmm0,%xmm0		# lowest 2 bytes
	pshuflw	$0,%xmm0,%xmm0		# lowest 8 bytes
	punpcklqdq %xmm0,%xmm0		# all 16 bytes

	# xmm0 now contains lowest metric in all 16 bytes;
	# subtract it from every output metric
	.irp	off,0,16,32,48,64,80,96,112,128,144,160,176,192,208,224,240
	movdqa	\off(%edi),%xmm1
	psubusb	%xmm0,%xmm1
	movdqa	%xmm1,\off(%edi)
	.endr

done:
	# swap metrics: this iteration's output is the next one's input
	xchgl	%esi,%edi
	jmp	1b

2:	movl	8(%ebp),%ebx		# ebx = vp
	# stash metric pointers
	movl	%esi,OLDMETRICS(%ebx)
	movl	%edi,NEWMETRICS(%ebx)
	movl	%edx,DP(%ebx)		# stash incremented value of vp->dp
	xorl	%eax,%eax		# return 0
err:	popl	%ebx
	popl	%edx
	popl	%edi
	popl	%esi
	popl	%ebp
	ret
	.size	update_viterbi29_blk_sse2,.-update_viterbi29_blk_sse2

# Constant vector with 31 in every byte.  update_viterbi29_blk_sse2 loads
# it with movdqa (hence the 16-byte alignment) and uses it both to mask
# branch metrics down to five bits and to invert them.  It is only ever
# read, so it is placed in read-only data.
	.section .rodata
	.align	16
thirtyones:
	.byte	31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
