blob: d7d2149222d040b841b86900c15fbb605369095a [file] [log] [blame]
/* Intel SIMD SSE implementation of Viterbi ACS butterflies
   for 256-state (k=9) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits);
*/
# Integer SIMD on the 64-bit MMX registers, using the SSE instruction set
# A Pentium III or later CPU is required

# Byte offsets of the struct v29 members used by this file
# (struct v29 itself is defined in viterbi29.h)
	.equ	DP, 512
	.equ	OLDMETRICS, 516
	.equ	NEWMETRICS, 520

	.text
	.globl	update_viterbi29_blk_sse
	.globl	Branchtab29_sse
	.type	update_viterbi29_blk_sse, @function
	.p2align 4
18
/*
 * butterfly GROUP
 *
 * Performs 8 add-compare-select butterflies in parallel.
 *
 * In:    %mm6 = first received symbol replicated into all 8 bytes
 *        %mm5 = second received symbol replicated into all 8 bytes
 *        %mm7 = 31 in every byte (see thirtyones)
 *        %esi -> old path metrics, %edi -> new path metrics
 *        Branchtab29_sse = external table; entries for GROUP are read at
 *                          byte offsets 8*GROUP and 128+8*GROUP
 * Out:   16 new path metrics stored at 16*GROUP(%edi)
 *        16 decision bits ORed into %ebx at bit (16*GROUP)&31
 * Clobb: %eax, %mm0-%mm4, flags
 */
	.macro	butterfly GROUP
	# Branch metrics: XOR each table entry with the received symbol,
	# average the two results, then scale down to 5 bits.
	movq	Branchtab29_sse+(8*\GROUP),%mm4
	movq	Branchtab29_sse+128+(8*\GROUP),%mm3
	pxor	%mm6,%mm4
	pxor	%mm5,%mm3
	pavgb	%mm3,%mm4
	psrlw	$3,%mm4			# word shift leaks bits between bytes ...
	pand	%mm7,%mm4		# ... so mask every byte back to 5 bits

	movq	(8*\GROUP)(%esi),%mm0	# incoming path metrics, high state bit = 0
	movq	((8*\GROUP)+128)(%esi),%mm3 # incoming path metrics, high state bit = 1
	movq	%mm0,%mm2
	movq	%mm3,%mm1
	paddusb	%mm4,%mm0
	paddusb	%mm4,%mm3

	# Invert the branch metrics (31 - metric). XOR with 31 only works
	# because the metrics are 5 bits wide.
	pxor	%mm7,%mm4

	paddusb	%mm4,%mm1
	paddusb	%mm4,%mm2

	# Survivors (smaller candidate metric) end up in mm0 and mm2
	pminub	%mm1,%mm0
	pminub	%mm3,%mm2
	# Decisions: 0xff in mm1/mm3 where the "high bit = 1" candidate
	# equals the survivor
	pcmpeqb	%mm0,%mm1
	pcmpeqb	%mm2,%mm3

	# Interleave the two survivor sets and store the new path metrics
	movq	%mm0,%mm4
	punpckhbw %mm2,%mm0		# upper 8 interleaved metrics
	punpcklbw %mm2,%mm4		# lower 8 interleaved metrics
	movq	%mm0,(16*\GROUP+8)(%edi)
	movq	%mm4,(16*\GROUP)(%edi)

	# Interleave the decisions the same way and pack them into %ebx
	movq	%mm1,%mm4
	punpckhbw %mm3,%mm1
	punpcklbw %mm3,%mm4
	# pmovmskb is emitted as raw bytes: because of an error in the Intel
	# instruction set reference (register fields swapped), gas assembled
	# the mnemonic incorrectly. See
	# http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
	.byte	0x0f,0xd7,0xc1		# pmovmskb %mm1,%eax
	shll	$((16*\GROUP+8)&31),%eax
	orl	%eax,%ebx
	.byte	0x0f,0xd7,0xc4		# pmovmskb %mm4,%eax
	shll	$((16*\GROUP)&31),%eax
	orl	%eax,%ebx
	.endm

/*
 * update_viterbi29_blk_sse
 *
 * Runs the Viterbi add-compare-select step for nbits input bits of the
 * 256-state (k=9) code, 128 butterflies per bit.
 *
 * C prototype (as given in the file header):
 *   void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits);
 *
 * ABI:   32-bit x86, all arguments on the stack
 * In:    8(%ebp)  = vp     decoder state; the fields at offsets DP,
 *                          OLDMETRICS and NEWMETRICS are read and written back
 *        12(%ebp) = syms   received symbols, two bytes consumed per bit
 *        16(%ebp) = nbits  number of bits to process; <= 0 processes none
 * Out:   %eax = 0 on the normal path, -1 if vp is NULL (the prototype says
 *        void, so callers may ignore it)
 *        Per bit: 256 new path metrics written, 32 bytes of decisions
 *        written at vp->dp, the old/new metric pointers swapped.
 *        On return the updated pointers are stored back into *vp.
 * Notes: The argument slots 12(%ebp) and 16(%ebp) are used as loop state
 *        and are overwritten. emms is executed before the normal return;
 *        the NULL path returns before any MMX instruction is executed.
 *        %esi, %edi, %edx and %ebx are saved and restored.
 */
update_viterbi29_blk_sse:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	pushl	%edx
	pushl	%ebx

	movl	8(%ebp),%edx		# edx = vp
	testl	%edx,%edx
	jnz	0f
	movl	$-1,%eax		# NULL vp: return -1 ('$' makes it an immediate;
					# without it this was a load from address -1)
	jmp	err
0:	movl	OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl	NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl	DP(%edx),%edx		# edx -> decisions

	# ---- per-bit loop: nbits is counted down in its stack slot ----
1:	movl	16(%ebp),%eax		# eax = bits remaining
	decl	%eax
	jl	2f			# went below zero: all bits processed
	movl	%eax,16(%ebp)

	movl	12(%ebp),%ebx		# ebx = syms
	movzbl	(%ebx),%eax
	movd	%eax,%mm6		# mm6[0] = first symbol
	movzbl	1(%ebx),%eax
	movd	%eax,%mm5		# mm5[0] = second symbol
	addl	$2,%ebx
	movl	%ebx,12(%ebp)		# advance syms past this symbol pair

	punpcklbw %mm6,%mm6		# duplicate byte 0 into byte 1
	punpcklbw %mm5,%mm5

	movq	thirtyones,%mm7		# reloaded every bit: normalization trashes mm7
	pshufw	$0,%mm6,%mm6		# copy low word to the upper three words
	pshufw	$0,%mm5,%mm5
	# mm6 now holds the first symbol in every byte, mm5 the second

	# Sixteen macro invocations = 128 butterflies. Each pair of groups
	# fills one 32-bit word of decisions.
	xorl	%ebx,%ebx
	butterfly GROUP=0
	butterfly GROUP=1
	movl	%ebx,(%edx)		# decision bits 0-31
	xorl	%ebx,%ebx
	butterfly GROUP=2
	butterfly GROUP=3
	movl	%ebx,4(%edx)		# decision bits 32-63
	xorl	%ebx,%ebx
	butterfly GROUP=4
	butterfly GROUP=5
	movl	%ebx,8(%edx)		# decision bits 64-95
	xorl	%ebx,%ebx
	butterfly GROUP=6
	butterfly GROUP=7
	movl	%ebx,12(%edx)		# decision bits 96-127
	xorl	%ebx,%ebx
	butterfly GROUP=8
	butterfly GROUP=9
	movl	%ebx,16(%edx)		# decision bits 128-159
	xorl	%ebx,%ebx
	butterfly GROUP=10
	butterfly GROUP=11
	movl	%ebx,20(%edx)		# decision bits 160-191
	xorl	%ebx,%ebx
	butterfly GROUP=12
	butterfly GROUP=13
	movl	%ebx,24(%edx)		# decision bits 192-223
	xorl	%ebx,%ebx
	butterfly GROUP=14
	butterfly GROUP=15
	movl	%ebx,28(%edx)		# decision bits 224-255

	addl	$32,%edx		# bump decision pointer

	# Normalize only when the first new metric has grown past 50
	movzbl	(%edi),%eax
	cmpl	$50,%eax
	jle	done

	# Normalize: find the smallest of the 256 new metrics ...
	movq	(%edi),%mm0
	.irp	OFF,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248
	pminub	\OFF(%edi),%mm0
	.endr
	# mm0 = byte-wise minimum of all 32 quadwords; fold the 8 bytes
	# down so that byte 0 holds the overall minimum
	movq	%mm0,%mm1
	psrlq	$32,%mm0
	pminub	%mm1,%mm0
	movq	%mm0,%mm1
	psrlq	$16,%mm0
	pminub	%mm1,%mm0
	movq	%mm0,%mm1
	psrlq	$8,%mm0
	pminub	%mm1,%mm0
	punpcklbw %mm0,%mm0		# replicate byte 0 into all 8 bytes
	pshufw	$0,%mm0,%mm0

	# ... and subtract it from every new metric (mm7 is scratch here)
	.irp	OFF,0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248
	movq	\OFF(%edi),%mm7
	psubusb	%mm0,%mm7
	movq	%mm7,\OFF(%edi)
	.endr

done:
	# Swap the metric buffers: this bit's output is the next bit's input
	movl	%esi,%eax
	movl	%edi,%esi
	movl	%eax,%edi
	jmp	1b

2:	emms				# leave MMX state clean for x87 code
	movl	8(%ebp),%ebx		# ebx = vp
	# stash metric pointers
	movl	%esi,OLDMETRICS(%ebx)
	movl	%edi,NEWMETRICS(%ebx)
	movl	%edx,DP(%ebx)		# stash incremented value of vp->dp
	xorl	%eax,%eax		# return 0
err:	popl	%ebx
	popl	%edx
	popl	%edi
	popl	%esi
	popl	%ebp
	ret
	.size	update_viterbi29_blk_sse, .-update_viterbi29_blk_sse
265
# Eight bytes of 31 (0x1f). update_viterbi29_blk_sse loads this into %mm7
# to mask branch metrics to 5 bits and to invert them by XOR.
	.data
	.p2align 3
thirtyones:
	.fill	8, 1, 31
270
271