blob: 26d961439ab0c8de3d717a26c3f75c598c8f4c82 [file] [log] [blame]
Richard Kuoc1502902011-10-31 18:38:38 -05001/*
2 * Copyright (c) 2011 Code Aurora Forum. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 and
6 * only version 2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA.
17 */
18
19
20/* HEXAGON assembly optimized memset */
21/* Replaces the standard library function memset */
22
23
/*
 * HEXAGON_OPT_FUNC_BEGIN name
 *
 * Opens a function: switches to the text section, aligns to a 16-byte
 * boundary, exports \name as a global symbol of type function and
 * places the \name label at the current location.
 */
 .macro HEXAGON_OPT_FUNC_BEGIN name
 .text
 .balign 16
 .type \name, @function
 .global \name
\name:
 .endm
31
/*
 * HEXAGON_OPT_FUNC_FINISH name
 *
 * Closes a function: sets the symbol size of \name to the distance
 * between the \name label and the current location.
 */
 .macro HEXAGON_OPT_FUNC_FINISH name
 .size \name, .-\name
 .endm
35
36/* FUNCTION: memset (v2 version) */
37#if __HEXAGON_ARCH__ < 3
/*
 * memset for Hexagon architecture versions below 3.
 *
 * Register use, as shown by the code below:
 *   r0    destination pointer; never written here, so it is still
 *         intact when the routine leaves through jumpr r31
 *   r1    fill value; r4 = vsplatb(r1) is the pattern that gets stored
 *   r2    bytes remaining (low word of the r3:2 pair, r3 starts at 0)
 *   r5:4  doubleword store pattern (r5 is a copy of r4)
 *   r7:6  amount subtracted from r3:2 after a store (r7 is 0)
 *   r8    running store pointer
 *   r9    8 - (r0 & 7); bits 0..2 select the leading byte, halfword
 *         and word stores
 *   r10   trip count of the doubleword loop (r2 >> 3)
 *
 * Outline: a count of 0 returns at once; a count of 7 or less is
 * written by a byte loop; otherwise leading byte/halfword/word stores
 * are issued as selected by r9, followed by a doubleword loop and then
 * trailing word/halfword/byte stores.
 *
 * NOTE(review): several packets assign a predicate and also test that
 * same predicate with a plain (non-.new) "if".  The code presumably
 * relies on the test seeing the value from before the packet, i.e. the
 * predicate produced one packet earlier -- confirm against the Hexagon
 * Programmer's Reference Manual before reordering anything.
 */
38HEXAGON_OPT_FUNC_BEGIN memset
39 {
40 r6 = #8
 /* r7 = low three bits of the destination address */
41 r7 = extractu(r0, #3 , #0)
 /* p0: count is zero; p1: count is 8 or more */
42 p0 = cmp.eq(r2, #0)
43 p1 = cmp.gtu(r2, #7)
44 }
45 {
 /* r4 = store pattern derived from the fill value in r1 */
46 r4 = vsplatb(r1)
47 r8 = r0 /* leave r0 intact for return val */
48 r9 = sub(r6, r7) /* bytes until double alignment */
49 if p0 jumpr r31 /* count == 0, so return */
50 }
51 {
 /* zero the high words of the r3:2 and r7:6 pairs used by sub() below */
52 r3 = #0
53 r7 = #0
 /* bit 0 of r9: selects the leading byte store handled at 2: */
54 p0 = tstbit(r9, #0)
55 if p1 jump 2f /* skip byte loop */
56 }
57
58/* less than 8 bytes to set, so just set a byte at a time and return */
59
 /* hardware loop starting at 1: with r2 as its count */
60 loop0(1f, r2) /* byte loop */
61 .falign
621: /* byte loop */
63 {
64 memb(r8++#1) = r4
65 }:endloop0
66 jumpr r31
67 .falign
682: /* skip byte loop */
 /*
  * Leading byte stage.  r6 = 1 is the size taken off the count if the
  * byte is stored; bit 1 of r9 is tested here for the halfword stage.
  */
69 {
70 r6 = #1
71 p0 = tstbit(r9, #1)
72 p1 = cmp.eq(r2, #1)
73 if !p0 jump 3f /* skip initial byte store */
74 }
 /* store one byte, count -= 1, return if p1 (count was exactly 1) */
75 {
76 memb(r8++#1) = r4
77 r3:2 = sub(r3:2, r7:6)
78 if p1 jumpr r31
79 }
80 .falign
813: /* skip initial byte store */
 /*
  * Leading halfword stage.  r6 = 2 is the size taken off the count;
  * bit 2 of r9 is tested here for the word stage.
  */
82 {
83 r6 = #2
84 p0 = tstbit(r9, #2)
85 p1 = cmp.eq(r2, #2)
86 if !p0 jump 4f /* skip initial half store */
87 }
 /* store one halfword, count -= 2, return if p1 (count was exactly 2) */
88 {
89 memh(r8++#2) = r4
90 r3:2 = sub(r3:2, r7:6)
91 if p1 jumpr r31
92 }
93 .falign
944: /* skip initial half store */
 /*
  * Leading word stage.  r6 = 4 is the size taken off the count.
  * p0 = (count > 7) is presumably the value tested at 5: when the
  * word store is skipped.
  */
95 {
96 r6 = #4
97 p0 = cmp.gtu(r2, #7)
98 p1 = cmp.eq(r2, #4)
99 if !p0 jump 5f /* skip initial word store */
100 }
 /*
  * Store one word, count -= 4, return if p1 (count was exactly 4).
  * The compare uses 11 rather than 7, presumably because r2 as read
  * in this packet is still the value before the 4 is subtracted.
  */
101 {
102 memw(r8++#4) = r4
103 r3:2 = sub(r3:2, r7:6)
104 p0 = cmp.gtu(r2, #11)
105 if p1 jumpr r31
106 }
107 .falign
1085: /* skip initial word store */
 /*
  * r10 = number of whole doublewords left in the count.
  * r3 was set to 0 on entry, so comparing it with 1 presumably just
  * clears p1 before it is tested at 7: -- TODO confirm.
  */
109 {
110 r10 = lsr(r2, #3)
111 p1 = cmp.eq(r3, #1)
112 if !p0 jump 7f /* skip double loop */
113 }
 /* build the r5:4 pattern pair, set the step to 8 and arm the loop */
114 {
115 r5 = r4
116 r6 = #8
117 loop0(6f, r10) /* double loop */
118 }
119
120/* set bytes a double word at a time */
121
122 .falign
1236: /* double loop */
 /*
  * Store 8 bytes and take 8 off the count.  p1 records r2 == 8,
  * presumably using the value before the subtraction, i.e. "this
  * store used up the last 8 bytes".
  */
124 {
125 memd(r8++#8) = r5:4
126 r3:2 = sub(r3:2, r7:6)
127 p1 = cmp.eq(r2, #8)
128 }:endloop0
129 .falign
1307: /* skip double loop */
 /* return if p1; otherwise test bit 2 of the count for the word stage */
131 {
132 p0 = tstbit(r2, #2)
133 if p1 jumpr r31
134 }
 /*
  * Trailing word stage.  r6 = 4 is the size taken off the count;
  * bit 1 of the count is tested here for the halfword stage.
  */
135 {
136 r6 = #4
137 p0 = tstbit(r2, #1)
138 p1 = cmp.eq(r2, #4)
139 if !p0 jump 8f /* skip final word store */
140 }
 /* store one word, count -= 4, return if p1 (count was exactly 4) */
141 {
142 memw(r8++#4) = r4
143 r3:2 = sub(r3:2, r7:6)
144 if p1 jumpr r31
145 }
146 .falign
1478: /* skip final word store */
 /* trailing halfword stage; p1 = count is exactly 2 */
148 {
149 p1 = cmp.eq(r2, #2)
150 if !p0 jump 9f /* skip final half store */
151 }
 /* store one halfword; return if p1, else fall through to the last byte */
152 {
153 memh(r8++#2) = r4
154 if p1 jumpr r31
155 }
156 .falign
1579: /* skip final half store */
 /* store the last byte and return */
158 {
159 memb(r8++#1) = r4
160 jumpr r31
161 }
162HEXAGON_OPT_FUNC_FINISH memset
163#endif
164
165
166/* FUNCTION: memset (v3 and higher version) */
167#if __HEXAGON_ARCH__ >= 3
/*
 * memset for Hexagon architecture version 3 and above.
 *
 * Register use, as shown by the code below:
 *   r0    destination pointer; never written by this routine
 *   r1    fill value; used directly by the byte stores and tested
 *         against 0 to pick the 32-byte block loop
 *   r2    bytes remaining
 *   r7    vsplatb(r1), the halfword/word store pattern
 *   r5:4  combine(r7,r7), the doubleword store pattern
 *   r6    running store pointer (the short byte loop uses r3 instead)
 *   r3    scratch: (r6 & 31) alignment test, then 32-byte block count
 *   r8    trip count of the trailing doubleword loop
 *
 * Outline: a count of 0 returns at once; a count of 8 or less is
 * written by a byte loop; otherwise leading byte/halfword/word stores
 * are issued according to address bits 0..2.  When more than 127 bytes
 * remain, up to three doubleword stores advance r6 until (r6 & 31) is
 * 0 and the bulk is handled 32 bytes per iteration with dczeroa.  A
 * doubleword loop and word/halfword/byte stores finish the tail.
 *
 * NOTE(review): dczeroa presumably zero-allocates the 32-byte cache
 * line addressed by r6 -- confirm in the Hexagon Programmer's
 * Reference Manual.
 */
168HEXAGON_OPT_FUNC_BEGIN memset
169 {
170 r7=vsplatb(r1)
171 r6 = r0
 /* nothing to do for a zero count */
172 if (r2==#0) jump:nt .L1
173 }
174 {
175 r5:4=combine(r7,r7)
 /* more than 8 bytes: take the aligned path at .L3 */
176 p0 = cmp.gtu(r2,#8)
177 if (p0.new) jump:nt .L3
178 }
 /* 8 bytes or fewer: hardware loop of r2 single-byte stores through r3 */
179 {
180 r3 = r0
181 loop0(.L47,r2)
182 }
183 .falign
184.L47:
185 {
186 memb(r3++#1) = r1
187 }:endloop0 /* start=.L47 */
188 jumpr r31
189.L3:
 /* leading byte: needed only when bit 0 of the destination is set */
190 {
191 p0 = tstbit(r0,#0)
192 if (!p0.new) jump:nt .L8
193 p1 = cmp.eq(r2, #1)
194 }
 /* store one byte at r0, continue from r0 + 1 with count - 1 */
195 {
196 r6 = add(r0, #1)
197 r2 = add(r2,#-1)
198 memb(r0) = r1
199 if (p1) jump .L1
200 }
201.L8:
 /* leading halfword: needed only when bit 1 of r6 is set */
202 {
203 p0 = tstbit(r6,#1)
204 if (!p0.new) jump:nt .L10
205 }
 /* store one halfword, count -= 2; leave if the count compared equal to 2 */
206 {
207 r2 = add(r2,#-2)
208 memh(r6++#2) = r7
209 p0 = cmp.eq(r2, #2)
210 if (p0.new) jump:nt .L1
211 }
212.L10:
 /* leading word: needed only when bit 2 of r6 is set */
213 {
214 p0 = tstbit(r6,#2)
215 if (!p0.new) jump:nt .L12
216 }
 /* store one word, count -= 4; leave if the count compared equal to 4 */
217 {
218 r2 = add(r2,#-4)
219 memw(r6++#4) = r7
220 p0 = cmp.eq(r2, #4)
221 if (p0.new) jump:nt .L1
222 }
223.L12:
 /* 127 bytes or fewer left: skip the 32-byte block path */
224 {
225 p0 = cmp.gtu(r2,#127)
226 if (!p0.new) jump:nt .L14
227 }
 /*
  * Advance r6 with doubleword stores until (r6 & 31) == 0.  The
  * test-and-store step is unrolled three times.
  */
228 r3 = and(r6,#31)
229 if (r3==#0) jump:nt .L17
230 {
231 memd(r6++#8) = r5:4
232 r2 = add(r2,#-8)
233 }
234 r3 = and(r6,#31)
235 if (r3==#0) jump:nt .L17
236 {
237 memd(r6++#8) = r5:4
238 r2 = add(r2,#-8)
239 }
240 r3 = and(r6,#31)
241 if (r3==#0) jump:nt .L17
242 {
243 memd(r6++#8) = r5:4
244 r2 = add(r2,#-8)
245 }
246.L17:
 /* r3 = number of 32-byte blocks; a non-zero r1 uses the loop at .L18 */
247 {
248 r3 = lsr(r2,#5)
249 if (r1!=#0) jump:nt .L18
250 }
 /*
  * r1 == 0 path: arm a loop over the 32-byte blocks.
  * NOTE(review): loop0 presumably reads r3 as it was before this
  * packet (the block count).  The values written to r8 and r3 here do
  * not appear to be read again; r8 is reassigned at .L14 -- confirm.
  */
251 {
252 r8 = r3
253 r3 = r6
254 loop0(.L46,r3)
255 }
256 .falign
257.L46:
 /* one dczeroa per block; pointer += 32, count -= 32 */
258 {
259 dczeroa(r6)
260 r6 = add(r6,#32)
261 r2 = add(r2,#-32)
262 }:endloop0 /* start=.L46 */
263.L14:
 /* fewer than 8 bytes left: go to the tail; r8 = whole doublewords left */
264 {
265 p0 = cmp.gtu(r2,#7)
266 if (!p0.new) jump:nt .L28
267 r8 = lsr(r2,#3)
268 }
269 loop0(.L44,r8)
270 .falign
271.L44:
 /* store 8 bytes per iteration, count -= 8 */
272 {
273 memd(r6++#8) = r5:4
274 r2 = add(r2,#-8)
275 }:endloop0 /* start=.L44 */
276.L28:
 /* trailing word: stored when bit 2 of the remaining count is set */
277 {
278 p0 = tstbit(r2,#2)
279 if (!p0.new) jump:nt .L33
280 }
281 {
282 r2 = add(r2,#-4)
283 memw(r6++#4) = r7
284 }
285.L33:
 /* trailing halfword: stored when bit 1 of the remaining count is set */
286 {
287 p0 = tstbit(r2,#1)
288 if (!p0.new) jump:nt .L35
289 }
290 {
291 r2 = add(r2,#-2)
292 memh(r6++#2) = r7
293 }
294.L35:
 /* trailing byte: stored only when exactly one byte remains */
295 p0 = cmp.eq(r2,#1)
296 if (p0) memb(r6) = r1
297.L1:
 /* common exit; r0 has not been modified */
298 jumpr r31
299.L18:
 /*
  * r1 != 0 path: r3 blocks of 32 bytes.  Each iteration issues
  * dczeroa on the block and then writes it with four doubleword
  * stores; the count drops by 32 per iteration.
  */
300 loop0(.L45,r3)
301 .falign
302.L45:
303 dczeroa(r6)
304 {
305 memd(r6++#8) = r5:4
306 r2 = add(r2,#-32)
307 }
308 memd(r6++#8) = r5:4
309 memd(r6++#8) = r5:4
310 {
311 memd(r6++#8) = r5:4
312 }:endloop0 /* start=.L45 */
 /* finish the remaining bytes with the doubleword loop and tail */
313 jump .L14
314HEXAGON_OPT_FUNC_FINISH memset
315#endif