blob: 7050ab133b9da66561d53f46f07f5974a16ccf7c [file] [log] [blame]
David McCulloughf0be44f2012-09-07 04:17:02 +08001#define __ARM_ARCH__ __LINUX_ARM_ARCH__
2@ ====================================================================
3@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4@ project. The module is, however, dual licensed under OpenSSL and
5@ CRYPTOGAMS licenses depending on where you obtain it. For further
6@ details see http://www.openssl.org/~appro/cryptogams/.
7@ ====================================================================
8
9@ sha1_block procedure for ARMv4.
10@
11@ January 2007.
12
13@ Size/performance trade-off
14@ ====================================================================
15@ impl size in bytes comp cycles[*] measured performance
16@ ====================================================================
17@ thumb 304 3212 4420
18@ armv4-small 392/+29% 1958/+64% 2250/+96%
19@ armv4-compact 740/+89% 1552/+26% 1840/+22%
20@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
21@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
22@ ====================================================================
23@ thumb = same as 'small' but in Thumb instructions[**] and
24@ with recurring code in two private functions;
25@ small = detached Xload/update, loops are folded;
26@ compact = detached Xload/update, 5x unroll;
27@ large = interleaved Xload/update, 5x unroll;
28@ full unroll = interleaved Xload/update, full unroll, estimated[!];
29@
30@ [*] Manually counted instructions in "grand" loop body. Measured
31@ performance is affected by prologue and epilogue overhead,
32@ i-cache availability, branch penalties, etc.
33@ [**] While each Thumb instruction is twice smaller, they are not as
34@ diverse as ARM ones: e.g., there are only two arithmetic
35@ instructions with 3 arguments, no [fixed] rotate, addressing
36@ modes are limited. As result it takes more instructions to do
37@ the same job in Thumb, therefore the code is never twice as
38@ small and always slower.
39@ [***] which is also ~35% better than compiler generated code. Dual-
40@ issue Cortex A8 core was measured to process input block in
41@ ~990 cycles.
42
43@ August 2010.
44@
45@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
46@ Cortex A8 core and in absolute terms ~870 cycles per input block
47@ [or 13.6 cycles per byte].
48
49@ February 2011.
50@
51@ Profiler-assisted and platform-specific optimization resulted in 10%
52@ improvement on Cortex A8 core and 12.2 cycles per byte.
53
54.text
55
56.global sha1_block_data_order
57.type sha1_block_data_order,%function
58
59.align 2
60sha1_block_data_order:
61 stmdb sp!,{r4-r12,lr}
62 add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
63 ldmia r0,{r3,r4,r5,r6,r7}
64.Lloop:
65 ldr r8,.LK_00_19
66 mov r14,sp
67 sub sp,sp,#15*4
68 mov r5,r5,ror#30
69 mov r6,r6,ror#30
70 mov r7,r7,ror#30 @ [6]
71.L_00_15:
72#if __ARM_ARCH__<7
73 ldrb r10,[r1,#2]
74 ldrb r9,[r1,#3]
75 ldrb r11,[r1,#1]
76 add r7,r8,r7,ror#2 @ E+=K_00_19
77 ldrb r12,[r1],#4
78 orr r9,r9,r10,lsl#8
79 eor r10,r5,r6 @ F_xx_xx
80 orr r9,r9,r11,lsl#16
81 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
82 orr r9,r9,r12,lsl#24
83#else
84 ldr r9,[r1],#4 @ handles unaligned
85 add r7,r8,r7,ror#2 @ E+=K_00_19
86 eor r10,r5,r6 @ F_xx_xx
87 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
88#ifdef __ARMEL__
89 rev r9,r9 @ byte swap
90#endif
91#endif
92 and r10,r4,r10,ror#2
93 add r7,r7,r9 @ E+=X[i]
94 eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
95 str r9,[r14,#-4]!
96 add r7,r7,r10 @ E+=F_00_19(B,C,D)
97#if __ARM_ARCH__<7
98 ldrb r10,[r1,#2]
99 ldrb r9,[r1,#3]
100 ldrb r11,[r1,#1]
101 add r6,r8,r6,ror#2 @ E+=K_00_19
102 ldrb r12,[r1],#4
103 orr r9,r9,r10,lsl#8
104 eor r10,r4,r5 @ F_xx_xx
105 orr r9,r9,r11,lsl#16
106 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
107 orr r9,r9,r12,lsl#24
108#else
109 ldr r9,[r1],#4 @ handles unaligned
110 add r6,r8,r6,ror#2 @ E+=K_00_19
111 eor r10,r4,r5 @ F_xx_xx
112 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
113#ifdef __ARMEL__
114 rev r9,r9 @ byte swap
115#endif
116#endif
117 and r10,r3,r10,ror#2
118 add r6,r6,r9 @ E+=X[i]
119 eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
120 str r9,[r14,#-4]!
121 add r6,r6,r10 @ E+=F_00_19(B,C,D)
122#if __ARM_ARCH__<7
123 ldrb r10,[r1,#2]
124 ldrb r9,[r1,#3]
125 ldrb r11,[r1,#1]
126 add r5,r8,r5,ror#2 @ E+=K_00_19
127 ldrb r12,[r1],#4
128 orr r9,r9,r10,lsl#8
129 eor r10,r3,r4 @ F_xx_xx
130 orr r9,r9,r11,lsl#16
131 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
132 orr r9,r9,r12,lsl#24
133#else
134 ldr r9,[r1],#4 @ handles unaligned
135 add r5,r8,r5,ror#2 @ E+=K_00_19
136 eor r10,r3,r4 @ F_xx_xx
137 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
138#ifdef __ARMEL__
139 rev r9,r9 @ byte swap
140#endif
141#endif
142 and r10,r7,r10,ror#2
143 add r5,r5,r9 @ E+=X[i]
144 eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
145 str r9,[r14,#-4]!
146 add r5,r5,r10 @ E+=F_00_19(B,C,D)
147#if __ARM_ARCH__<7
148 ldrb r10,[r1,#2]
149 ldrb r9,[r1,#3]
150 ldrb r11,[r1,#1]
151 add r4,r8,r4,ror#2 @ E+=K_00_19
152 ldrb r12,[r1],#4
153 orr r9,r9,r10,lsl#8
154 eor r10,r7,r3 @ F_xx_xx
155 orr r9,r9,r11,lsl#16
156 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
157 orr r9,r9,r12,lsl#24
158#else
159 ldr r9,[r1],#4 @ handles unaligned
160 add r4,r8,r4,ror#2 @ E+=K_00_19
161 eor r10,r7,r3 @ F_xx_xx
162 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
163#ifdef __ARMEL__
164 rev r9,r9 @ byte swap
165#endif
166#endif
167 and r10,r6,r10,ror#2
168 add r4,r4,r9 @ E+=X[i]
169 eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
170 str r9,[r14,#-4]!
171 add r4,r4,r10 @ E+=F_00_19(B,C,D)
172#if __ARM_ARCH__<7
173 ldrb r10,[r1,#2]
174 ldrb r9,[r1,#3]
175 ldrb r11,[r1,#1]
176 add r3,r8,r3,ror#2 @ E+=K_00_19
177 ldrb r12,[r1],#4
178 orr r9,r9,r10,lsl#8
179 eor r10,r6,r7 @ F_xx_xx
180 orr r9,r9,r11,lsl#16
181 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
182 orr r9,r9,r12,lsl#24
183#else
184 ldr r9,[r1],#4 @ handles unaligned
185 add r3,r8,r3,ror#2 @ E+=K_00_19
186 eor r10,r6,r7 @ F_xx_xx
187 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
188#ifdef __ARMEL__
189 rev r9,r9 @ byte swap
190#endif
191#endif
192 and r10,r5,r10,ror#2
193 add r3,r3,r9 @ E+=X[i]
194 eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
195 str r9,[r14,#-4]!
196 add r3,r3,r10 @ E+=F_00_19(B,C,D)
197 teq r14,sp
198 bne .L_00_15 @ [((11+4)*5+2)*3]
199#if __ARM_ARCH__<7
200 ldrb r10,[r1,#2]
201 ldrb r9,[r1,#3]
202 ldrb r11,[r1,#1]
203 add r7,r8,r7,ror#2 @ E+=K_00_19
204 ldrb r12,[r1],#4
205 orr r9,r9,r10,lsl#8
206 eor r10,r5,r6 @ F_xx_xx
207 orr r9,r9,r11,lsl#16
208 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
209 orr r9,r9,r12,lsl#24
210#else
211 ldr r9,[r1],#4 @ handles unaligned
212 add r7,r8,r7,ror#2 @ E+=K_00_19
213 eor r10,r5,r6 @ F_xx_xx
214 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
215#ifdef __ARMEL__
216 rev r9,r9 @ byte swap
217#endif
218#endif
219 and r10,r4,r10,ror#2
220 add r7,r7,r9 @ E+=X[i]
221 eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
222 str r9,[r14,#-4]!
223 add r7,r7,r10 @ E+=F_00_19(B,C,D)
224 ldr r9,[r14,#15*4]
225 ldr r10,[r14,#13*4]
226 ldr r11,[r14,#7*4]
227 add r6,r8,r6,ror#2 @ E+=K_xx_xx
228 ldr r12,[r14,#2*4]
229 eor r9,r9,r10
230 eor r11,r11,r12 @ 1 cycle stall
231 eor r10,r4,r5 @ F_xx_xx
232 mov r9,r9,ror#31
233 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
234 eor r9,r9,r11,ror#31
235 str r9,[r14,#-4]!
236 and r10,r3,r10,ror#2 @ F_xx_xx
237 @ F_xx_xx
238 add r6,r6,r9 @ E+=X[i]
239 eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
240 add r6,r6,r10 @ E+=F_00_19(B,C,D)
241 ldr r9,[r14,#15*4]
242 ldr r10,[r14,#13*4]
243 ldr r11,[r14,#7*4]
244 add r5,r8,r5,ror#2 @ E+=K_xx_xx
245 ldr r12,[r14,#2*4]
246 eor r9,r9,r10
247 eor r11,r11,r12 @ 1 cycle stall
248 eor r10,r3,r4 @ F_xx_xx
249 mov r9,r9,ror#31
250 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
251 eor r9,r9,r11,ror#31
252 str r9,[r14,#-4]!
253 and r10,r7,r10,ror#2 @ F_xx_xx
254 @ F_xx_xx
255 add r5,r5,r9 @ E+=X[i]
256 eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
257 add r5,r5,r10 @ E+=F_00_19(B,C,D)
258 ldr r9,[r14,#15*4]
259 ldr r10,[r14,#13*4]
260 ldr r11,[r14,#7*4]
261 add r4,r8,r4,ror#2 @ E+=K_xx_xx
262 ldr r12,[r14,#2*4]
263 eor r9,r9,r10
264 eor r11,r11,r12 @ 1 cycle stall
265 eor r10,r7,r3 @ F_xx_xx
266 mov r9,r9,ror#31
267 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
268 eor r9,r9,r11,ror#31
269 str r9,[r14,#-4]!
270 and r10,r6,r10,ror#2 @ F_xx_xx
271 @ F_xx_xx
272 add r4,r4,r9 @ E+=X[i]
273 eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
274 add r4,r4,r10 @ E+=F_00_19(B,C,D)
275 ldr r9,[r14,#15*4]
276 ldr r10,[r14,#13*4]
277 ldr r11,[r14,#7*4]
278 add r3,r8,r3,ror#2 @ E+=K_xx_xx
279 ldr r12,[r14,#2*4]
280 eor r9,r9,r10
281 eor r11,r11,r12 @ 1 cycle stall
282 eor r10,r6,r7 @ F_xx_xx
283 mov r9,r9,ror#31
284 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
285 eor r9,r9,r11,ror#31
286 str r9,[r14,#-4]!
287 and r10,r5,r10,ror#2 @ F_xx_xx
288 @ F_xx_xx
289 add r3,r3,r9 @ E+=X[i]
290 eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
291 add r3,r3,r10 @ E+=F_00_19(B,C,D)
292
293 ldr r8,.LK_20_39 @ [+15+16*4]
294 sub sp,sp,#25*4
295 cmn sp,#0 @ [+3], clear carry to denote 20_39
296.L_20_39_or_60_79:
297 ldr r9,[r14,#15*4]
298 ldr r10,[r14,#13*4]
299 ldr r11,[r14,#7*4]
300 add r7,r8,r7,ror#2 @ E+=K_xx_xx
301 ldr r12,[r14,#2*4]
302 eor r9,r9,r10
303 eor r11,r11,r12 @ 1 cycle stall
304 eor r10,r5,r6 @ F_xx_xx
305 mov r9,r9,ror#31
306 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
307 eor r9,r9,r11,ror#31
308 str r9,[r14,#-4]!
309 eor r10,r4,r10,ror#2 @ F_xx_xx
310 @ F_xx_xx
311 add r7,r7,r9 @ E+=X[i]
312 add r7,r7,r10 @ E+=F_20_39(B,C,D)
313 ldr r9,[r14,#15*4]
314 ldr r10,[r14,#13*4]
315 ldr r11,[r14,#7*4]
316 add r6,r8,r6,ror#2 @ E+=K_xx_xx
317 ldr r12,[r14,#2*4]
318 eor r9,r9,r10
319 eor r11,r11,r12 @ 1 cycle stall
320 eor r10,r4,r5 @ F_xx_xx
321 mov r9,r9,ror#31
322 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
323 eor r9,r9,r11,ror#31
324 str r9,[r14,#-4]!
325 eor r10,r3,r10,ror#2 @ F_xx_xx
326 @ F_xx_xx
327 add r6,r6,r9 @ E+=X[i]
328 add r6,r6,r10 @ E+=F_20_39(B,C,D)
329 ldr r9,[r14,#15*4]
330 ldr r10,[r14,#13*4]
331 ldr r11,[r14,#7*4]
332 add r5,r8,r5,ror#2 @ E+=K_xx_xx
333 ldr r12,[r14,#2*4]
334 eor r9,r9,r10
335 eor r11,r11,r12 @ 1 cycle stall
336 eor r10,r3,r4 @ F_xx_xx
337 mov r9,r9,ror#31
338 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
339 eor r9,r9,r11,ror#31
340 str r9,[r14,#-4]!
341 eor r10,r7,r10,ror#2 @ F_xx_xx
342 @ F_xx_xx
343 add r5,r5,r9 @ E+=X[i]
344 add r5,r5,r10 @ E+=F_20_39(B,C,D)
345 ldr r9,[r14,#15*4]
346 ldr r10,[r14,#13*4]
347 ldr r11,[r14,#7*4]
348 add r4,r8,r4,ror#2 @ E+=K_xx_xx
349 ldr r12,[r14,#2*4]
350 eor r9,r9,r10
351 eor r11,r11,r12 @ 1 cycle stall
352 eor r10,r7,r3 @ F_xx_xx
353 mov r9,r9,ror#31
354 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
355 eor r9,r9,r11,ror#31
356 str r9,[r14,#-4]!
357 eor r10,r6,r10,ror#2 @ F_xx_xx
358 @ F_xx_xx
359 add r4,r4,r9 @ E+=X[i]
360 add r4,r4,r10 @ E+=F_20_39(B,C,D)
361 ldr r9,[r14,#15*4]
362 ldr r10,[r14,#13*4]
363 ldr r11,[r14,#7*4]
364 add r3,r8,r3,ror#2 @ E+=K_xx_xx
365 ldr r12,[r14,#2*4]
366 eor r9,r9,r10
367 eor r11,r11,r12 @ 1 cycle stall
368 eor r10,r6,r7 @ F_xx_xx
369 mov r9,r9,ror#31
370 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
371 eor r9,r9,r11,ror#31
372 str r9,[r14,#-4]!
373 eor r10,r5,r10,ror#2 @ F_xx_xx
374 @ F_xx_xx
375 add r3,r3,r9 @ E+=X[i]
376 add r3,r3,r10 @ E+=F_20_39(B,C,D)
377 teq r14,sp @ preserve carry
378 bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
379 bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
380
381 ldr r8,.LK_40_59
382 sub sp,sp,#20*4 @ [+2]
383.L_40_59:
384 ldr r9,[r14,#15*4]
385 ldr r10,[r14,#13*4]
386 ldr r11,[r14,#7*4]
387 add r7,r8,r7,ror#2 @ E+=K_xx_xx
388 ldr r12,[r14,#2*4]
389 eor r9,r9,r10
390 eor r11,r11,r12 @ 1 cycle stall
391 eor r10,r5,r6 @ F_xx_xx
392 mov r9,r9,ror#31
393 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
394 eor r9,r9,r11,ror#31
395 str r9,[r14,#-4]!
396 and r10,r4,r10,ror#2 @ F_xx_xx
397 and r11,r5,r6 @ F_xx_xx
398 add r7,r7,r9 @ E+=X[i]
399 add r7,r7,r10 @ E+=F_40_59(B,C,D)
400 add r7,r7,r11,ror#2
401 ldr r9,[r14,#15*4]
402 ldr r10,[r14,#13*4]
403 ldr r11,[r14,#7*4]
404 add r6,r8,r6,ror#2 @ E+=K_xx_xx
405 ldr r12,[r14,#2*4]
406 eor r9,r9,r10
407 eor r11,r11,r12 @ 1 cycle stall
408 eor r10,r4,r5 @ F_xx_xx
409 mov r9,r9,ror#31
410 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
411 eor r9,r9,r11,ror#31
412 str r9,[r14,#-4]!
413 and r10,r3,r10,ror#2 @ F_xx_xx
414 and r11,r4,r5 @ F_xx_xx
415 add r6,r6,r9 @ E+=X[i]
416 add r6,r6,r10 @ E+=F_40_59(B,C,D)
417 add r6,r6,r11,ror#2
418 ldr r9,[r14,#15*4]
419 ldr r10,[r14,#13*4]
420 ldr r11,[r14,#7*4]
421 add r5,r8,r5,ror#2 @ E+=K_xx_xx
422 ldr r12,[r14,#2*4]
423 eor r9,r9,r10
424 eor r11,r11,r12 @ 1 cycle stall
425 eor r10,r3,r4 @ F_xx_xx
426 mov r9,r9,ror#31
427 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
428 eor r9,r9,r11,ror#31
429 str r9,[r14,#-4]!
430 and r10,r7,r10,ror#2 @ F_xx_xx
431 and r11,r3,r4 @ F_xx_xx
432 add r5,r5,r9 @ E+=X[i]
433 add r5,r5,r10 @ E+=F_40_59(B,C,D)
434 add r5,r5,r11,ror#2
435 ldr r9,[r14,#15*4]
436 ldr r10,[r14,#13*4]
437 ldr r11,[r14,#7*4]
438 add r4,r8,r4,ror#2 @ E+=K_xx_xx
439 ldr r12,[r14,#2*4]
440 eor r9,r9,r10
441 eor r11,r11,r12 @ 1 cycle stall
442 eor r10,r7,r3 @ F_xx_xx
443 mov r9,r9,ror#31
444 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
445 eor r9,r9,r11,ror#31
446 str r9,[r14,#-4]!
447 and r10,r6,r10,ror#2 @ F_xx_xx
448 and r11,r7,r3 @ F_xx_xx
449 add r4,r4,r9 @ E+=X[i]
450 add r4,r4,r10 @ E+=F_40_59(B,C,D)
451 add r4,r4,r11,ror#2
452 ldr r9,[r14,#15*4]
453 ldr r10,[r14,#13*4]
454 ldr r11,[r14,#7*4]
455 add r3,r8,r3,ror#2 @ E+=K_xx_xx
456 ldr r12,[r14,#2*4]
457 eor r9,r9,r10
458 eor r11,r11,r12 @ 1 cycle stall
459 eor r10,r6,r7 @ F_xx_xx
460 mov r9,r9,ror#31
461 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
462 eor r9,r9,r11,ror#31
463 str r9,[r14,#-4]!
464 and r10,r5,r10,ror#2 @ F_xx_xx
465 and r11,r6,r7 @ F_xx_xx
466 add r3,r3,r9 @ E+=X[i]
467 add r3,r3,r10 @ E+=F_40_59(B,C,D)
468 add r3,r3,r11,ror#2
469 teq r14,sp
470 bne .L_40_59 @ [+((12+5)*5+2)*4]
471
472 ldr r8,.LK_60_79
473 sub sp,sp,#20*4
474 cmp sp,#0 @ set carry to denote 60_79
475 b .L_20_39_or_60_79 @ [+4], spare 300 bytes
476.L_done:
477 add sp,sp,#80*4 @ "deallocate" stack frame
478 ldmia r0,{r8,r9,r10,r11,r12}
479 add r3,r8,r3
480 add r4,r9,r4
481 add r5,r10,r5,ror#2
482 add r6,r11,r6,ror#2
483 add r7,r12,r7,ror#2
484 stmia r0,{r3,r4,r5,r6,r7}
485 teq r1,r2
486 bne .Lloop @ [+18], total 1307
487
488#if __ARM_ARCH__>=5
489 ldmia sp!,{r4-r12,pc}
490#else
491 ldmia sp!,{r4-r12,lr}
492 tst lr,#1
493 moveq pc,lr @ be binary compatible with V4, yet
494 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
495#endif
496.align 2
497.LK_00_19: .word 0x5a827999
498.LK_20_39: .word 0x6ed9eba1
499.LK_40_59: .word 0x8f1bbcdc
500.LK_60_79: .word 0xca62c1d6
501.size sha1_block_data_order,.-sha1_block_data_order
502.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
503.align 2