/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.code 32
.fpu neon

.text


/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/* Constants */

#define K1 0x5A827999
#define K2 0x6ED9EBA1
#define K3 0x8F1BBCDC
#define K4 0xCA62C1D6
.align 4
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4


/* Register macros */

#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr

#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

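/*
 * ARM_LE() emits its argument only on little-endian builds.  It wraps the
 * vrev32.8 byte swaps used below: SHA-1 message words are big-endian, so
 * they must be byte-reversed on a little-endian kernel but can be used as
 * loaded on a big-endian one.
 */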
#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)	code
#endif

/* Round function macros. */
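/*
 * Each _R_Fx macro computes one scalar SHA-1 round: F1 is the Ch function
 * ((b & c) | (~b & d)), F2 and F4 are Parity (b ^ c ^ d) and F3 is Maj
 * ((b & c) | (b & d) | (c & d)).  WK_offs(i) indexes the 16-word ring
 * buffer of precalculated W[i]+K values kept on the stack.  The pre1/pre2/
 * pre3 arguments are hooks through which the NEON message-schedule steps
 * for later rounds are interleaved with the scalar instructions; 'dummy'
 * is passed when a round has no precalculation work scheduled.
 */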

#define WK_offs(i) (((i) & 15) * 4)

#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	bic RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	and RT1, c, b; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add RT0, RT0, RT3; \
	add e, e, RT1; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	eor RT0, RT0, c; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT3; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0; \

#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, b, c; \
	and RT1, b, c; \
	add e, e, a, ror #(32 - 5); \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	and RT0, RT0, d; \
	add RT1, RT1, RT3; \
	add e, e, RT0; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT1;

#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
	   W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define R(a,b,c,d,e,f,i) \
	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define dummy(...)


/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/
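/*
 * W_PRECALC_00_15() loads one 64-byte block, byte-swaps each 32-bit word on
 * little-endian builds, adds the round constant curK and stores the sixteen
 * W[i]+K words to the stack workspace.  The WPRECALC_00_15_x macros emit the
 * same sequence one instruction at a time, so it can be interleaved with
 * rounds 64-79 of the current block to prepare the next block.
 */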

#define W_PRECALC_00_15() \
	add RWK, sp, #(WK_offs(0)); \
	\
	vld1.32 {W0, W7}, [RDATA]!; \
 ARM_LE(vrev32.8 W0, W0;	)	/* big => little */ \
	vld1.32 {W6, W5}, [RDATA]!; \
	vadd.u32 tmp0, W0, curK; \
 ARM_LE(vrev32.8 W7, W7;	)	/* big => little */ \
 ARM_LE(vrev32.8 W6, W6;	)	/* big => little */ \
	vadd.u32 tmp1, W7, curK; \
 ARM_LE(vrev32.8 W5, W5;	)	/* big => little */ \
	vadd.u32 tmp2, W6, curK; \
	vst1.32 {tmp0, tmp1}, [RWK]!; \
	vadd.u32 tmp3, W5, curK; \
	vst1.32 {tmp2, tmp3}, [RWK]; \

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32 {W0, W7}, [RDATA]!; \

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(0)); \

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W0, W0;	)	/* big => little */ \

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32 {W6, W5}, [RDATA]!; \

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W0, curK; \

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W7, W7;	)	/* big => little */ \

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W6, W6;	)	/* big => little */ \

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp1, W7, curK; \

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W5, W5;	)	/* big => little */ \

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp2, W6, curK; \

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0, tmp1}, [RWK]!; \

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp3, W5, curK; \

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp2, tmp3}, [RWK]; \


/********* Precalc macros for rounds 16-31 ************************************/
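/*
 * Rounds 16-31 use the standard recurrence
 * W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1), computed four words
 * at a time.  The fourth word depends on the first word of the same vector,
 * so it is first computed with that input zeroed and the missing
 * contribution is XORed in afterwards via the extra tmp1 shift/rotate steps.
 */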

#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp0, tmp0; \
	vext.8 W, W_m16, W_m12, #8; \

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(i)); \
	vext.8 tmp0, W_m04, tmp0, #4; \

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp0, tmp0, W_m16; \
	veor.32 W, W, W_m08; \

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp1, tmp1; \
	veor W, W, tmp0; \

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp0, W, #1; \

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8 tmp1, tmp1, W, #(16-12); \
	vshr.u32 W, W, #31; \

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr tmp0, tmp0, W; \
	vshr.u32 W, tmp1, #30; \

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp1, tmp1, #2; \

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp0, tmp0, W; \

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, tmp0, tmp1; \

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W, curK; \

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/
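/*
 * Rounds 32-79 use the equivalent recurrence
 * W[i] = rol(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2), obtained by applying
 * the basic recurrence to itself.  It has no dependency inside a group of
 * four consecutive words, so all four lanes are expanded in parallel
 * without a fix-up step.
 */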

#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8 tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshr.u32 tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0}, [RWK];


/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 */
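/*
 * Illustrative caller sketch (not part of this file): a minimal example of
 * driving this routine from C glue code.  The helper name below is
 * hypothetical; the kernel types (u8/u32) and kernel_neon_begin()/
 * kernel_neon_end() from <asm/neon.h>, which must bracket any in-kernel use
 * of NEON registers, are assumed.
 *
 *	#include <asm/neon.h>
 *
 *	asmlinkage unsigned int sha1_transform_neon(void *ctx,
 *						    const unsigned char *data,
 *						    unsigned int nblks);
 *
 *	static void example_sha1_blocks(u32 state[5], const u8 *data,
 *					unsigned int nblks)
 *	{
 *		// state[] holds h0..h4 in the state_h0..state_h4 layout above
 *		kernel_neon_begin();
 *		sha1_transform_neon(state, data, nblks);
 *		kernel_neon_end();
 *	}
 */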
.align 3
ENTRY(sha1_transform_neon)
	/* input:
	 *	r0: ctx, CTX
	 *	r1: data (64*nblks bytes)
	 *	r2: nblks
	 */

	cmp RNBLKS, #0;
	beq .Ldo_nothing;

	push {r4-r12, lr};
	/*vpush {q4-q7};*/

	adr RT3, .LK_VEC;

	mov ROLDSTACK, sp;

	/* Reserve 16*4 bytes of W[i]+K workspace and align the stack to 16 bytes. */
	sub RT0, sp, #(16*4);
	and RT0, #(~(16-1));
	mov sp, RT0;

	vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

	/* Get the values of the chaining variables. */
	ldm RSTATE, {_a-_e};

	vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

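	/*
	 * curK names the round-constant vector (qK1..qK4) used by the
	 * message-schedule precalculation; it is re-#defined below as the
	 * precalculated rounds move from the K1 range to the K4 range.
	 */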
#undef curK
#define curK qK1
	/* Precalc 0-15. */
	W_PRECALC_00_15();

.Loop:
	/* Transform 0-15 + Precalc 16-31. */
	_R( _a, _b, _c, _d, _e, F1, 0,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1, 1,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _d, _e, _a, _b, _c, F1, 2,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _c, _d, _e, _a, _b, F1, 3,
	    WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
	    W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
	_R( _b, _c, _d, _e, _a, F1, 4,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1, 5,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1, 6,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _d, _e, _a, _b, _c, F1, 7,
	    WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
	    W3, W4, W5, W6, W7, _, _, _ );

	_R( _c, _d, _e, _a, _b, F1, 8,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _b, _c, _d, _e, _a, F1, 9,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1, 10,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1, 11,
	    WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
	    W2, W3, W4, W5, W6, _, _, _ );

	_R( _d, _e, _a, _b, _c, F1, 12,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _c, _d, _e, _a, _b, F1, 13,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _b, _c, _d, _e, _a, F1, 14,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1, 15,
	    WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
	    W1, W2, W3, W4, W5, _, _, _ );

	/* Transform 16-63 + Precalc 32-79. */
	_R( _e, _a, _b, _c, _d, F1, 16,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _d, _e, _a, _b, _c, F1, 17,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _c, _d, _e, _a, _b, F1, 18,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _b, _c, _d, _e, _a, F1, 19,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);

	_R( _a, _b, _c, _d, _e, F2, 20,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _e, _a, _b, _c, _d, F2, 21,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _d, _e, _a, _b, _c, F2, 22,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _c, _d, _e, _a, _b, F2, 23,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
	_R( _b, _c, _d, _e, _a, F2, 24,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _a, _b, _c, _d, _e, F2, 25,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _e, _a, _b, _c, _d, F2, 26,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _d, _e, _a, _b, _c, F2, 27,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);

	_R( _c, _d, _e, _a, _b, F2, 28,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _b, _c, _d, _e, _a, F2, 29,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _a, _b, _c, _d, _e, F2, 30,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _e, _a, _b, _c, _d, F2, 31,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);

	_R( _d, _e, _a, _b, _c, F2, 32,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _c, _d, _e, _a, _b, F2, 33,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _b, _c, _d, _e, _a, F2, 34,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _a, _b, _c, _d, _e, F2, 35,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);

	_R( _e, _a, _b, _c, _d, F2, 36,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _d, _e, _a, _b, _c, F2, 37,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _c, _d, _e, _a, _b, F2, 38,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _b, _c, _d, _e, _a, F2, 39,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);

	_R( _a, _b, _c, _d, _e, F3, 40,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _e, _a, _b, _c, _d, F3, 41,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _d, _e, _a, _b, _c, F3, 42,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _c, _d, _e, _a, _b, F3, 43,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
	_R( _b, _c, _d, _e, _a, F3, 44,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _a, _b, _c, _d, _e, F3, 45,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _e, _a, _b, _c, _d, F3, 46,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _d, _e, _a, _b, _c, F3, 47,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);

	_R( _c, _d, _e, _a, _b, F3, 48,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _b, _c, _d, _e, _a, F3, 49,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _a, _b, _c, _d, _e, F3, 50,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _e, _a, _b, _c, _d, F3, 51,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);

	_R( _d, _e, _a, _b, _c, F3, 52,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _c, _d, _e, _a, _b, F3, 53,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _b, _c, _d, _e, _a, F3, 54,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _a, _b, _c, _d, _e, F3, 55,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);

	_R( _e, _a, _b, _c, _d, F3, 56,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _d, _e, _a, _b, _c, F3, 57,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _c, _d, _e, _a, _b, F3, 58,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _b, _c, _d, _e, _a, F3, 59,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);

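	/* Decrement the block count; the Z flag is consumed by "beq .Lend"
	 * after rounds 60-63 below. */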
	subs RNBLKS, #1;

	_R( _a, _b, _c, _d, _e, F4, 60,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _e, _a, _b, _c, _d, F4, 61,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _d, _e, _a, _b, _c, F4, 62,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _c, _d, _e, _a, _b, F4, 63,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);

	beq .Lend;

	/* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
	_R( _b, _c, _d, _e, _a, F4, 64,
	    WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 65,
	    WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _e, _a, _b, _c, _d, F4, 66,
	    WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _d, _e, _a, _b, _c, F4, 67,
	    WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _c, _d, _e, _a, _b, F4, 68,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 69,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 70,
	    WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _e, _a, _b, _c, _d, F4, 71,
	    WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _d, _e, _a, _b, _c, F4, 72,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _c, _d, _e, _a, _b, F4, 73,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 74,
	    WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 75,
	    WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _e, _a, _b, _c, _d, F4, 76,
	    WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _d, _e, _a, _b, _c, F4, 77,
	    WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _c, _d, _e, _a, _b, F4, 78,
	    WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 79,
	    WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

	/* Update the chaining variables. */
	ldm RSTATE, {RT0-RT3};
	add _a, RT0;
	ldr RT0, [RSTATE, #state_h4];
	add _b, RT1;
	add _c, RT2;
	add _d, RT3;
	add _e, RT0;
	stm RSTATE, {_a-_e};

	b .Loop;

.Lend:
	/* Transform 64-79 */
	R( _b, _c, _d, _e, _a, F4, 64 );
	R( _a, _b, _c, _d, _e, F4, 65 );
	R( _e, _a, _b, _c, _d, F4, 66 );
	R( _d, _e, _a, _b, _c, F4, 67 );
	R( _c, _d, _e, _a, _b, F4, 68 );
	R( _b, _c, _d, _e, _a, F4, 69 );
	R( _a, _b, _c, _d, _e, F4, 70 );
	R( _e, _a, _b, _c, _d, F4, 71 );
	R( _d, _e, _a, _b, _c, F4, 72 );
	R( _c, _d, _e, _a, _b, F4, 73 );
	R( _b, _c, _d, _e, _a, F4, 74 );
	R( _a, _b, _c, _d, _e, F4, 75 );
	R( _e, _a, _b, _c, _d, F4, 76 );
	R( _d, _e, _a, _b, _c, F4, 77 );
	R( _c, _d, _e, _a, _b, F4, 78 );
	R( _b, _c, _d, _e, _a, F4, 79 );

	mov sp, ROLDSTACK;

	/* Update the chaining variables. */
	ldm RSTATE, {RT0-RT3};
	add _a, RT0;
	ldr RT0, [RSTATE, #state_h4];
	add _b, RT1;
	add _c, RT2;
	add _d, RT3;
	/*vpop {q4-q7};*/
	add _e, RT0;
	stm RSTATE, {_a-_e};

	pop {r4-r12, pc};

.Ldo_nothing:
	bx lr
ENDPROC(sha1_transform_neon)