/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include <linux/linkage.h>


.syntax unified
.code 32
.fpu neon

.text

/* structure of SHA512_CONTEXT */
#define hd_a 0
#define hd_b ((hd_a) + 8)
#define hd_c ((hd_b) + 8)
#define hd_d ((hd_c) + 8)
#define hd_e ((hd_d) + 8)
#define hd_f ((hd_e) + 8)
#define hd_g ((hd_f) + 8)

/* register macros */
#define RK %r2

#define RA d0
#define RB d1
#define RC d2
#define RD d3
#define RE d4
#define RF d5
#define RG d6
#define RH d7

#define RT0 d8
#define RT1 d9
#define RT2 d10
#define RT3 d11
#define RT4 d12
#define RT5 d13
#define RT6 d14
#define RT7 d15

#define RT01q q4
#define RT23q q5
#define RT45q q6
#define RT67q q7

#define RW0 d16
#define RW1 d17
#define RW2 d18
#define RW3 d19
#define RW4 d20
#define RW5 d21
#define RW6 d22
#define RW7 d23
#define RW8 d24
#define RW9 d25
#define RW10 d26
#define RW11 d27
#define RW12 d28
#define RW13 d29
#define RW14 d30
#define RW15 d31

#define RW01q q8
#define RW23q q9
#define RW45q q10
#define RW67q q11
#define RW89q q12
#define RW1011q q13
#define RW1213q q14
#define RW1415q q15

/***********************************************************************
 * ARM assembly implementation of sha512 transform
 ***********************************************************************/
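/*
 * For reference (FIPS 180-4), the per-round functions computed below are:
 *
 *   Ch(e, f, g)  = (e & f) ^ (~e & g)            (vbsl with e as selector)
 *   Maj(a, b, c) = (a & b) ^ (a & c) ^ (b & c)   (vbsl with a ^ b as selector)
 *   Sum0(a) = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39)
 *   Sum1(e) = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41)
 *   S0(w)   = ror64(w, 1)  ^ ror64(w, 8)  ^ (w >> 7)
 *   S1(w)   = ror64(w, 19) ^ ror64(w, 61) ^ (w >> 6)
 *
 * NEON has no 64-bit rotate, so each ror64 is built from a vshr.u64/vshl.u64
 * pair combined with veor. The rounds2_0_63 macro below performs two
 * consecutive rounds and, interleaved with them, the message-schedule update
 * of two w[] words.
 */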
#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, \
                     rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
        /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
        vshr.u64 RT2, re, #14; \
        vshl.u64 RT3, re, #64 - 14; \
        interleave_op(arg1); \
        vshr.u64 RT4, re, #18; \
        vshl.u64 RT5, re, #64 - 18; \
        vld1.64 {RT0}, [RK]!; \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, re, #41; \
        vshl.u64 RT5, re, #64 - 41; \
        vadd.u64 RT0, RT0, rw0; \
        veor.64 RT23q, RT23q, RT45q; \
        vmov.64 RT7, re; \
        veor.64 RT1, RT2, RT3; \
        vbsl.64 RT7, rf, rg; \
        \
        vadd.u64 RT1, RT1, rh; \
        vshr.u64 RT2, ra, #28; \
        vshl.u64 RT3, ra, #64 - 28; \
        vadd.u64 RT1, RT1, RT0; \
        vshr.u64 RT4, ra, #34; \
        vshl.u64 RT5, ra, #64 - 34; \
        vadd.u64 RT1, RT1, RT7; \
        \
        /* h = Sum0 (a) + Maj (a, b, c); */ \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, ra, #39; \
        vshl.u64 RT5, ra, #64 - 39; \
        veor.64 RT0, ra, rb; \
        veor.64 RT23q, RT23q, RT45q; \
        vbsl.64 RT0, rc, rb; \
        vadd.u64 rd, rd, RT1; /* d+=t1; */ \
        veor.64 rh, RT2, RT3; \
        \
        /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
        vshr.u64 RT2, rd, #14; \
        vshl.u64 RT3, rd, #64 - 14; \
        vadd.u64 rh, rh, RT0; \
        vshr.u64 RT4, rd, #18; \
        vshl.u64 RT5, rd, #64 - 18; \
        vadd.u64 rh, rh, RT1; /* h+=t1; */ \
        vld1.64 {RT0}, [RK]!; \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, rd, #41; \
        vshl.u64 RT5, rd, #64 - 41; \
        vadd.u64 RT0, RT0, rw1; \
        veor.64 RT23q, RT23q, RT45q; \
        vmov.64 RT7, rd; \
        veor.64 RT1, RT2, RT3; \
        vbsl.64 RT7, re, rf; \
        \
        vadd.u64 RT1, RT1, rg; \
        vshr.u64 RT2, rh, #28; \
        vshl.u64 RT3, rh, #64 - 28; \
        vadd.u64 RT1, RT1, RT0; \
        vshr.u64 RT4, rh, #34; \
        vshl.u64 RT5, rh, #64 - 34; \
        vadd.u64 RT1, RT1, RT7; \
        \
        /* g = Sum0 (h) + Maj (h, a, b); */ \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, rh, #39; \
        vshl.u64 RT5, rh, #64 - 39; \
        veor.64 RT0, rh, ra; \
        veor.64 RT23q, RT23q, RT45q; \
        vbsl.64 RT0, rb, ra; \
        vadd.u64 rc, rc, RT1; /* c+=t1; */ \
        veor.64 rg, RT2, RT3; \
        \
        /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
        /* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
        \
        /**** S0(w[1:2]) */ \
        \
        /* w[0:1] += w[9:10] */ \
        /* RT23q = rw1:rw2 */ \
        vext.u64 RT23q, rw01q, rw23q, #1; \
        vadd.u64 rw0, rw9; \
        vadd.u64 rg, rg, RT0; \
        vadd.u64 rw1, rw10; \
        vadd.u64 rg, rg, RT1; /* g+=t1; */ \
        \
        vshr.u64 RT45q, RT23q, #1; \
        vshl.u64 RT67q, RT23q, #64 - 1; \
        vshr.u64 RT01q, RT23q, #8; \
        veor.u64 RT45q, RT45q, RT67q; \
        vshl.u64 RT67q, RT23q, #64 - 8; \
        veor.u64 RT45q, RT45q, RT01q; \
        vshr.u64 RT01q, RT23q, #7; \
        veor.u64 RT45q, RT45q, RT67q; \
        \
        /**** S1(w[14:15]) */ \
        vshr.u64 RT23q, rw1415q, #6; \
        veor.u64 RT01q, RT01q, RT45q; \
        vshr.u64 RT45q, rw1415q, #19; \
        vshl.u64 RT67q, rw1415q, #64 - 19; \
        veor.u64 RT23q, RT23q, RT45q; \
        vshr.u64 RT45q, rw1415q, #61; \
        veor.u64 RT23q, RT23q, RT67q; \
        vshl.u64 RT67q, rw1415q, #64 - 61; \
        veor.u64 RT23q, RT23q, RT45q; \
        vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
        veor.u64 RT01q, RT23q, RT67q;
#define vadd_RT01q(rw01q) \
        /* w[0:1] += S(w[14:15]) */ \
        vadd.u64 rw01q, RT01q;

#define dummy(_) /*_*/

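/*
 * rounds2_64_79: the same two-round body as rounds2_0_63, but without the
 * message-schedule update, since w[t] is already final for t = 64..79. The
 * final two additions to 'g' are not done here; they are deferred through the
 * interleave_op hooks (see the helper macros after the body).
 */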
#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, \
                      interleave_op1, arg1, interleave_op2, arg2) \
        /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
        vshr.u64 RT2, re, #14; \
        vshl.u64 RT3, re, #64 - 14; \
        interleave_op1(arg1); \
        vshr.u64 RT4, re, #18; \
        vshl.u64 RT5, re, #64 - 18; \
        interleave_op2(arg2); \
        vld1.64 {RT0}, [RK]!; \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, re, #41; \
        vshl.u64 RT5, re, #64 - 41; \
        vadd.u64 RT0, RT0, rw0; \
        veor.64 RT23q, RT23q, RT45q; \
        vmov.64 RT7, re; \
        veor.64 RT1, RT2, RT3; \
        vbsl.64 RT7, rf, rg; \
        \
        vadd.u64 RT1, RT1, rh; \
        vshr.u64 RT2, ra, #28; \
        vshl.u64 RT3, ra, #64 - 28; \
        vadd.u64 RT1, RT1, RT0; \
        vshr.u64 RT4, ra, #34; \
        vshl.u64 RT5, ra, #64 - 34; \
        vadd.u64 RT1, RT1, RT7; \
        \
        /* h = Sum0 (a) + Maj (a, b, c); */ \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, ra, #39; \
        vshl.u64 RT5, ra, #64 - 39; \
        veor.64 RT0, ra, rb; \
        veor.64 RT23q, RT23q, RT45q; \
        vbsl.64 RT0, rc, rb; \
        vadd.u64 rd, rd, RT1; /* d+=t1; */ \
        veor.64 rh, RT2, RT3; \
        \
        /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
        vshr.u64 RT2, rd, #14; \
        vshl.u64 RT3, rd, #64 - 14; \
        vadd.u64 rh, rh, RT0; \
        vshr.u64 RT4, rd, #18; \
        vshl.u64 RT5, rd, #64 - 18; \
        vadd.u64 rh, rh, RT1; /* h+=t1; */ \
        vld1.64 {RT0}, [RK]!; \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, rd, #41; \
        vshl.u64 RT5, rd, #64 - 41; \
        vadd.u64 RT0, RT0, rw1; \
        veor.64 RT23q, RT23q, RT45q; \
        vmov.64 RT7, rd; \
        veor.64 RT1, RT2, RT3; \
        vbsl.64 RT7, re, rf; \
        \
        vadd.u64 RT1, RT1, rg; \
        vshr.u64 RT2, rh, #28; \
        vshl.u64 RT3, rh, #64 - 28; \
        vadd.u64 RT1, RT1, RT0; \
        vshr.u64 RT4, rh, #34; \
        vshl.u64 RT5, rh, #64 - 34; \
        vadd.u64 RT1, RT1, RT7; \
        \
        /* g = Sum0 (h) + Maj (h, a, b); */ \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, rh, #39; \
        vshl.u64 RT5, rh, #64 - 39; \
        veor.64 RT0, rh, ra; \
        veor.64 RT23q, RT23q, RT45q; \
        vbsl.64 RT0, rb, ra; \
        vadd.u64 rc, rc, RT1; /* c+=t1; */ \
        veor.64 rg, RT2, RT3;
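/*
 * Deferred tail of rounds2_64_79: these helpers are passed as interleave ops
 * to the following invocation (or invoked directly after the last round pair)
 * so that the previous pair's final additions to 'g' overlap with the next
 * pair's shifts and rotates.
 */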
#define vadd_rg_RT0(rg) \
        vadd.u64 rg, rg, RT0;
#define vadd_rg_RT1(rg) \
        vadd.u64 rg, rg, RT1; /* g+=t1; */

.align 3
ENTRY(sha512_transform_neon)
        /* Input:
         *      %r0: SHA512_CONTEXT
         *      %r1: data
         *      %r2: u64 k[] constants
         *      %r3: nblks
         */
        push {%lr};

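        /* %lr (saved on the stack above) is reused as the round counter. */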
        mov %lr, #0;

        /* Load context to d0-d7 */
        vld1.64 {RA-RD}, [%r0]!;
        vld1.64 {RE-RH}, [%r0];
        sub %r0, #(4*8);

        /* Load input to w[16], d16-d31 */
        /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
        vld1.64 {RW0-RW3}, [%r1]!;
        vld1.64 {RW4-RW7}, [%r1]!;
        vld1.64 {RW8-RW11}, [%r1]!;
        vld1.64 {RW12-RW15}, [%r1]!;
#ifdef __ARMEL__
        /* byteswap */
        vrev64.8 RW01q, RW01q;
        vrev64.8 RW23q, RW23q;
        vrev64.8 RW45q, RW45q;
        vrev64.8 RW67q, RW67q;
        vrev64.8 RW89q, RW89q;
        vrev64.8 RW1011q, RW1011q;
        vrev64.8 RW1213q, RW1213q;
        vrev64.8 RW1415q, RW1415q;
#endif

        /* EABI says that d8-d15 must be preserved by callee. */
        /*vpush {RT0-RT7};*/

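        /*
         * Main loop: each pass through .Loop_rounds runs 16 rounds (eight
         * rounds2_0_63 expansions); %lr counts the rounds and the loop ends
         * after 64. The remaining 16 rounds (t = 64..79) follow below using
         * rounds2_64_79, interleaved with loading and byte-swapping the next
         * block when one remains.
         */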
.Loop:
        rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
                     RW23q, RW1415q, RW9, RW10, dummy, _);
        b .Lenter_rounds;

.Loop_rounds:
        rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
                     RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
.Lenter_rounds:
        rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4,
                     RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
        rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6,
                     RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
        rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8,
                     RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
        rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10,
                     RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
        rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12,
                     RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
        add %lr, #16;
        rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14,
                     RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
        cmp %lr, #64;
        rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0,
                     RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
        bne .Loop_rounds;

        subs %r3, #1;
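        /*
         * The flags from the subs above survive the next two round-pair
         * macros (NEON arithmetic does not touch the ARM condition flags)
         * and are consumed by the "beq .Lhandle_tail" below.
         */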

        rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1,
                      vadd_RT01q, RW1415q, dummy, _);
        rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3,
                      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
        beq .Lhandle_tail;
        vld1.64 {RW0-RW3}, [%r1]!;
        rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
                      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
                      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
        vrev64.8 RW01q, RW01q;
        vrev64.8 RW23q, RW23q;
#endif
        vld1.64 {RW4-RW7}, [%r1]!;
        rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
                      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
        rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
                      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
#ifdef __ARMEL__
        vrev64.8 RW45q, RW45q;
        vrev64.8 RW67q, RW67q;
#endif
        vld1.64 {RW8-RW11}, [%r1]!;
        rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
                      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
                      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
        vrev64.8 RW89q, RW89q;
        vrev64.8 RW1011q, RW1011q;
#endif
        vld1.64 {RW12-RW15}, [%r1]!;
        vadd_rg_RT0(RA);
        vadd_rg_RT1(RA);

        /* Load context */
        vld1.64 {RT0-RT3}, [%r0]!;
        vld1.64 {RT4-RT7}, [%r0];
        sub %r0, #(4*8);

#ifdef __ARMEL__
        vrev64.8 RW1213q, RW1213q;
        vrev64.8 RW1415q, RW1415q;
#endif

        vadd.u64 RA, RT0;
        vadd.u64 RB, RT1;
        vadd.u64 RC, RT2;
        vadd.u64 RD, RT3;
        vadd.u64 RE, RT4;
        vadd.u64 RF, RT5;
        vadd.u64 RG, RT6;
        vadd.u64 RH, RT7;

        /* Store the first half of context */
        vst1.64 {RA-RD}, [%r0]!;
        sub RK, $(8*80);
        vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
        mov %lr, #0;
        sub %r0, #(4*8);

        b .Loop;

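        /*
         * Last block: finish the remaining rounds without fetching new input,
         * add the saved state back in, store the updated digest, and clear
         * the NEON registers that held message and state data.
         */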
.Lhandle_tail:
        rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
                      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
                      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
        rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
                      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
        rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
                      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
        rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
                      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
                      vadd_rg_RT0, RC, vadd_rg_RT1, RC);

        /* Load context to d16-d23 */
        vld1.64 {RW0-RW3}, [%r0]!;
        vadd_rg_RT0(RA);
        vld1.64 {RW4-RW7}, [%r0];
        vadd_rg_RT1(RA);
        sub %r0, #(4*8);

        vadd.u64 RA, RW0;
        vadd.u64 RB, RW1;
        vadd.u64 RC, RW2;
        vadd.u64 RD, RW3;
        vadd.u64 RE, RW4;
        vadd.u64 RF, RW5;
        vadd.u64 RG, RW6;
        vadd.u64 RH, RW7;

        /* Store the first half of context */
        vst1.64 {RA-RD}, [%r0]!;

        /* Clear used registers */
        /* d16-d31 */
        veor.u64 RW01q, RW01q;
        veor.u64 RW23q, RW23q;
        veor.u64 RW45q, RW45q;
        veor.u64 RW67q, RW67q;
        vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
        veor.u64 RW89q, RW89q;
        veor.u64 RW1011q, RW1011q;
        veor.u64 RW1213q, RW1213q;
        veor.u64 RW1415q, RW1415q;
        /* d8-d15 */
        /*vpop {RT0-RT7};*/
        /* d0-d7 (q0-q3) */
        veor.u64 %q0, %q0;
        veor.u64 %q1, %q1;
        veor.u64 %q2, %q2;
        veor.u64 %q3, %q3;

        pop {%pc};
ENDPROC(sha512_transform_neon)