#include <linux/linkage.h>
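
# x86_64 implementation of the Salsa20 stream cipher, in qhasm style:
# each instruction is annotated with the pseudo-C statement it implements.
# The code appears to be derived from D. J. Bernstein's public-domain
# amd64 Salsa20 implementation. Arguments arrive per the System V AMD64
# ABI in %rdi, %rsi, %rdx, %rcx.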

# enter salsa20_encrypt_bytes
ENTRY(salsa20_encrypt_bytes)
	mov %rsp,%r11
	and $31,%r11
	add $256,%r11
	sub %r11,%rsp
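	# %r11 = (original %rsp & 31) + 256: subtracting it 32-byte-aligns
	# the stack and reserves 256 bytes of scratch space; the epilogue
	# undoes this with "add %r11,%rsp" instead of using a frame pointer.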
	# x = arg1
	mov %rdi,%r8
	# m = arg2
	mov %rsi,%rsi
	# out = arg3
	mov %rdx,%rdi
	# bytes = arg4
	mov %rcx,%rdx
	# unsigned>? bytes - 0
	cmp $0,%rdx
	# comment:fp stack unchanged by jump
	# goto done if !unsigned>
	jbe ._done
	# comment:fp stack unchanged by fallthrough
	# start:
._start:
	# r11_stack = r11
	movq %r11,0(%rsp)
	# r12_stack = r12
	movq %r12,8(%rsp)
	# r13_stack = r13
	movq %r13,16(%rsp)
	# r14_stack = r14
	movq %r14,24(%rsp)
	# r15_stack = r15
	movq %r15,32(%rsp)
	# rbx_stack = rbx
	movq %rbx,40(%rsp)
	# rbp_stack = rbp
	movq %rbp,48(%rsp)
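	# The callee-saved registers above are spilled into the scratch
	# area (offsets 0-48) rather than pushed, since %rsp has already
	# been adjusted by the prologue.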
	# in0 = *(uint64 *) (x + 0)
	movq 0(%r8),%rcx
	# in2 = *(uint64 *) (x + 8)
	movq 8(%r8),%r9
	# in4 = *(uint64 *) (x + 16)
	movq 16(%r8),%rax
	# in6 = *(uint64 *) (x + 24)
	movq 24(%r8),%r10
	# in8 = *(uint64 *) (x + 32)
	movq 32(%r8),%r11
	# in10 = *(uint64 *) (x + 40)
	movq 40(%r8),%r12
	# in12 = *(uint64 *) (x + 48)
	movq 48(%r8),%r13
	# in14 = *(uint64 *) (x + 56)
	movq 56(%r8),%r14
	# j0 = in0
	movq %rcx,56(%rsp)
	# j2 = in2
	movq %r9,64(%rsp)
	# j4 = in4
	movq %rax,72(%rsp)
	# j6 = in6
	movq %r10,80(%rsp)
	# j8 = in8
	movq %r11,88(%rsp)
	# j10 = in10
	movq %r12,96(%rsp)
	# j12 = in12
	movq %r13,104(%rsp)
	# j14 = in14
	movq %r14,112(%rsp)
	# x_backup = x
	movq %r8,120(%rsp)
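	# The 16-word Salsa20 state is kept as eight 64-bit pairs: jN holds
	# 32-bit words N and N+1. Scratch frame layout: 0-48 saved registers,
	# 56-112 j0-j14, 120 x_backup, 128 ctarget, 136/144/152 out/m/bytes
	# backups, 160/168/176 spilled x5/x10/x15, 184 loop counter,
	# 192-255 the 64-byte tmp block.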
	# bytesatleast1:
._bytesatleast1:
	# unsigned<? bytes - 64
	cmp $64,%rdx
	# comment:fp stack unchanged by jump
	# goto nocopy if !unsigned<
	jae ._nocopy
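	# bytes < 64: copy the short input into the 64-byte tmp buffer so a
	# full block can be read and written safely, and remember the real
	# destination in ctarget for the final copy-back.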
	# ctarget = out
	movq %rdi,128(%rsp)
	# out = &tmp
	leaq 192(%rsp),%rdi
	# i = bytes
	mov %rdx,%rcx
	# while (i) { *out++ = *m++; --i }
	rep movsb
	# out = &tmp
	leaq 192(%rsp),%rdi
	# m = &tmp
	leaq 192(%rsp),%rsi
	# comment:fp stack unchanged by fallthrough
	# nocopy:
._nocopy:
	# out_backup = out
	movq %rdi,136(%rsp)
	# m_backup = m
	movq %rsi,144(%rsp)
	# bytes_backup = bytes
	movq %rdx,152(%rsp)
	# x1 = j0
	movq 56(%rsp),%rdi
	# x0 = x1
	mov %rdi,%rdx
	# (uint64) x1 >>= 32
	shr $32,%rdi
	# x3 = j2
	movq 64(%rsp),%rsi
	# x2 = x3
	mov %rsi,%rcx
	# (uint64) x3 >>= 32
	shr $32,%rsi
	# x5 = j4
	movq 72(%rsp),%r8
	# x4 = x5
	mov %r8,%r9
	# (uint64) x5 >>= 32
	shr $32,%r8
	# x5_stack = x5
	movq %r8,160(%rsp)
	# x7 = j6
	movq 80(%rsp),%r8
	# x6 = x7
	mov %r8,%rax
	# (uint64) x7 >>= 32
	shr $32,%r8
	# x9 = j8
	movq 88(%rsp),%r10
	# x8 = x9
	mov %r10,%r11
	# (uint64) x9 >>= 32
	shr $32,%r10
	# x11 = j10
	movq 96(%rsp),%r12
	# x10 = x11
	mov %r12,%r13
	# x10_stack = x10
	movq %r13,168(%rsp)
	# (uint64) x11 >>= 32
	shr $32,%r12
	# x13 = j12
	movq 104(%rsp),%r13
	# x12 = x13
	mov %r13,%r14
	# (uint64) x13 >>= 32
	shr $32,%r13
	# x15 = j14
	movq 112(%rsp),%r15
	# x14 = x15
	mov %r15,%rbx
	# (uint64) x15 >>= 32
	shr $32,%r15
	# x15_stack = x15
	movq %r15,176(%rsp)
	# i = 20
	mov $20,%r15
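	# Each pass of the main loop performs four Salsa20 rounds (two
	# column rounds followed by two row rounds), so five passes give
	# the full 20 rounds; i counts down from 20 by 4.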
	# mainloop:
._mainloop:
	# i_backup = i
	movq %r15,184(%rsp)
	# x5 = x5_stack
	movq 160(%rsp),%r15
	# a = x12 + x0
	lea (%r14,%rdx),%rbp
	# (uint32) a <<<= 7
	rol $7,%ebp
	# x4 ^= a
	xor %rbp,%r9
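	# The three instructions above are one quarter-round step: lea adds
	# two state words, rol on the 32-bit subregister performs the rotate
	# (and zero-extends %rbp, so the following 64-bit xor only changes
	# the low 32 bits of the target). The same pattern repeats below.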
	# b = x1 + x5
	lea (%rdi,%r15),%rbp
	# (uint32) b <<<= 7
	rol $7,%ebp
	# x9 ^= b
	xor %rbp,%r10
	# a = x0 + x4
	lea (%rdx,%r9),%rbp
	# (uint32) a <<<= 9
	rol $9,%ebp
	# x8 ^= a
	xor %rbp,%r11
	# b = x5 + x9
	lea (%r15,%r10),%rbp
	# (uint32) b <<<= 9
	rol $9,%ebp
	# x13 ^= b
	xor %rbp,%r13
	# a = x4 + x8
	lea (%r9,%r11),%rbp
	# (uint32) a <<<= 13
	rol $13,%ebp
	# x12 ^= a
	xor %rbp,%r14
	# b = x9 + x13
	lea (%r10,%r13),%rbp
	# (uint32) b <<<= 13
	rol $13,%ebp
	# x1 ^= b
	xor %rbp,%rdi
	# a = x8 + x12
	lea (%r11,%r14),%rbp
	# (uint32) a <<<= 18
	rol $18,%ebp
	# x0 ^= a
	xor %rbp,%rdx
	# b = x13 + x1
	lea (%r13,%rdi),%rbp
	# (uint32) b <<<= 18
	rol $18,%ebp
	# x5 ^= b
	xor %rbp,%r15
	# x10 = x10_stack
	movq 168(%rsp),%rbp
	# x5_stack = x5
	movq %r15,160(%rsp)
	# c = x6 + x10
	lea (%rax,%rbp),%r15
	# (uint32) c <<<= 7
	rol $7,%r15d
	# x14 ^= c
	xor %r15,%rbx
	# c = x10 + x14
	lea (%rbp,%rbx),%r15
	# (uint32) c <<<= 9
	rol $9,%r15d
	# x2 ^= c
	xor %r15,%rcx
	# c = x14 + x2
	lea (%rbx,%rcx),%r15
	# (uint32) c <<<= 13
	rol $13,%r15d
	# x6 ^= c
	xor %r15,%rax
	# c = x2 + x6
	lea (%rcx,%rax),%r15
	# (uint32) c <<<= 18
	rol $18,%r15d
	# x10 ^= c
	xor %r15,%rbp
	# x15 = x15_stack
	movq 176(%rsp),%r15
	# x10_stack = x10
	movq %rbp,168(%rsp)
	# d = x11 + x15
	lea (%r12,%r15),%rbp
	# (uint32) d <<<= 7
	rol $7,%ebp
	# x3 ^= d
	xor %rbp,%rsi
	# d = x15 + x3
	lea (%r15,%rsi),%rbp
	# (uint32) d <<<= 9
	rol $9,%ebp
	# x7 ^= d
	xor %rbp,%r8
	# d = x3 + x7
	lea (%rsi,%r8),%rbp
	# (uint32) d <<<= 13
	rol $13,%ebp
	# x11 ^= d
	xor %rbp,%r12
	# d = x7 + x11
	lea (%r8,%r12),%rbp
	# (uint32) d <<<= 18
	rol $18,%ebp
	# x15 ^= d
	xor %rbp,%r15
	# x15_stack = x15
	movq %r15,176(%rsp)
	# x5 = x5_stack
	movq 160(%rsp),%r15
	# a = x3 + x0
	lea (%rsi,%rdx),%rbp
	# (uint32) a <<<= 7
	rol $7,%ebp
	# x1 ^= a
	xor %rbp,%rdi
	# b = x4 + x5
	lea (%r9,%r15),%rbp
	# (uint32) b <<<= 7
	rol $7,%ebp
	# x6 ^= b
	xor %rbp,%rax
	# a = x0 + x1
	lea (%rdx,%rdi),%rbp
	# (uint32) a <<<= 9
	rol $9,%ebp
	# x2 ^= a
	xor %rbp,%rcx
	# b = x5 + x6
	lea (%r15,%rax),%rbp
	# (uint32) b <<<= 9
	rol $9,%ebp
	# x7 ^= b
	xor %rbp,%r8
	# a = x1 + x2
	lea (%rdi,%rcx),%rbp
	# (uint32) a <<<= 13
	rol $13,%ebp
	# x3 ^= a
	xor %rbp,%rsi
	# b = x6 + x7
	lea (%rax,%r8),%rbp
	# (uint32) b <<<= 13
	rol $13,%ebp
	# x4 ^= b
	xor %rbp,%r9
	# a = x2 + x3
	lea (%rcx,%rsi),%rbp
	# (uint32) a <<<= 18
	rol $18,%ebp
	# x0 ^= a
	xor %rbp,%rdx
	# b = x7 + x4
	lea (%r8,%r9),%rbp
	# (uint32) b <<<= 18
	rol $18,%ebp
	# x5 ^= b
	xor %rbp,%r15
	# x10 = x10_stack
	movq 168(%rsp),%rbp
	# x5_stack = x5
	movq %r15,160(%rsp)
	# c = x9 + x10
	lea (%r10,%rbp),%r15
	# (uint32) c <<<= 7
	rol $7,%r15d
	# x11 ^= c
	xor %r15,%r12
	# c = x10 + x11
	lea (%rbp,%r12),%r15
	# (uint32) c <<<= 9
	rol $9,%r15d
	# x8 ^= c
	xor %r15,%r11
	# c = x11 + x8
	lea (%r12,%r11),%r15
	# (uint32) c <<<= 13
	rol $13,%r15d
	# x9 ^= c
	xor %r15,%r10
	# c = x8 + x9
	lea (%r11,%r10),%r15
	# (uint32) c <<<= 18
	rol $18,%r15d
	# x10 ^= c
	xor %r15,%rbp
	# x15 = x15_stack
	movq 176(%rsp),%r15
	# x10_stack = x10
	movq %rbp,168(%rsp)
	# d = x14 + x15
	lea (%rbx,%r15),%rbp
	# (uint32) d <<<= 7
	rol $7,%ebp
	# x12 ^= d
	xor %rbp,%r14
	# d = x15 + x12
	lea (%r15,%r14),%rbp
	# (uint32) d <<<= 9
	rol $9,%ebp
	# x13 ^= d
	xor %rbp,%r13
	# d = x12 + x13
	lea (%r14,%r13),%rbp
	# (uint32) d <<<= 13
	rol $13,%ebp
	# x14 ^= d
	xor %rbp,%rbx
	# d = x13 + x14
	lea (%r13,%rbx),%rbp
	# (uint32) d <<<= 18
	rol $18,%ebp
	# x15 ^= d
	xor %rbp,%r15
	# x15_stack = x15
	movq %r15,176(%rsp)
	# x5 = x5_stack
	movq 160(%rsp),%r15
	# a = x12 + x0
	lea (%r14,%rdx),%rbp
	# (uint32) a <<<= 7
	rol $7,%ebp
	# x4 ^= a
	xor %rbp,%r9
	# b = x1 + x5
	lea (%rdi,%r15),%rbp
	# (uint32) b <<<= 7
	rol $7,%ebp
	# x9 ^= b
	xor %rbp,%r10
	# a = x0 + x4
	lea (%rdx,%r9),%rbp
	# (uint32) a <<<= 9
	rol $9,%ebp
	# x8 ^= a
	xor %rbp,%r11
	# b = x5 + x9
	lea (%r15,%r10),%rbp
	# (uint32) b <<<= 9
	rol $9,%ebp
	# x13 ^= b
	xor %rbp,%r13
	# a = x4 + x8
	lea (%r9,%r11),%rbp
	# (uint32) a <<<= 13
	rol $13,%ebp
	# x12 ^= a
	xor %rbp,%r14
	# b = x9 + x13
	lea (%r10,%r13),%rbp
	# (uint32) b <<<= 13
	rol $13,%ebp
	# x1 ^= b
	xor %rbp,%rdi
	# a = x8 + x12
	lea (%r11,%r14),%rbp
	# (uint32) a <<<= 18
	rol $18,%ebp
	# x0 ^= a
	xor %rbp,%rdx
	# b = x13 + x1
	lea (%r13,%rdi),%rbp
	# (uint32) b <<<= 18
	rol $18,%ebp
	# x5 ^= b
	xor %rbp,%r15
	# x10 = x10_stack
	movq 168(%rsp),%rbp
	# x5_stack = x5
	movq %r15,160(%rsp)
	# c = x6 + x10
	lea (%rax,%rbp),%r15
	# (uint32) c <<<= 7
	rol $7,%r15d
	# x14 ^= c
	xor %r15,%rbx
	# c = x10 + x14
	lea (%rbp,%rbx),%r15
	# (uint32) c <<<= 9
	rol $9,%r15d
	# x2 ^= c
	xor %r15,%rcx
	# c = x14 + x2
	lea (%rbx,%rcx),%r15
	# (uint32) c <<<= 13
	rol $13,%r15d
	# x6 ^= c
	xor %r15,%rax
	# c = x2 + x6
	lea (%rcx,%rax),%r15
	# (uint32) c <<<= 18
	rol $18,%r15d
	# x10 ^= c
	xor %r15,%rbp
	# x15 = x15_stack
	movq 176(%rsp),%r15
	# x10_stack = x10
	movq %rbp,168(%rsp)
	# d = x11 + x15
	lea (%r12,%r15),%rbp
	# (uint32) d <<<= 7
	rol $7,%ebp
	# x3 ^= d
	xor %rbp,%rsi
	# d = x15 + x3
	lea (%r15,%rsi),%rbp
	# (uint32) d <<<= 9
	rol $9,%ebp
	# x7 ^= d
	xor %rbp,%r8
	# d = x3 + x7
	lea (%rsi,%r8),%rbp
	# (uint32) d <<<= 13
	rol $13,%ebp
	# x11 ^= d
	xor %rbp,%r12
	# d = x7 + x11
	lea (%r8,%r12),%rbp
	# (uint32) d <<<= 18
	rol $18,%ebp
	# x15 ^= d
	xor %rbp,%r15
	# x15_stack = x15
	movq %r15,176(%rsp)
	# x5 = x5_stack
	movq 160(%rsp),%r15
	# a = x3 + x0
	lea (%rsi,%rdx),%rbp
	# (uint32) a <<<= 7
	rol $7,%ebp
	# x1 ^= a
	xor %rbp,%rdi
	# b = x4 + x5
	lea (%r9,%r15),%rbp
	# (uint32) b <<<= 7
	rol $7,%ebp
	# x6 ^= b
	xor %rbp,%rax
	# a = x0 + x1
	lea (%rdx,%rdi),%rbp
	# (uint32) a <<<= 9
	rol $9,%ebp
	# x2 ^= a
	xor %rbp,%rcx
	# b = x5 + x6
	lea (%r15,%rax),%rbp
	# (uint32) b <<<= 9
	rol $9,%ebp
	# x7 ^= b
	xor %rbp,%r8
	# a = x1 + x2
	lea (%rdi,%rcx),%rbp
	# (uint32) a <<<= 13
	rol $13,%ebp
	# x3 ^= a
	xor %rbp,%rsi
	# b = x6 + x7
	lea (%rax,%r8),%rbp
	# (uint32) b <<<= 13
	rol $13,%ebp
	# x4 ^= b
	xor %rbp,%r9
	# a = x2 + x3
	lea (%rcx,%rsi),%rbp
	# (uint32) a <<<= 18
	rol $18,%ebp
	# x0 ^= a
	xor %rbp,%rdx
	# b = x7 + x4
	lea (%r8,%r9),%rbp
	# (uint32) b <<<= 18
	rol $18,%ebp
	# x5 ^= b
	xor %rbp,%r15
	# x10 = x10_stack
	movq 168(%rsp),%rbp
	# x5_stack = x5
	movq %r15,160(%rsp)
	# c = x9 + x10
	lea (%r10,%rbp),%r15
	# (uint32) c <<<= 7
	rol $7,%r15d
	# x11 ^= c
	xor %r15,%r12
	# c = x10 + x11
	lea (%rbp,%r12),%r15
	# (uint32) c <<<= 9
	rol $9,%r15d
	# x8 ^= c
	xor %r15,%r11
	# c = x11 + x8
	lea (%r12,%r11),%r15
	# (uint32) c <<<= 13
	rol $13,%r15d
	# x9 ^= c
	xor %r15,%r10
	# c = x8 + x9
	lea (%r11,%r10),%r15
	# (uint32) c <<<= 18
	rol $18,%r15d
	# x10 ^= c
	xor %r15,%rbp
	# x15 = x15_stack
	movq 176(%rsp),%r15
	# x10_stack = x10
	movq %rbp,168(%rsp)
	# d = x14 + x15
	lea (%rbx,%r15),%rbp
	# (uint32) d <<<= 7
	rol $7,%ebp
	# x12 ^= d
	xor %rbp,%r14
	# d = x15 + x12
	lea (%r15,%r14),%rbp
	# (uint32) d <<<= 9
	rol $9,%ebp
	# x13 ^= d
	xor %rbp,%r13
	# d = x12 + x13
	lea (%r14,%r13),%rbp
	# (uint32) d <<<= 13
	rol $13,%ebp
	# x14 ^= d
	xor %rbp,%rbx
	# d = x13 + x14
	lea (%r13,%rbx),%rbp
	# (uint32) d <<<= 18
	rol $18,%ebp
	# x15 ^= d
	xor %rbp,%r15
	# x15_stack = x15
	movq %r15,176(%rsp)
	# i = i_backup
	movq 184(%rsp),%r15
	# unsigned>? i -= 4
	sub $4,%r15
	# comment:fp stack unchanged by jump
	# goto mainloop if unsigned>
	ja ._mainloop
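	# Feedforward: add the saved input words jN back into the working
	# state. Words are packed two per register, so the low word is added
	# with a 32-bit addl, while the high word uses the shl/addq/shr/shl
	# sequence to add without a carry leaking into the low word.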
	# (uint32) x2 += j2
	addl 64(%rsp),%ecx
	# x3 <<= 32
	shl $32,%rsi
	# x3 += j2
	addq 64(%rsp),%rsi
	# (uint64) x3 >>= 32
	shr $32,%rsi
	# x3 <<= 32
	shl $32,%rsi
	# x2 += x3
	add %rsi,%rcx
	# (uint32) x6 += j6
	addl 80(%rsp),%eax
	# x7 <<= 32
	shl $32,%r8
	# x7 += j6
	addq 80(%rsp),%r8
	# (uint64) x7 >>= 32
	shr $32,%r8
	# x7 <<= 32
	shl $32,%r8
	# x6 += x7
	add %r8,%rax
	# (uint32) x8 += j8
	addl 88(%rsp),%r11d
	# x9 <<= 32
	shl $32,%r10
	# x9 += j8
	addq 88(%rsp),%r10
	# (uint64) x9 >>= 32
	shr $32,%r10
	# x9 <<= 32
	shl $32,%r10
	# x8 += x9
	add %r10,%r11
	# (uint32) x12 += j12
	addl 104(%rsp),%r14d
	# x13 <<= 32
	shl $32,%r13
	# x13 += j12
	addq 104(%rsp),%r13
	# (uint64) x13 >>= 32
	shr $32,%r13
	# x13 <<= 32
	shl $32,%r13
	# x12 += x13
	add %r13,%r14
	# (uint32) x0 += j0
	addl 56(%rsp),%edx
	# x1 <<= 32
	shl $32,%rdi
	# x1 += j0
	addq 56(%rsp),%rdi
	# (uint64) x1 >>= 32
	shr $32,%rdi
	# x1 <<= 32
	shl $32,%rdi
	# x0 += x1
	add %rdi,%rdx
	# x5 = x5_stack
	movq 160(%rsp),%rdi
	# (uint32) x4 += j4
	addl 72(%rsp),%r9d
	# x5 <<= 32
	shl $32,%rdi
	# x5 += j4
	addq 72(%rsp),%rdi
	# (uint64) x5 >>= 32
	shr $32,%rdi
	# x5 <<= 32
	shl $32,%rdi
	# x4 += x5
	add %rdi,%r9
	# x10 = x10_stack
	movq 168(%rsp),%r8
	# (uint32) x10 += j10
	addl 96(%rsp),%r8d
	# x11 <<= 32
	shl $32,%r12
	# x11 += j10
	addq 96(%rsp),%r12
	# (uint64) x11 >>= 32
	shr $32,%r12
	# x11 <<= 32
	shl $32,%r12
	# x10 += x11
	add %r12,%r8
	# x15 = x15_stack
	movq 176(%rsp),%rdi
	# (uint32) x14 += j14
	addl 112(%rsp),%ebx
	# x15 <<= 32
	shl $32,%rdi
	# x15 += j14
	addq 112(%rsp),%rdi
	# (uint64) x15 >>= 32
	shr $32,%rdi
	# x15 <<= 32
	shl $32,%rdi
	# x14 += x15
	add %rdi,%rbx
	# out = out_backup
	movq 136(%rsp),%rdi
	# m = m_backup
	movq 144(%rsp),%rsi
	# x0 ^= *(uint64 *) (m + 0)
	xorq 0(%rsi),%rdx
	# *(uint64 *) (out + 0) = x0
	movq %rdx,0(%rdi)
	# x2 ^= *(uint64 *) (m + 8)
	xorq 8(%rsi),%rcx
	# *(uint64 *) (out + 8) = x2
	movq %rcx,8(%rdi)
	# x4 ^= *(uint64 *) (m + 16)
	xorq 16(%rsi),%r9
	# *(uint64 *) (out + 16) = x4
	movq %r9,16(%rdi)
	# x6 ^= *(uint64 *) (m + 24)
	xorq 24(%rsi),%rax
	# *(uint64 *) (out + 24) = x6
	movq %rax,24(%rdi)
	# x8 ^= *(uint64 *) (m + 32)
	xorq 32(%rsi),%r11
	# *(uint64 *) (out + 32) = x8
	movq %r11,32(%rdi)
	# x10 ^= *(uint64 *) (m + 40)
	xorq 40(%rsi),%r8
	# *(uint64 *) (out + 40) = x10
	movq %r8,40(%rdi)
	# x12 ^= *(uint64 *) (m + 48)
	xorq 48(%rsi),%r14
	# *(uint64 *) (out + 48) = x12
	movq %r14,48(%rdi)
	# x14 ^= *(uint64 *) (m + 56)
	xorq 56(%rsi),%rbx
	# *(uint64 *) (out + 56) = x14
	movq %rbx,56(%rdi)
	# bytes = bytes_backup
	movq 152(%rsp),%rdx
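	# j8 holds state words 8 and 9, the 64-bit block counter; bump it
	# once per 64-byte block.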
	# in8 = j8
	movq 88(%rsp),%rcx
	# in8 += 1
	add $1,%rcx
	# j8 = in8
	movq %rcx,88(%rsp)
	# unsigned>? unsigned<? bytes - 64
	cmp $64,%rdx
	# comment:fp stack unchanged by jump
	# goto bytesatleast65 if unsigned>
	ja ._bytesatleast65
	# comment:fp stack unchanged by jump
	# goto bytesatleast64 if !unsigned<
	jae ._bytesatleast64
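	# bytes < 64: this was the final partial block, generated in tmp;
	# copy just the requested bytes back to the real destination saved
	# in ctarget.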
	# m = out
	mov %rdi,%rsi
	# out = ctarget
	movq 128(%rsp),%rdi
	# i = bytes
	mov %rdx,%rcx
	# while (i) { *out++ = *m++; --i }
	rep movsb
	# comment:fp stack unchanged by fallthrough
	# bytesatleast64:
._bytesatleast64:
	# x = x_backup
	movq 120(%rsp),%rdi
	# in8 = j8
	movq 88(%rsp),%rsi
	# *(uint64 *) (x + 32) = in8
	movq %rsi,32(%rdi)
	# r11 = r11_stack
	movq 0(%rsp),%r11
	# r12 = r12_stack
	movq 8(%rsp),%r12
	# r13 = r13_stack
	movq 16(%rsp),%r13
	# r14 = r14_stack
	movq 24(%rsp),%r14
	# r15 = r15_stack
	movq 32(%rsp),%r15
	# rbx = rbx_stack
	movq 40(%rsp),%rbx
	# rbp = rbp_stack
	movq 48(%rsp),%rbp
	# comment:fp stack unchanged by fallthrough
	# done:
._done:
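	# Epilogue: restore %rsp by adding back the adjustment kept in %r11.
	# The two movs below are part of the generated calling convention;
	# the kernel's C callers appear to treat these functions as void,
	# so the values left in %rax/%rdx are unused.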
	# leave
	add %r11,%rsp
	mov %rdi,%rax
	mov %rsi,%rdx
	ret
	# bytesatleast65:
._bytesatleast65:
	# bytes -= 64
	sub $64,%rdx
	# out += 64
	add $64,%rdi
	# m += 64
	add $64,%rsi
	# comment:fp stack unchanged by jump
	# goto bytesatleast1
	jmp ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)

# enter salsa20_keysetup
ENTRY(salsa20_keysetup)
	mov %rsp,%r11
	and $31,%r11
	add $256,%r11
	sub %r11,%rsp
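	# salsa20_keysetup(x, k, kbits): copies the key into state words
	# 1-4 (byte offsets 4 and 12) and 11-14 (offsets 44 and 52), and
	# fills words 0/5/10/15 with the diagonal constants. The no-op moves
	# below are register-allocation artifacts of the code generator.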
	# k = arg2
	mov %rsi,%rsi
	# kbits = arg3
	mov %rdx,%rdx
	# x = arg1
	mov %rdi,%rdi
	# in0 = *(uint64 *) (k + 0)
	movq 0(%rsi),%r8
	# in2 = *(uint64 *) (k + 8)
	movq 8(%rsi),%r9
	# *(uint64 *) (x + 4) = in0
	movq %r8,4(%rdi)
	# *(uint64 *) (x + 12) = in2
	movq %r9,12(%rdi)
	# unsigned<? kbits - 256
	cmp $256,%rdx
	# comment:fp stack unchanged by jump
	# goto kbits128 if unsigned<
	jb ._kbits128
	# kbits256:
._kbits256:
	# in10 = *(uint64 *) (k + 16)
	movq 16(%rsi),%rdx
	# in12 = *(uint64 *) (k + 24)
	movq 24(%rsi),%rsi
	# *(uint64 *) (x + 44) = in10
	movq %rdx,44(%rdi)
	# *(uint64 *) (x + 52) = in12
	movq %rsi,52(%rdi)
	# in0 = 1634760805
	mov $1634760805,%rsi
	# in4 = 857760878
	mov $857760878,%rdx
	# in10 = 2036477234
	mov $2036477234,%rcx
	# in14 = 1797285236
	mov $1797285236,%r8
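	# These four constants are the sigma words "expa", "nd 3", "2-by",
	# "te k" ("expand 32-byte k"), used with 256-bit keys.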
	# *(uint32 *) (x + 0) = in0
	movl %esi,0(%rdi)
	# *(uint32 *) (x + 20) = in4
	movl %edx,20(%rdi)
	# *(uint32 *) (x + 40) = in10
	movl %ecx,40(%rdi)
	# *(uint32 *) (x + 60) = in14
	movl %r8d,60(%rdi)
	# comment:fp stack unchanged by jump
	# goto keysetupdone
	jmp ._keysetupdone
	# kbits128:
._kbits128:
	# in10 = *(uint64 *) (k + 0)
	movq 0(%rsi),%rdx
	# in12 = *(uint64 *) (k + 8)
	movq 8(%rsi),%rsi
	# *(uint64 *) (x + 44) = in10
	movq %rdx,44(%rdi)
	# *(uint64 *) (x + 52) = in12
	movq %rsi,52(%rdi)
	# in0 = 1634760805
	mov $1634760805,%rsi
	# in4 = 824206446
	mov $824206446,%rdx
	# in10 = 2036477238
	mov $2036477238,%rcx
	# in14 = 1797285236
	mov $1797285236,%r8
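	# 128-bit keys repeat the 16-byte key in both key regions of the
	# state and use the tau words "expa", "nd 1", "6-by", "te k"
	# ("expand 16-byte k").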
	# *(uint32 *) (x + 0) = in0
	movl %esi,0(%rdi)
	# *(uint32 *) (x + 20) = in4
	movl %edx,20(%rdi)
	# *(uint32 *) (x + 40) = in10
	movl %ecx,40(%rdi)
	# *(uint32 *) (x + 60) = in14
	movl %r8d,60(%rdi)
	# keysetupdone:
._keysetupdone:
	# leave
	add %r11,%rsp
	mov %rdi,%rax
	mov %rsi,%rdx
	ret
ENDPROC(salsa20_keysetup)

# enter salsa20_ivsetup
ENTRY(salsa20_ivsetup)
	mov %rsp,%r11
	and $31,%r11
	add $256,%r11
	sub %r11,%rsp
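	# salsa20_ivsetup(x, iv): stores the 8-byte IV in state words 6-7
	# (byte offset 24) and zeroes the 64-bit block counter in words 8-9
	# (offset 32).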
	# iv = arg2
	mov %rsi,%rsi
	# x = arg1
	mov %rdi,%rdi
	# in6 = *(uint64 *) (iv + 0)
	movq 0(%rsi),%rsi
	# in8 = 0
	mov $0,%r8
	# *(uint64 *) (x + 24) = in6
	movq %rsi,24(%rdi)
	# *(uint64 *) (x + 32) = in8
	movq %r8,32(%rdi)
	# leave
	add %r11,%rsp
	mov %rdi,%rax
	mov %rsi,%rdx
	ret
ENDPROC(salsa20_ivsetup)