/*
 * Multi-buffer SHA256 algorithm hash compute routine
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2016 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *     Megha Dey <megha.dey@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2016 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>
#include "sha256_mb_mgr_datastruct.S"

## code to compute oct (8-lane) SHA256 using AVX2
## outer calling routine takes care of save and restore of YMM registers
## Logic designed/laid out by JDG

## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; %ymm0-15
## Linux clobbers:    rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
## Linux preserves:   rdi rbp r8
##
## clobbers %ymm0-15
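##
## Each %ymm register holds one SHA-256 working variable (a..h) for eight
## independent message streams, one 32-bit lane per stream, so every vector
## instruction below advances all eight hashes in lock-step.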

arg1 = %rdi
arg2 = %rsi
reg3 = %rcx
reg4 = %rdx

# Common definitions
STATE = arg1
INP_SIZE = arg2

IDX = %rax
ROUND = %rbx
TBL = reg3

inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = reg4

a = %ymm0
b = %ymm1
c = %ymm2
d = %ymm3
e = %ymm4
f = %ymm5
g = %ymm6
h = %ymm7

T1 = %ymm8

a0 = %ymm12
a1 = %ymm13
a2 = %ymm14
TMP = %ymm15
TMP0 = %ymm6
TMP1 = %ymm7

TT0 = %ymm8
TT1 = %ymm9
TT2 = %ymm10
TT3 = %ymm11
TT4 = %ymm12
TT5 = %ymm13
TT6 = %ymm14
TT7 = %ymm15

# Define stack usage

# %rsp is explicitly realigned to 32 bytes after the frame is allocated
# (see the "and $~0x1F, %rsp" in the prologue), so FRAMESZ only needs to
# be large enough for the locals (W ring buffer, saved digest, _ytmp
# scratch, saved %rsp) plus up to 31 bytes of alignment slack.

#define FRAMESZ 0x388

#define VMOVPS vmovups
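# Unaligned loads are used for the message lanes because the eight input
# buffers handed to us are not necessarily 32-byte aligned.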

# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
#

.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
        # process top half (r0..r3) {a...d}
        vshufps $0x44, \r1, \r0, \t0    # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
        vshufps $0xEE, \r1, \r0, \r0    # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
        vshufps $0x44, \r3, \r2, \t1    # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
        vshufps $0xEE, \r3, \r2, \r2    # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
        vshufps $0xDD, \t1, \t0, \r3    # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
        vshufps $0x88, \r2, \r0, \r1    # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
        vshufps $0xDD, \r2, \r0, \r0    # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
        vshufps $0x88, \t1, \t0, \t0    # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}

        # use r2 in place of t0
        # process bottom half (r4..r7) {e...h}
        vshufps $0x44, \r5, \r4, \r2    # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
        vshufps $0xEE, \r5, \r4, \r4    # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
        vshufps $0x44, \r7, \r6, \t1    # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
        vshufps $0xEE, \r7, \r6, \r6    # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
        vshufps $0xDD, \t1, \r2, \r7    # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
        vshufps $0x88, \r6, \r4, \r5    # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
        vshufps $0xDD, \r6, \r4, \r4    # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
        vshufps $0x88, \t1, \r2, \t1    # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}

        vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
        vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
        vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
        vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
        vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
        vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
        vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
        vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0

.endm

.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

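# AVX2 has no packed rotate instruction, so _PRORD synthesises a rotate
# right of each 32-bit lane by \imm bits from a left shift by (32-\imm),
# a right shift by \imm and an OR.  \tmp is clobbered.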
.macro _PRORD reg imm tmp
        vpslld  $(32-\imm),\reg,\tmp
        vpsrld  $\imm,\reg, \reg
        vpor    \tmp,\reg, \reg
.endm

# PRORD_nd reg, imm, tmp, src
.macro _PRORD_nd reg imm tmp src
        vpslld  $(32-\imm), \src, \tmp
        vpsrld  $\imm, \src, \reg
        vpor    \tmp, \reg, \reg
.endm

# PRORD dst/src, amt
.macro PRORD reg imm
        _PRORD  \reg,\imm,TMP
.endm

# PRORD_nd dst, src, amt
.macro PRORD_nd reg tmp imm
        _PRORD_nd       \reg, \imm, TMP, \tmp
.endm

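# Each ROUND_00_15 invocation performs one SHA-256 round for all eight
# lanes at once (FIPS 180-4 notation):
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#   T2 = Sigma0(a) + Maj(a,b,c)
#   d += T1 ;  h = T1 + T2        (ROTATE_ARGS then renames a..h)
# with Sigma1(e) = ROTR(e,6) ^ ROTR(e,11) ^ ROTR(e,25)
#      Sigma0(a) = ROTR(a,2) ^ ROTR(a,13) ^ ROTR(a,22)
#      Ch(e,f,g) = (e & f) ^ (~e & g),  Maj(a,b,c) = (a&b) ^ (a&c) ^ (b&c)
# Note: the "(e >> n)" notation in the per-instruction comments below
# denotes a 32-bit rotate (PRORD), not a logical shift.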
# arguments passed implicitly in preprocessor symbols i, a...h
.macro ROUND_00_15 _T1 i
        PRORD_nd a0,e,5                 # sig1: a0 = (e >> 5)

        vpxor   g, f, a2                # ch: a2 = f^g
        vpand   e,a2, a2                # ch: a2 = (f^g)&e
        vpxor   g, a2, a2               # a2 = ch

        PRORD_nd a1,e,25                # sig1: a1 = (e >> 25)

        vmovdqu \_T1,(SZ8*(\i & 0xf))(%rsp)
        vpaddd  (TBL,ROUND,1), \_T1, \_T1       # T1 = W + K
        vpxor   e,a0, a0                # sig1: a0 = e ^ (e >> 5)
        PRORD   a0, 6                   # sig1: a0 = (e >> 6) ^ (e >> 11)
        vpaddd  a2, h, h                # h = h + ch
        PRORD_nd a2,a,11                # sig0: a2 = (a >> 11)
        vpaddd  \_T1,h, h               # h = h + ch + W + K
        vpxor   a1, a0, a0              # a0 = sigma1
        PRORD_nd a1,a,22                # sig0: a1 = (a >> 22)
        vpxor   c, a, \_T1              # maj: T1 = a^c
        add     $SZ8, ROUND             # ROUND++
        vpand   b, \_T1, \_T1           # maj: T1 = (a^c)&b
        vpaddd  a0, h, h
        vpaddd  h, d, d
        vpxor   a, a2, a2               # sig0: a2 = a ^ (a >> 11)
        PRORD   a2,2                    # sig0: a2 = (a >> 2) ^ (a >> 13)
        vpxor   a1, a2, a2              # a2 = sig0
        vpand   c, a, a1                # maj: a1 = a&c
        vpor    \_T1, a1, a1            # a1 = maj
        vpaddd  a1, h, h                # h = h + ch + W + K + maj
        vpaddd  a2, h, h                # h = h + ch + W + K + maj + sigma0
        ROTATE_ARGS
.endm

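# ROUND_16_XX extends the message schedule for rounds 16..63 and then
# falls through to ROUND_00_15:
#   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# with sigma0(x) = ROTR(x,7) ^ ROTR(x,18) ^ (x >> 3)
#      sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)
# The most recent 16 W values live in a ring buffer on the stack,
# indexed by (i & 0xf), eight lanes (SZ8 bytes) per entry.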
# arguments passed implicitly in preprocessor symbols i, a...h
.macro ROUND_16_XX _T1 i
        vmovdqu (SZ8*((\i-15)&0xf))(%rsp), \_T1
        vmovdqu (SZ8*((\i-2)&0xf))(%rsp), a1
        vmovdqu \_T1, a0
        PRORD   \_T1,11
        vmovdqu a1, a2
        PRORD   a1,2
        vpxor   a0, \_T1, \_T1
        PRORD   \_T1, 7
        vpxor   a2, a1, a1
        PRORD   a1, 17
        vpsrld  $3, a0, a0
        vpxor   a0, \_T1, \_T1
        vpsrld  $10, a2, a2
        vpxor   a2, a1, a1
        vpaddd  (SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1
        vpaddd  (SZ8*((\i-7)&0xf))(%rsp), a1, a1
        vpaddd  a1, \_T1, \_T1

        ROUND_00_15 \_T1,\i
.endm

# SHA256_ARGS:
#   UINT32 digest[8][8];  // transposed digests: 8 rows (a..h), 8 lanes per row
#   UINT8  *data_ptr[8];  // one input pointer per lane

# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 num_blks);
# arg 1 : STATE    : pointer to args structure (transposed digests + data pointers)
# arg 2 : INP_SIZE : size of input in blocks
# general registers preserved in outer calling routine
# outer calling routine saves all the YMM registers
# save rsp, allocate 32-byte aligned stack for local variables
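# Note: the routine consumes exactly INP_SIZE 64-byte blocks from every
# one of the eight lanes; the caller (the SHA256 multibuffer manager) is
# responsible for message padding and for lanes of unequal length.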
ENTRY(sha256_x8_avx2)

        # save callee-saved clobbered registers to comply with C function ABI
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, IDX
        sub     $FRAMESZ, %rsp
        and     $~0x1F, %rsp
        mov     IDX, _rsp(%rsp)

        # Load the pre-transposed incoming digest.
        vmovdqu 0*SHA256_DIGEST_ROW_SIZE(STATE),a
        vmovdqu 1*SHA256_DIGEST_ROW_SIZE(STATE),b
        vmovdqu 2*SHA256_DIGEST_ROW_SIZE(STATE),c
        vmovdqu 3*SHA256_DIGEST_ROW_SIZE(STATE),d
        vmovdqu 4*SHA256_DIGEST_ROW_SIZE(STATE),e
        vmovdqu 5*SHA256_DIGEST_ROW_SIZE(STATE),f
        vmovdqu 6*SHA256_DIGEST_ROW_SIZE(STATE),g
        vmovdqu 7*SHA256_DIGEST_ROW_SIZE(STATE),h

        lea     K256_8(%rip),TBL
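        # TBL points at the 8-way replicated round constants; ROUND advances
        # by SZ8 (32 bytes, one replicated constant) per round, so it doubles
        # as the byte offset into K256_8.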

        # load the address of each of the 8 message lanes
        # getting ready to transpose input onto stack
        mov     _args_data_ptr+0*PTR_SZ(STATE),inp0
        mov     _args_data_ptr+1*PTR_SZ(STATE),inp1
        mov     _args_data_ptr+2*PTR_SZ(STATE),inp2
        mov     _args_data_ptr+3*PTR_SZ(STATE),inp3
        mov     _args_data_ptr+4*PTR_SZ(STATE),inp4
        mov     _args_data_ptr+5*PTR_SZ(STATE),inp5
        mov     _args_data_ptr+6*PTR_SZ(STATE),inp6
        mov     _args_data_ptr+7*PTR_SZ(STATE),inp7

        xor     IDX, IDX
lloop:
        xor     ROUND, ROUND

        # save old digest
        vmovdqu a, _digest(%rsp)
        vmovdqu b, _digest+1*SZ8(%rsp)
        vmovdqu c, _digest+2*SZ8(%rsp)
        vmovdqu d, _digest+3*SZ8(%rsp)
        vmovdqu e, _digest+4*SZ8(%rsp)
        vmovdqu f, _digest+5*SZ8(%rsp)
        vmovdqu g, _digest+6*SZ8(%rsp)
        vmovdqu h, _digest+7*SZ8(%rsp)
        i = 0
.rep 2
        VMOVPS  i*32(inp0, IDX), TT0
        VMOVPS  i*32(inp1, IDX), TT1
        VMOVPS  i*32(inp2, IDX), TT2
        VMOVPS  i*32(inp3, IDX), TT3
        VMOVPS  i*32(inp4, IDX), TT4
        VMOVPS  i*32(inp5, IDX), TT5
        VMOVPS  i*32(inp6, IDX), TT6
        VMOVPS  i*32(inp7, IDX), TT7
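        # g and h (%ymm6/%ymm7) double as the TMP0/TMP1 temporaries used by
        # TRANSPOSE8 and for the shuffle mask, so they are spilled to the
        # _ytmp area around the transpose and reloaded afterwards.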
        vmovdqu g, _ytmp(%rsp)
        vmovdqu h, _ytmp+1*SZ8(%rsp)
        TRANSPOSE8      TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
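        # After the transpose each TTn holds one 32-bit message word from the
        # same position in all eight lanes; the pshufb below byte-swaps every
        # lane because SHA-256 interprets the message as big-endian words.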
        vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1
        vmovdqu _ytmp(%rsp), g
        vpshufb TMP1, TT0, TT0
        vpshufb TMP1, TT1, TT1
        vpshufb TMP1, TT2, TT2
        vpshufb TMP1, TT3, TT3
        vpshufb TMP1, TT4, TT4
        vpshufb TMP1, TT5, TT5
        vpshufb TMP1, TT6, TT6
        vpshufb TMP1, TT7, TT7
        vmovdqu _ytmp+1*SZ8(%rsp), h
        vmovdqu TT4, _ytmp(%rsp)
        vmovdqu TT5, _ytmp+1*SZ8(%rsp)
        vmovdqu TT6, _ytmp+2*SZ8(%rsp)
        vmovdqu TT7, _ytmp+3*SZ8(%rsp)
        ROUND_00_15     TT0,(i*8+0)
        vmovdqu _ytmp(%rsp), TT0
        ROUND_00_15     TT1,(i*8+1)
        vmovdqu _ytmp+1*SZ8(%rsp), TT1
        ROUND_00_15     TT2,(i*8+2)
        vmovdqu _ytmp+2*SZ8(%rsp), TT2
        ROUND_00_15     TT3,(i*8+3)
        vmovdqu _ytmp+3*SZ8(%rsp), TT3
        ROUND_00_15     TT0,(i*8+4)
        ROUND_00_15     TT1,(i*8+5)
        ROUND_00_15     TT2,(i*8+6)
        ROUND_00_15     TT3,(i*8+7)
        i = (i+1)
.endr
        add     $64, IDX
        i = (i*8)

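        # Rounds 16..63: each pass through Lrounds_16_xx performs 16 rounds;
        # ROUND (one SZ8-sized K256_8 entry per round) counts up to ROUNDS,
        # a constant presumably provided by the included datastruct header.
        # The jmp over the .align directive keeps the padding nops out of
        # the fall-through path.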
        jmp     Lrounds_16_xx
.align 16
Lrounds_16_xx:
        .rep 16
        ROUND_16_XX     T1, i
        i = (i+1)
        .endr

        cmp     $ROUNDS,ROUND
        jb      Lrounds_16_xx

        # add old digest
        vpaddd  _digest+0*SZ8(%rsp), a, a
        vpaddd  _digest+1*SZ8(%rsp), b, b
        vpaddd  _digest+2*SZ8(%rsp), c, c
        vpaddd  _digest+3*SZ8(%rsp), d, d
        vpaddd  _digest+4*SZ8(%rsp), e, e
        vpaddd  _digest+5*SZ8(%rsp), f, f
        vpaddd  _digest+6*SZ8(%rsp), g, g
        vpaddd  _digest+7*SZ8(%rsp), h, h

        sub     $1, INP_SIZE            # unit is blocks
        jne     lloop

        # write back to memory (state object) the transposed digest
        vmovdqu a, 0*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu b, 1*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu c, 2*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu d, 3*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu e, 4*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu f, 5*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu g, 6*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu h, 7*SHA256_DIGEST_ROW_SIZE(STATE)

        # update input pointers
        add     IDX, inp0
        mov     inp0, _args_data_ptr+0*8(STATE)
        add     IDX, inp1
        mov     inp1, _args_data_ptr+1*8(STATE)
        add     IDX, inp2
        mov     inp2, _args_data_ptr+2*8(STATE)
        add     IDX, inp3
        mov     inp3, _args_data_ptr+3*8(STATE)
        add     IDX, inp4
        mov     inp4, _args_data_ptr+4*8(STATE)
        add     IDX, inp5
        mov     inp5, _args_data_ptr+5*8(STATE)
        add     IDX, inp6
        mov     inp6, _args_data_ptr+6*8(STATE)
        add     IDX, inp7
        mov     inp7, _args_data_ptr+7*8(STATE)

        # Postamble
        mov     _rsp(%rsp), %rsp

        # restore callee-saved clobbered registers
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12

        ret
ENDPROC(sha256_x8_avx2)

.section        .rodata.K256_8, "a", @progbits
.align 64
K256_8:
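# 64 SHA-256 round constants, each replicated eight times (one copy per
# lane), i.e. two .octa (32 bytes) per round.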
        .octa   0x428a2f98428a2f98428a2f98428a2f98
        .octa   0x428a2f98428a2f98428a2f98428a2f98
        .octa   0x71374491713744917137449171374491
        .octa   0x71374491713744917137449171374491
        .octa   0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
        .octa   0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
        .octa   0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
        .octa   0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
        .octa   0x3956c25b3956c25b3956c25b3956c25b
        .octa   0x3956c25b3956c25b3956c25b3956c25b
        .octa   0x59f111f159f111f159f111f159f111f1
        .octa   0x59f111f159f111f159f111f159f111f1
        .octa   0x923f82a4923f82a4923f82a4923f82a4
        .octa   0x923f82a4923f82a4923f82a4923f82a4
        .octa   0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
        .octa   0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
        .octa   0xd807aa98d807aa98d807aa98d807aa98
        .octa   0xd807aa98d807aa98d807aa98d807aa98
        .octa   0x12835b0112835b0112835b0112835b01
        .octa   0x12835b0112835b0112835b0112835b01
        .octa   0x243185be243185be243185be243185be
        .octa   0x243185be243185be243185be243185be
        .octa   0x550c7dc3550c7dc3550c7dc3550c7dc3
        .octa   0x550c7dc3550c7dc3550c7dc3550c7dc3
        .octa   0x72be5d7472be5d7472be5d7472be5d74
        .octa   0x72be5d7472be5d7472be5d7472be5d74
        .octa   0x80deb1fe80deb1fe80deb1fe80deb1fe
        .octa   0x80deb1fe80deb1fe80deb1fe80deb1fe
        .octa   0x9bdc06a79bdc06a79bdc06a79bdc06a7
        .octa   0x9bdc06a79bdc06a79bdc06a79bdc06a7
        .octa   0xc19bf174c19bf174c19bf174c19bf174
        .octa   0xc19bf174c19bf174c19bf174c19bf174
        .octa   0xe49b69c1e49b69c1e49b69c1e49b69c1
        .octa   0xe49b69c1e49b69c1e49b69c1e49b69c1
        .octa   0xefbe4786efbe4786efbe4786efbe4786
        .octa   0xefbe4786efbe4786efbe4786efbe4786
        .octa   0x0fc19dc60fc19dc60fc19dc60fc19dc6
        .octa   0x0fc19dc60fc19dc60fc19dc60fc19dc6
        .octa   0x240ca1cc240ca1cc240ca1cc240ca1cc
        .octa   0x240ca1cc240ca1cc240ca1cc240ca1cc
        .octa   0x2de92c6f2de92c6f2de92c6f2de92c6f
        .octa   0x2de92c6f2de92c6f2de92c6f2de92c6f
        .octa   0x4a7484aa4a7484aa4a7484aa4a7484aa
        .octa   0x4a7484aa4a7484aa4a7484aa4a7484aa
        .octa   0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
        .octa   0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
        .octa   0x76f988da76f988da76f988da76f988da
        .octa   0x76f988da76f988da76f988da76f988da
        .octa   0x983e5152983e5152983e5152983e5152
        .octa   0x983e5152983e5152983e5152983e5152
        .octa   0xa831c66da831c66da831c66da831c66d
        .octa   0xa831c66da831c66da831c66da831c66d
        .octa   0xb00327c8b00327c8b00327c8b00327c8
        .octa   0xb00327c8b00327c8b00327c8b00327c8
        .octa   0xbf597fc7bf597fc7bf597fc7bf597fc7
        .octa   0xbf597fc7bf597fc7bf597fc7bf597fc7
        .octa   0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
        .octa   0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
        .octa   0xd5a79147d5a79147d5a79147d5a79147
        .octa   0xd5a79147d5a79147d5a79147d5a79147
        .octa   0x06ca635106ca635106ca635106ca6351
        .octa   0x06ca635106ca635106ca635106ca6351
        .octa   0x14292967142929671429296714292967
        .octa   0x14292967142929671429296714292967
        .octa   0x27b70a8527b70a8527b70a8527b70a85
        .octa   0x27b70a8527b70a8527b70a8527b70a85
        .octa   0x2e1b21382e1b21382e1b21382e1b2138
        .octa   0x2e1b21382e1b21382e1b21382e1b2138
        .octa   0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
        .octa   0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
        .octa   0x53380d1353380d1353380d1353380d13
        .octa   0x53380d1353380d1353380d1353380d13
        .octa   0x650a7354650a7354650a7354650a7354
        .octa   0x650a7354650a7354650a7354650a7354
        .octa   0x766a0abb766a0abb766a0abb766a0abb
        .octa   0x766a0abb766a0abb766a0abb766a0abb
        .octa   0x81c2c92e81c2c92e81c2c92e81c2c92e
        .octa   0x81c2c92e81c2c92e81c2c92e81c2c92e
        .octa   0x92722c8592722c8592722c8592722c85
        .octa   0x92722c8592722c8592722c8592722c85
        .octa   0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
        .octa   0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
        .octa   0xa81a664ba81a664ba81a664ba81a664b
        .octa   0xa81a664ba81a664ba81a664ba81a664b
        .octa   0xc24b8b70c24b8b70c24b8b70c24b8b70
        .octa   0xc24b8b70c24b8b70c24b8b70c24b8b70
        .octa   0xc76c51a3c76c51a3c76c51a3c76c51a3
        .octa   0xc76c51a3c76c51a3c76c51a3c76c51a3
        .octa   0xd192e819d192e819d192e819d192e819
        .octa   0xd192e819d192e819d192e819d192e819
        .octa   0xd6990624d6990624d6990624d6990624
        .octa   0xd6990624d6990624d6990624d6990624
        .octa   0xf40e3585f40e3585f40e3585f40e3585
        .octa   0xf40e3585f40e3585f40e3585f40e3585
        .octa   0x106aa070106aa070106aa070106aa070
        .octa   0x106aa070106aa070106aa070106aa070
        .octa   0x19a4c11619a4c11619a4c11619a4c116
        .octa   0x19a4c11619a4c11619a4c11619a4c116
        .octa   0x1e376c081e376c081e376c081e376c08
        .octa   0x1e376c081e376c081e376c081e376c08
        .octa   0x2748774c2748774c2748774c2748774c
        .octa   0x2748774c2748774c2748774c2748774c
        .octa   0x34b0bcb534b0bcb534b0bcb534b0bcb5
        .octa   0x34b0bcb534b0bcb534b0bcb534b0bcb5
        .octa   0x391c0cb3391c0cb3391c0cb3391c0cb3
        .octa   0x391c0cb3391c0cb3391c0cb3391c0cb3
        .octa   0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
        .octa   0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
        .octa   0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
        .octa   0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
        .octa   0x682e6ff3682e6ff3682e6ff3682e6ff3
        .octa   0x682e6ff3682e6ff3682e6ff3682e6ff3
        .octa   0x748f82ee748f82ee748f82ee748f82ee
        .octa   0x748f82ee748f82ee748f82ee748f82ee
        .octa   0x78a5636f78a5636f78a5636f78a5636f
        .octa   0x78a5636f78a5636f78a5636f78a5636f
        .octa   0x84c8781484c8781484c8781484c87814
        .octa   0x84c8781484c8781484c8781484c87814
        .octa   0x8cc702088cc702088cc702088cc70208
        .octa   0x8cc702088cc702088cc702088cc70208
        .octa   0x90befffa90befffa90befffa90befffa
        .octa   0x90befffa90befffa90befffa90befffa
        .octa   0xa4506ceba4506ceba4506ceba4506ceb
        .octa   0xa4506ceba4506ceba4506ceba4506ceb
        .octa   0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
        .octa   0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
        .octa   0xc67178f2c67178f2c67178f2c67178f2
        .octa   0xc67178f2c67178f2c67178f2c67178f2

.section        .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
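# Byte-swaps each 32-bit lane from little-endian load order to the
# big-endian word order SHA-256 expects.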
PSHUFFLE_BYTE_FLIP_MASK:
        .octa   0x0c0d0e0f08090a0b0405060700010203
        .octa   0x0c0d0e0f08090a0b0405060700010203

.section        .rodata.cst256.K256, "aM", @progbits, 256
.align 64
.global K256
K256:
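# Non-replicated copy of the round constants, exported with .global so it
# can be referenced from outside this file.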
        .int    0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .int    0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .int    0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .int    0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .int    0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .int    0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .int    0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .int    0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .int    0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .int    0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .int    0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .int    0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .int    0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .int    0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .int    0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .int    0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2