Markus Stockhausen | 6bb7100 | 2015-01-30 15:39:23 +0100 | [diff] [blame] | 1 | /* |
| 2 | * Fast SHA-256 implementation for SPE instruction set (PPC) |
| 3 | * |
| 4 | * This code makes use of the SPE SIMD instruction set as defined in |
| 5 | * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf |
| 6 | * Implementation is based on optimization guide notes from |
| 7 | * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf |
| 8 | * |
| 9 | * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> |
| 10 | * |
| 11 | * This program is free software; you can redistribute it and/or modify it |
| 12 | * under the terms of the GNU General Public License as published by the Free |
| 13 | * Software Foundation; either version 2 of the License, or (at your option) |
| 14 | * any later version. |
| 15 | * |
| 16 | */ |
| 17 | |
| 18 | #include <asm/ppc_asm.h> |
| 19 | #include <asm/asm-offsets.h> |
| 20 | |
| 21 | #define rHP r3 /* pointer to hash values in memory (8 words at offsets 0..28) */ |
| 22 | #define rKP r24 /* pointer to round constants, advanced by 64 bytes per 16 rounds */ |
| 23 | #define rWP r4 /* pointer to input data */ |
| 24 | |
| 25 | #define rH0 r5 /* 8 32 bit hash values in 8 registers; r5 is copied to CTR (block counter) before rH0 is first loaded */ |
| 26 | #define rH1 r6 |
| 27 | #define rH2 r7 |
| 28 | #define rH3 r8 |
| 29 | #define rH4 r9 |
| 30 | #define rH5 r10 |
| 31 | #define rH6 r11 |
| 32 | #define rH7 r12 |
| 33 | |
| 34 | #define rW0 r14 /* 64 bit registers. 16 words in 8 registers (two message words each) */ |
| 35 | #define rW1 r15 |
| 36 | #define rW2 r16 |
| 37 | #define rW3 r17 |
| 38 | #define rW4 r18 |
| 39 | #define rW5 r19 |
| 40 | #define rW6 r20 |
| 41 | #define rW7 r21 |
| 42 | |
| 43 | #define rT0 r22 /* 64 bit temporaries */ |
| 44 | #define rT1 r23 /* also the register tested by CMP_KC_LOOP */ |
| 45 | #define rT2 r0 /* 32 bit temporaries */ |
| 46 | #define rT3 r25 |
| 47 | |
| 48 | #define CMP_KN_LOOP /* pasted into R_CALC_W when k is N: emits nothing */ |
| 49 | #define CMP_KC_LOOP /* pasted into R_CALC_W when k is C: sets up the 16 round loop branch */ \ |
| 50 | cmpwi rT1,0; /* rT1 still holds the constants loaded from off(rKP); presumably its low word is the second one, which is > 0 for the 32nd and 48th table entries and < 0 for the 64th */ |
| 51 | |
| 52 | #define INITIALIZE /* prologue: 128 byte frame, r14-r23 saved as 64 bit, r24/r25 as 32 bit */ \ |
| 53 | stwu r1,-128(r1); /* create stack frame */ \ |
| 54 | evstdw r14,8(r1); /* We must save non volatile */ \ |
| 55 | evstdw r15,16(r1); /* registers. Take the chance */ \ |
| 56 | evstdw r16,24(r1); /* and save the SPE part too */ \ |
| 57 | evstdw r17,32(r1); \ |
| 58 | evstdw r18,40(r1); \ |
| 59 | evstdw r19,48(r1); \ |
| 60 | evstdw r20,56(r1); \ |
| 61 | evstdw r21,64(r1); \ |
| 62 | evstdw r22,72(r1); \ |
| 63 | evstdw r23,80(r1); \ |
| 64 | stw r24,88(r1); /* save normal registers (rKP and rT3 are only used by 32 bit instructions) */ \ |
| 65 | stw r25,92(r1); |
| 66 | |
| 67 | |
| 68 | #define FINALIZE /* epilogue: restore r14-r25 from the offsets used by INITIALIZE, wipe part of the save area, pop the frame */ \ |
| 69 | evldw r14,8(r1); /* restore SPE registers */ \ |
| 70 | evldw r15,16(r1); \ |
| 71 | evldw r16,24(r1); \ |
| 72 | evldw r17,32(r1); \ |
| 73 | evldw r18,40(r1); \ |
| 74 | evldw r19,48(r1); \ |
| 75 | evldw r20,56(r1); \ |
| 76 | evldw r21,64(r1); \ |
| 77 | evldw r22,72(r1); \ |
| 78 | evldw r23,80(r1); \ |
| 79 | lwz r24,88(r1); /* restore normal registers */ \ |
| 80 | lwz r25,92(r1); \ |
| 81 | xor r0,r0,r0; /* r0 = 0, used as the wipe value below */ \ |
| 82 | stw r0,8(r1); /* Delete sensitive data */ \ |
| 83 | stw r0,16(r1); /* that we might have pushed */ \ |
| 84 | stw r0,24(r1); /* from other context that runs */ \ |
| 85 | stw r0,32(r1); /* the same code. Assume that */ \ |
| 86 | stw r0,40(r1); /* the lower part of the GPRs */ \ |
| 87 | stw r0,48(r1); /* was already overwritten on */ \ |
| 88 | stw r0,56(r1); /* the way down to here */ \ |
| 89 | stw r0,64(r1); /* (only the first word of each 8 byte slot is zeroed) */ \ |
| 90 | stw r0,72(r1); \ |
| 91 | stw r0,80(r1); \ |
| 92 | addi r1,r1,128; /* cleanup stack frame */ |
| 93 | |
| 94 | #ifdef __BIG_ENDIAN__ /* plain word loads at a displacement, pointer bumped once per block */ |
| 95 | #define LOAD_DATA(reg, off) \ |
| 96 | lwz reg,off(rWP); /* load data word at byte offset off */ |
| 97 | #define NEXT_BLOCK \ |
| 98 | addi rWP,rWP,64; /* increment per block */ |
| 99 | #else /* byte reversed loads; rWP is advanced after every word instead */ |
| 100 | #define LOAD_DATA(reg, off) \ |
| 101 | lwbrx reg,0,rWP; /* load data byte reversed; off is not used in this variant */ \ |
| 102 | addi rWP,rWP,4; /* increment per word */ |
| 103 | #define NEXT_BLOCK /* nothing to do: 16 loads already advanced rWP by 64 */ |
| 104 | #endif |
| 105 | |
| 106 | #define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) /* two rounds on two freshly loaded input words; w receives both words, off indexes data and K; updates d,h then c,g; clobbers rT0-rT3 */ \ |
| 107 | LOAD_DATA(w, off) /* 1: W */ \ |
| 108 | rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \ |
| 109 | rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \ |
| 110 | rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \ |
| 111 | xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \ |
| 112 | and rT3,e,f; /* 1: ch = e and f */ \ |
| 113 | xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \ |
| 114 | andc rT1,g,e; /* 1: ch' = ~e and g */ \ |
| 115 | lwz rT2,off(rKP); /* 1: K */ \ |
| 116 | xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \ |
| 117 | add h,h,rT0; /* 1: temp1 = h + S1 */ \ |
| 118 | add rT3,rT3,w; /* 1: temp1' = ch + w */ \ |
| 119 | rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \ |
| 120 | add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \ |
| 121 | rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \ |
| 122 | add h,h,rT2; /* 1: temp1 = temp1 + K */ \ |
| 123 | rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ |
| 124 | xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \ |
| 125 | add d,d,h; /* 1: d = d + temp1 */ \ |
| 126 | xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \ |
| 127 | evmergelo w,w,w; /* shift W: keep the first word in the upper half before the second load */ \ |
| 128 | or rT2,a,b; /* 1: maj = a or b */ \ |
| 129 | and rT1,a,b; /* 1: maj' = a and b */ \ |
| 130 | and rT2,rT2,c; /* 1: maj = maj and c */ \ |
| 131 | LOAD_DATA(w, off+4) /* 2: W */ \ |
| 132 | or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \ |
| 133 | rotrwi rT0,d,6; /* 2: S1 = e rotr 6 (round 2 names are shifted: a=h b=a c=b d=c e=d f=e g=f h=g) */ \ |
| 134 | add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \ |
| 135 | rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ |
| 136 | add h,h,rT3; /* 1: h = temp1 + temp2 */ \ |
| 137 | rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ |
| 138 | xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ |
| 139 | and rT3,d,e; /* 2: ch = e and f */ \ |
| 140 | xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ |
| 141 | andc rT1,f,d; /* 2: ch' = ~e and g */ \ |
| 142 | lwz rT2,off+4(rKP); /* 2: K */ \ |
| 143 | xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ |
| 144 | add g,g,rT0; /* 2: temp1 = h + S1 */ \ |
| 145 | add rT3,rT3,w; /* 2: temp1' = ch + w */ \ |
| 146 | rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ |
| 147 | add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \ |
| 148 | rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ |
| 149 | add g,g,rT2; /* 2: temp1 = temp1 + K */ \ |
| 150 | rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ |
| 151 | xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ |
| 152 | or rT2,h,a; /* 2: maj = a or b */ \ |
| 153 | xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ |
| 154 | and rT1,h,a; /* 2: maj' = a and b */ \ |
| 155 | and rT2,rT2,b; /* 2: maj = maj and c */ \ |
| 156 | add c,c,g; /* 2: d = d + temp1 */ \ |
| 157 | or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ |
| 158 | add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ |
| 159 | add g,g,rT3 /* 2: h = temp1 + temp2 */ |
| 160 | |
| 161 | #define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) /* two rounds with two schedule words computed in place in w0 (SPE ops interleaved with round 1); k = N or C selects CMP_K##k##_LOOP; updates d,h then c,g; clobbers rT0-rT3 */ \ |
| 162 | rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \ |
| 163 | evmergelohi rT0,w0,w1; /* w[-15] */ \ |
| 164 | rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \ |
| 165 | evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \ |
| 166 | xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ |
| 167 | evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \ |
| 168 | rotrwi rT3,e,25; /* 1: S1" = e rotr 25 */ \ |
| 169 | evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \ |
| 170 | xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1" */ \ |
| 171 | evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 (applied on top of the rotate above: 25 + 21 left = 18 right) */ \ |
| 172 | add h,h,rT2; /* 1: temp1 = h + S1 */ \ |
| 173 | evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \ |
| 174 | and rT2,e,f; /* 1: ch = e and f */ \ |
| 175 | evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \ |
| 176 | andc rT3,g,e; /* 1: ch' = ~e and g */ \ |
| 177 | evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \ |
| 178 | xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \ |
| 179 | evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \ |
| 180 | add h,h,rT2; /* 1: temp1 = temp1 + ch */ \ |
| 181 | evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ |
| 182 | rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \ |
| 183 | evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \ |
| 184 | rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \ |
| 185 | evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ |
| 186 | xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ |
| 187 | evldw rT1,off(rKP); /* k (two round constants) */ \ |
| 188 | rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ |
| 189 | evaddw w0,w0,rT0; /* w = w + s1 */ \ |
| 190 | xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0" */ \ |
| 191 | evmergelohi rT0,w4,w5; /* w[-7] */ \ |
| 192 | and rT3,a,b; /* 1: maj = a and b */ \ |
| 193 | evaddw w0,w0,rT0; /* w = w + w[-7] */ \ |
| 194 | CMP_K##k##_LOOP /* compare on rT1 while it still holds the unmodified constants (k = C only) */ \ |
| 195 | add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \ |
| 196 | evaddw rT1,rT1,w0; /* wk = w + k */ \ |
| 197 | xor rT3,a,b; /* 1: maj = a xor b */ \ |
| 198 | evmergehi rT0,rT1,rT1; /* wk1/wk2: bring the round 1 sum into the low word of rT0 */ \ |
| 199 | and rT3,rT3,c; /* 1: maj = maj and c */ \ |
| 200 | add h,h,rT0; /* 1: temp1 = temp1 + wk */ \ |
| 201 | add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj (a and b, (a xor b) and c share no bits, so add acts as or) */ \ |
| 202 | add g,g,rT1; /* 2: temp1 = temp1 + wk */ \ |
| 203 | add d,d,h; /* 1: d = d + temp1 */ \ |
| 204 | rotrwi rT0,d,6; /* 2: S1 = e rotr 6 (round 2 names are shifted: a=h b=a c=b d=c e=d f=e g=f h=g) */ \ |
| 205 | add h,h,rT2; /* 1: h = temp1 + temp2 */ \ |
| 206 | rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ |
| 207 | rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ |
| 208 | xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ |
| 209 | and rT3,d,e; /* 2: ch = e and f */ \ |
| 210 | xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ |
| 211 | andc rT1,f,d; /* 2: ch' = ~e and g */ \ |
| 212 | add g,g,rT0; /* 2: temp1 = h + S1 */ \ |
| 213 | xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ |
| 214 | rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ |
| 215 | add g,g,rT3; /* 2: temp1 = temp1 + ch */ \ |
| 216 | rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ |
| 217 | rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ |
| 218 | xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ |
| 219 | or rT2,h,a; /* 2: maj = a or b */ \ |
| 220 | and rT1,h,a; /* 2: maj' = a and b */ \ |
| 221 | and rT2,rT2,b; /* 2: maj = maj and c */ \ |
| 222 | xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ |
| 223 | or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ |
| 224 | add c,c,g; /* 2: d = d + temp1 */ \ |
| 225 | add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ |
| 226 | add g,g,rT3 /* 2: h = temp1 + temp2 */ |
| 227 | |
| 228 | _GLOBAL(ppc_spe_sha256_transform) /* in: r3 = hash values (updated in place), r4 = input data, r5 = loop count (one pass per 64 byte block) */ |
| 229 | INITIALIZE |
| 230 | |
| 231 | mtctr r5 /* CTR = block count; NOTE(review): 0 is not special-cased and bdnz below would wrap - confirm callers never pass 0 */ |
| 232 | lwz rH0,0(rHP) /* load the 8 current hash words (r5 is free now) */ |
| 233 | lwz rH1,4(rHP) |
| 234 | lwz rH2,8(rHP) |
| 235 | lwz rH3,12(rHP) |
| 236 | lwz rH4,16(rHP) |
| 237 | lwz rH5,20(rHP) |
| 238 | lwz rH6,24(rHP) |
| 239 | lwz rH7,28(rHP) |
| 240 | |
| 241 | ppc_spe_sha256_main: /* one pass per block; rH0-rH7 already hold the current hash */ |
| 242 | lis rKP,PPC_SPE_SHA256_K@ha /* rKP = start of the round constant table */ |
| 243 | addi rKP,rKP,PPC_SPE_SHA256_K@l |
| 244 | |
| 245 | R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0) /* rounds 0-15: two per macro; the register roles rotate by two per line instead of moving values */ |
| 246 | R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8) |
| 247 | R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16) |
| 248 | R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24) |
| 249 | R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32) |
| 250 | R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40) |
| 251 | R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48) |
| 252 | R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56) |
| 253 | ppc_spe_sha256_16_rounds: /* 16 rounds per pass with computed schedule words */ |
| 254 | addi rKP,rKP,64 /* step to the next 16 round constants */ |
| 255 | R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, |
| 256 | rW0, rW1, rW4, rW5, rW7, N, 0) |
| 257 | R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, |
| 258 | rW1, rW2, rW5, rW6, rW0, N, 8) |
| 259 | R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, |
| 260 | rW2, rW3, rW6, rW7, rW1, N, 16) |
| 261 | R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, |
| 262 | rW3, rW4, rW7, rW0, rW2, N, 24) |
| 263 | R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, |
| 264 | rW4, rW5, rW0, rW1, rW3, N, 32) |
| 265 | R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, |
| 266 | rW5, rW6, rW1, rW2, rW4, N, 40) |
| 267 | R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, |
| 268 | rW6, rW7, rW2, rW3, rW5, N, 48) |
| 269 | R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, |
| 270 | rW7, rW0, rW3, rW4, rW6, C, 56) /* C: this expansion also emits cmpwi rT1,0 */ |
| 271 | bt gt,ppc_spe_sha256_16_rounds /* repeat while the constant word compared above was > 0 */ |
| 272 | |
| 273 | lwz rW0,0(rHP) /* fetch the hash values from before this block into the rW registers */ |
| 274 | NEXT_BLOCK /* advances rWP on big endian, empty on little endian */ |
| 275 | lwz rW1,4(rHP) |
| 276 | lwz rW2,8(rHP) |
| 277 | lwz rW3,12(rHP) |
| 278 | lwz rW4,16(rHP) |
| 279 | lwz rW5,20(rHP) |
| 280 | lwz rW6,24(rHP) |
| 281 | lwz rW7,28(rHP) |
| 282 | |
| 283 | add rH0,rH0,rW0 /* new hash = previous hash + round result, written back to memory */ |
| 284 | stw rH0,0(rHP) |
| 285 | add rH1,rH1,rW1 |
| 286 | stw rH1,4(rHP) |
| 287 | add rH2,rH2,rW2 |
| 288 | stw rH2,8(rHP) |
| 289 | add rH3,rH3,rW3 |
| 290 | stw rH3,12(rHP) |
| 291 | add rH4,rH4,rW4 |
| 292 | stw rH4,16(rHP) |
| 293 | add rH5,rH5,rW5 |
| 294 | stw rH5,20(rHP) |
| 295 | add rH6,rH6,rW6 |
| 296 | stw rH6,24(rHP) |
| 297 | add rH7,rH7,rW7 |
| 298 | stw rH7,28(rHP) |
| 299 | |
| 300 | bdnz ppc_spe_sha256_main /* decrement CTR, process the next block while it is non zero */ |
| 301 | |
| 302 | FINALIZE |
| 303 | blr |
| 304 | |
| 305 | .data /* NOTE(review): the code in this file only reads the table; a read-only section may be preferable - confirm */ |
| 306 | .align 5 /* presumably a 32 byte boundary (power-of-two .align) */ |
| 307 | PPC_SPE_SHA256_K: /* 64 round constants, 4 per line; consumed 16 at a time via rKP */ |
| 308 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
| 309 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
| 310 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 |
| 311 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 |
| 312 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc |
| 313 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da |
| 314 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 |
| 315 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 /* last word positive as a signed value */ |
| 316 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 |
| 317 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 |
| 318 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 |
| 319 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 /* last word positive as a signed value */ |
| 320 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 |
| 321 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 |
| 322 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 |
| 323 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 /* last word negative as a signed value */ |