/*
 * Fast SHA-1 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 */
| 17 | |
| 18 | #include <asm/ppc_asm.h> |
| 19 | #include <asm/asm-offsets.h> |
| 20 | |
/*
 * Register allocation.
 *
 * r3 and r4 keep the hash-state and input pointers for the whole routine.
 * r5 holds the number of blocks on entry; ppc_spe_sha1_transform copies it
 * to CTR and afterwards reuses r5 as the pointer to the round constants.
 */
#define rHP	r3	/* pointer to hash value */
#define rWP	r4	/* pointer to input */
#define rKP	r5	/* pointer to constants (block count on entry) */

/* Message schedule: each 64 bit register carries two 32 bit words. */
#define rW0	r14	/* 64 bit round words */
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rH0	r6	/* 32 bit hash values */
#define rH1	r7
#define rH2	r8
#define rH3	r9
#define rH4	r10

#define rT0	r22	/* 64 bit temporary */
#define rT1	r0	/* 32 bit temporaries */
#define rT2	r11
#define rT3	r12

/* r23 is one of the non-volatile registers that INITIALIZE saves. */
#define rK	r23	/* 64 bit constant in non-volatile register */
| 46 | |
/*
 * Round constant loaders. The round macros paste their k argument into
 * LOAD_K##k##1:
 *   k = 0     expands to nothing, rK keeps its current value
 *   k = 1..4  load the word at offset 0/4/8/12 from rKP into rK;
 *             evlwwsplat replicates it into both 32 bit halves
 */
#define LOAD_K01

#define LOAD_K11 \
	evlwwsplat	rK,0(rKP);

#define LOAD_K21 \
	evlwwsplat	rK,4(rKP);

#define LOAD_K31 \
	evlwwsplat	rK,8(rKP);

#define LOAD_K41 \
	evlwwsplat	rK,12(rKP);
| 60 | |
/*
 * Prologue: open a 128 byte stack frame and store r14-r23 as full 64 bit
 * values in ten 8 byte slots at offsets 8..80. These are exactly the
 * registers used as rW0-rW7, rT0 and rK; FINALIZE reloads them from the
 * same slots.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame */ \
	evstdw		r14,8(r1);	/* We must save non volatile */ \
	evstdw		r15,16(r1);	/* registers. Take the chance */ \
	evstdw		r16,24(r1);	/* and save the SPE part too */ \
	evstdw		r17,32(r1); \
	evstdw		r18,40(r1); \
	evstdw		r19,48(r1); \
	evstdw		r20,56(r1); \
	evstdw		r21,64(r1); \
	evstdw		r22,72(r1); \
	evstdw		r23,80(r1);
| 73 | |
| 74 | |
/*
 * Epilogue: reload r14-r23 (all 64 bits) from the slots written by
 * INITIALIZE, then overwrite the first word of every slot with zero and
 * drop the frame. Only the word at offset +0 of each 8 byte slot is
 * wiped; the word at +4 is left as is, see the comment on the stores.
 * Clobbers r0.
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers */ \
	evldw		r15,16(r1); \
	evldw		r16,24(r1); \
	evldw		r17,32(r1); \
	evldw		r18,40(r1); \
	evldw		r19,48(r1); \
	evldw		r20,56(r1); \
	evldw		r21,64(r1); \
	evldw		r22,72(r1); \
	evldw		r23,80(r1); \
	li		r0,0;		/* zero without reading r0 */ \
	stw		r0,8(r1);	/* Delete sensitive data */ \
	stw		r0,16(r1);	/* that we might have pushed */ \
	stw		r0,24(r1);	/* from other context that runs */ \
	stw		r0,32(r1);	/* the same code. Assume that */ \
	stw		r0,40(r1);	/* the lower part of the GPRs */ \
	stw		r0,48(r1);	/* were already overwritten on */ \
	stw		r0,56(r1);	/* the way down to here */ \
	stw		r0,64(r1); \
	stw		r0,72(r1); \
	stw		r0,80(r1); \
	addi		r1,r1,128;	/* cleanup stack frame */
| 98 | |
/*
 * Input access.
 *
 * LOAD_DATA(reg, off) fetches one 32 bit message word into reg.
 * NEXT_BLOCK is issued once per 64 byte block after all words are read.
 *
 * Big endian:    plain load at rWP + off; rWP moves only in NEXT_BLOCK.
 * Little endian: byte-reversed load at rWP, which is then advanced by 4.
 *                The off argument is ignored, so words must be requested
 *                in ascending order, and NEXT_BLOCK has nothing to add.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data */
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block */
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data */ \
	addi		rWP,rWP,4;	/* increment per word */
#define NEXT_BLOCK			/* nothing to do */
#endif
| 110 | |
/*
 * R_00_15 - two SHA-1 rounds that take their words straight from the input.
 *
 *   a..e  registers holding the working state in role order A..E
 *   w0    register receiving the first message word (32 bit use only)
 *   w1    register receiving the second message word; on exit it holds
 *         both words, second word in the upper and first word in the
 *         lower half (evmergelo)
 *   k     round constant selector for LOAD_K##k##1 (0 = keep rK); the
 *         load is placed before the first use of rK in this macro
 *   off   byte offset of the first word, second word is at off+4
 *
 * Round 1 updates e and rotates b. Round 2 runs with the roles shifted by
 * one (A=e, B=a, C=b, D=c, E=d), so it updates d and rotates a. The
 * comments name the roles, the "1:"/"2:" prefix names the round.
 * F = (B and C) or (~B and D). Clobbers rT0, rT1, rT2.
 *
 * The two rounds are interleaved on purpose; keep the instruction order.
 */
#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
	LOAD_DATA(w0, off)		/* 1: W */ \
	and		rT2,b,c;	/* 1: F1 = B and C */ \
	LOAD_K##k##1 \
	andc		rT1,d,b;	/* 1: F2 = ~B and D */ \
	rotrwi		rT0,a,27;	/* 1: A' = A rotl 5 */ \
	or		rT2,rT2,rT1;	/* 1: F = F1 or F2 */ \
	add		e,e,rT0;	/* 1: E = E + A' */ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30 */ \
	add		e,e,w0;		/* 1: E = E + W */ \
	LOAD_DATA(w1, off+4)		/* 2: W */ \
	add		e,e,rT2;	/* 1: E = E + F */ \
	and		rT1,a,b;	/* 2: F1 = B and C */ \
	add		e,e,rK;		/* 1: E = E + K */ \
	andc		rT2,c,a;	/* 2: F2 = ~B and D */ \
	add		d,d,rK;		/* 2: E = E + K */ \
	or		rT2,rT2,rT1;	/* 2: F = F1 or F2 */ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5 */ \
	add		d,d,w1;		/* 2: E = E + W */ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30 */ \
	add		d,d,rT0;	/* 2: E = E + A' */ \
	evmergelo	w1,w1,w0;	/* pack both words into w1 */ \
	add		d,d,rT2		/* 2: E = E + F */
| 134 | |
/*
 * R_16_19 - two SHA-1 rounds with F = (B and C) or (~B and D) and words
 * taken from the message schedule.
 *
 *   a..e  registers holding the working state in role order A..E
 *   w0    word pair 16 positions back; replaced by the new word pair
 *   w1    word pair 14 positions back
 *   w4    word pair 8 positions back
 *   w6,w7 the two most recent pairs; evmergelohi combines the lower half
 *         of w7 with the upper half of w6 to form the pair 3 positions back
 *   k     round constant selector for LOAD_K##k##1 (0 = keep rK); the
 *         load sits after W + K was formed, so a new constant first
 *         takes effect in the following macro
 *
 * Both new words are computed at once with SPE instructions. The lower
 * half of W + K feeds round 1, the upper half is moved down via evmergehi
 * and feeds round 2. Round 2 runs with the roles shifted by one
 * (A=e, B=a, C=b, D=c, E=d). Clobbers rT0, rT1, rT2.
 *
 * Scalar and SPE instructions are interleaved on purpose; keep the order.
 */
#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F1 = B and C */ \
	evmergelohi	rT0,w7,w6;	/* W[-3] */ \
	andc		rT1,d,b;	/* 1: F2 = ~B and D */ \
	evxor		w0,w0,rT0;	/* W = W[-16] xor W[-3] */ \
	or		rT1,rT1,rT2;	/* 1: F = F1 or F2 */ \
	evxor		w0,w0,w4;	/* W = W xor W[-8] */ \
	add		e,e,rT1;	/* 1: E = E + F */ \
	evxor		w0,w0,w1;	/* W = W xor W[-14] */ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5 */ \
	evrlwi		w0,w0,1;	/* W = W rotl 1 */ \
	add		e,e,rT2;	/* 1: E = E + A' */ \
	evaddw		rT0,w0,rK;	/* WK = W + K */ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30 */ \
	LOAD_K##k##1 \
	evmergehi	rT1,rT1,rT0;	/* upper WK to lower rT1 */ \
	add		e,e,rT0;	/* 1: E = E + WK */ \
	add		d,d,rT1;	/* 2: E = E + WK */ \
	and		rT2,a,b;	/* 2: F1 = B and C */ \
	andc		rT1,c,a;	/* 2: F2 = ~B and D */ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5 */ \
	or		rT1,rT1,rT2;	/* 2: F = F1 or F2 */ \
	add		d,d,rT0;	/* 2: E = E + A' */ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30 */ \
	add		d,d,rT1		/* 2: E = E + F */
| 160 | |
/*
 * R_20_39 - two SHA-1 rounds with F = B xor C xor D and words taken from
 * the message schedule. Also used unchanged for rounds 60-79.
 *
 *   a..e  registers holding the working state in role order A..E
 *   w0    word pair 16 positions back; replaced by the new word pair
 *   w1    word pair 14 positions back
 *   w4    word pair 8 positions back
 *   w6,w7 the two most recent pairs; evmergelohi combines the lower half
 *         of w7 with the upper half of w6 to form the pair 3 positions back
 *   k     round constant selector for LOAD_K##k##1 (0 = keep rK); the
 *         load sits after W + K was formed, so a new constant first
 *         takes effect in the following macro
 *
 * The lower half of W + K feeds round 1, the upper half is moved down via
 * evmergehi and feeds round 2. Round 2 runs with the roles shifted by one
 * (A=e, B=a, C=b, D=c, E=d). Clobbers rT0, rT1, rT2.
 *
 * Scalar and SPE instructions are interleaved on purpose; keep the order.
 */
#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	evmergelohi	rT0,w7,w6;	/* W[-3] */ \
	xor		rT2,b,c;	/* 1: F1 = B xor C */ \
	evxor		w0,w0,rT0;	/* W = W[-16] xor W[-3] */ \
	xor		rT2,rT2,d;	/* 1: F = F1 xor D */ \
	evxor		w0,w0,w4;	/* W = W xor W[-8] */ \
	add		e,e,rT2;	/* 1: E = E + F */ \
	evxor		w0,w0,w1;	/* W = W xor W[-14] */ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5 */ \
	evrlwi		w0,w0,1;	/* W = W rotl 1 */ \
	add		e,e,rT2;	/* 1: E = E + A' */ \
	evaddw		rT0,w0,rK;	/* WK = W + K */ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30 */ \
	LOAD_K##k##1 \
	evmergehi	rT1,rT1,rT0;	/* upper WK to lower rT1 */ \
	add		e,e,rT0;	/* 1: E = E + WK */ \
	xor		rT2,a,b;	/* 2: F1 = B xor C */ \
	add		d,d,rT1;	/* 2: E = E + WK */ \
	xor		rT2,rT2,c;	/* 2: F = F1 xor D */ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5 */ \
	add		d,d,rT2;	/* 2: E = E + F */ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30 */ \
	add		d,d,rT0		/* 2: E = E + A' */
| 184 | |
/*
 * R_40_59 - two SHA-1 rounds with F = (B and C) or (D and (B or C)) and
 * words taken from the message schedule.
 *
 *   a..e  registers holding the working state in role order A..E
 *   w0    word pair 16 positions back; replaced by the new word pair
 *   w1    word pair 14 positions back
 *   w4    word pair 8 positions back
 *   w6,w7 the two most recent pairs; evmergelohi combines the lower half
 *         of w7 with the upper half of w6 to form the pair 3 positions back
 *   k     round constant selector for LOAD_K##k##1 (0 = keep rK); the
 *         load sits after W + K was formed, so a new constant first
 *         takes effect in the following macro
 *
 * The lower half of W + K feeds round 1, the upper half is moved down via
 * evmergehi and feeds round 2. Round 2 runs with the roles shifted by one
 * (A=e, B=a, C=b, D=c, E=d) and reuses rT0 as a scalar temporary once
 * WK has been consumed. Clobbers rT0, rT1, rT2.
 *
 * Scalar and SPE instructions are interleaved on purpose; keep the order.
 */
#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F1 = B and C */ \
	evmergelohi	rT0,w7,w6;	/* W[-3] */ \
	or		rT1,b,c;	/* 1: F2 = B or C */ \
	evxor		w0,w0,rT0;	/* W = W[-16] xor W[-3] */ \
	and		rT1,d,rT1;	/* 1: F2 = F2 and D */ \
	evxor		w0,w0,w4;	/* W = W xor W[-8] */ \
	or		rT2,rT2,rT1;	/* 1: F = F1 or F2 */ \
	evxor		w0,w0,w1;	/* W = W xor W[-14] */ \
	add		e,e,rT2;	/* 1: E = E + F */ \
	evrlwi		w0,w0,1;	/* W = W rotl 1 */ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5 */ \
	evaddw		rT0,w0,rK;	/* WK = W + K */ \
	add		e,e,rT2;	/* 1: E = E + A' */ \
	LOAD_K##k##1 \
	evmergehi	rT1,rT1,rT0;	/* upper WK to lower rT1 */ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30 */ \
	add		e,e,rT0;	/* 1: E = E + WK */ \
	and		rT2,a,b;	/* 2: F1 = B and C */ \
	or		rT0,a,b;	/* 2: F2 = B or C */ \
	add		d,d,rT1;	/* 2: E = E + WK */ \
	and		rT0,c,rT0;	/* 2: F2 = F2 and D */ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30 */ \
	or		rT2,rT2,rT0;	/* 2: F = F1 or F2 */ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5 */ \
	add		d,d,rT2;	/* 2: E = E + F */ \
	add		d,d,rT0		/* 2: E = E + A' */
| 212 | |
/*
 * Rounds 60-79 are forwarded unchanged to R_20_39; the only difference is
 * the value that rK holds by then.
 */
#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
| 215 | |
/*
 * ppc_spe_sha1_transform - run the SHA-1 compression function over a
 * number of consecutive 64 byte blocks.
 *
 * In:	r3 (rHP)  pointer to the five 32 bit hash words; they are read on
 *		  entry and written back after every block
 *	r4 (rWP)  pointer to the input data, advanced by 64 bytes per block
 *	r5	  number of blocks; moved to CTR, then r5 is reused as rKP.
 *		  A count of 0 returns immediately without touching memory.
 * Out:	updated hash words at 0..16(r3)
 *
 * r14-r23 are saved and restored by INITIALIZE/FINALIZE (128 byte frame).
 * Clobbers r0, r4-r12, CTR and cr0.
 *
 * Each macro line below performs two rounds. The state registers passed
 * as a..e shift by two roles from line to line, and the schedule registers
 * rW0-rW7 are used as a ring of eight word pairs. The non-zero k argument
 * on the last line of a stage loads the constant for the next stage.
 */
_GLOBAL(ppc_spe_sha1_transform)
	/*
	 * bdnz decrements CTR before testing it, so a count of 0 would wrap
	 * around and keep hashing far beyond the input. Leave before the
	 * stack frame is created, nothing needs to be undone.
	 */
	cmpwi		r5,0
	beqlr

	INITIALIZE

	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	mtctr		r5			/* CTR = number of blocks */
	lwz		rH2,8(rHP)
	lis		rKP,PPC_SPE_SHA1_K@h	/* r5 now becomes rKP */
	lwz		rH3,12(rHP)
	ori		rKP,rKP,PPC_SPE_SHA1_K@l
	lwz		rH4,16(rHP)

ppc_spe_sha1_main:
	/* rounds 0-15: words come from the input, first constant loaded */
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)

	/* rounds 16-19 */
	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)

	/* rounds 20-39 */
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)

	/* rounds 40-59 */
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)

	/*
	 * rounds 60-79. The previous hash words are fetched in between the
	 * last macros, each into a register the remaining rounds no longer
	 * read (rT3, rW1, rW2, rW3, rW4).
	 */
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
	lwz		rT3,0(rHP)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
	lwz		rW1,4(rHP)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
	lwz		rW2,8(rHP)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
	lwz		rW3,12(rHP)
	NEXT_BLOCK
	lwz		rW4,16(rHP)

	/* add the previous hash words and write the result back */
	add		rH0,rH0,rT3
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)

	bdnz		ppc_spe_sha1_main	/* next block while --CTR != 0 */

	FINALIZE
	blr
| 295 | |
/*
 * The four SHA-1 round constants, one per 20-round stage, fetched by
 * LOAD_K11..LOAD_K41 at offsets 0, 4, 8 and 12. The table is only ever
 * read, so it lives in the read-only data section.
 * NOTE(review): .align takes a power of two on PowerPC ELF, i.e. this
 * asks for 16 byte alignment - confirm if another target is ever used.
 */
	.section	.rodata
	.align		4
PPC_SPE_SHA1_K:
	.long		0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6