Robert Sloan | 8967815 | 2019-03-12 14:24:00 -0700 | [diff] [blame] | 1 | #!/usr/bin/env perl |
| 2 | # Copyright (c) 2019, Google Inc. |
| 3 | # |
| 4 | # Permission to use, copy, modify, and/or distribute this software for any |
| 5 | # purpose with or without fee is hereby granted, provided that the above |
| 6 | # copyright notice and this permission notice appear in all copies. |
| 7 | # |
| 8 | # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| 9 | # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| 10 | # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
| 11 | # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 12 | # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
| 13 | # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
| 14 | # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 15 | |
| 16 | # ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit |
| 17 | # table-based GHASH implementation. It requires SSSE3 instructions. |
| 18 | # |
| 19 | # For background, the table-based strategy is a 4-bit windowed multiplication. |
| 20 | # It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops |
| 21 | # over 4-bit windows of the input and indexes them up into the table. Visually, |
| 22 | # it multiplies as in the schoolbook multiplication diagram below, but with |
| 23 | # more terms. (Each term is 4 bits, so there are 32 terms in each row.) First |
| 24 | # it incorporates the terms labeled '1' by indexing the most significant term |
| 25 | # of X into the table. Then it shifts and repeats for '2' and so on. |
| 26 | # |
| 27 | # hhhhhh |
| 28 | # * xxxxxx |
| 29 | # ============ |
| 30 | # 666666 |
| 31 | # 555555 |
| 32 | # 444444 |
| 33 | # 333333 |
| 34 | # 222222 |
| 35 | # 111111 |
| 36 | # |
| 37 | # This implementation changes the order. We treat the table as a 16×16 matrix |
| 38 | # and transpose it. The first row is then the first byte of each multiple of H, |
| 39 | # and so on. We then reorder terms as below. Observe that the terms labeled '1' |
| 40 | # and '2' are all lookups into the first row, etc. This maps well to the SSSE3 |
| 41 | # pshufb instruction, using alternating terms of X in parallel as indices. This |
| 42 | # alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and |
| 43 | # repeat for each row. |
| 44 | # |
| 45 | # hhhhhh |
| 46 | # * xxxxxx |
| 47 | # ============ |
| 48 | # 224466 |
| 49 | # 113355 |
| 50 | # 224466 |
| 51 | # 113355 |
| 52 | # 224466 |
| 53 | # 113355 |
| 54 | # |
| 55 | # Next we account for GCM's confusing bit order. The "first" bit is the least |
| 56 | # significant coefficient, but GCM treats the most sigificant bit within a byte |
| 57 | # as first. Bytes are little-endian, and bits are big-endian. We reverse the |
| 58 | # bytes in XMM registers for a consistent bit and byte ordering, but this means |
| 59 | # the least significant bit is the most significant coefficient and vice versa. |
| 60 | # |
| 61 | # For consistency, "low", "high", "left-shift", and "right-shift" refer to the |
| 62 | # bit ordering within the XMM register, rather than the reversed coefficient |
| 63 | # ordering. Low bits are less significant bits and more significant |
| 64 | # coefficients. Right-shifts move from MSB to the LSB and correspond to |
| 65 | # increasing the power of each coefficient. |
| 66 | # |
| 67 | # Note this bit reversal enters into the table's column indices. H*1 is stored |
| 68 | # in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier |
| 69 | # table rows contain more significant coefficients, so we iterate forwards. |
| 70 | |
| 71 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 72 | push(@INC,"${dir}","${dir}../../../perlasm"); |
| 73 | require "x86asm.pl"; |
| 74 | |
| 75 | $output = pop; |
| 76 | open STDOUT, ">$output"; |
| 77 | |
| 78 | &asm_init($ARGV[0]); |
| 79 | |
| 80 | my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx"); |
| 81 | &static_label("reverse_bytes"); |
| 82 | &static_label("low4_mask"); |
| 83 | |
| 84 | my $call_counter = 0; |
| 85 | # process_rows emits assembly code to process $rows rows of the table. On |
| 86 | # input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the |
| 87 | # low and high halves of the input. The result so far is passed in xmm2. xmm3 |
| 88 | # must be zero. On output, $Htable is advanced to the next row and xmm2 is |
| 89 | # updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6. |
| 90 | sub process_rows { |
| 91 | my ($rows) = @_; |
| 92 | $call_counter++; |
| 93 | |
| 94 | # Shifting whole XMM registers by bits is complex. psrldq shifts by |
| 95 | # bytes, and psrlq shifts the two 64-bit halves separately. Each row |
| 96 | # produces 8 bits of carry, and the reduction needs an additional 7-bit |
| 97 | # shift. This must fit in 64 bits so reduction can use psrlq. This |
| 98 | # allows up to 7 rows at a time. |
| 99 | die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64); |
| 100 | |
| 101 | &mov("eax", $rows); |
| 102 | &set_label("loop_row_$call_counter"); |
| 103 | &movdqa("xmm4", &QWP(0, $Htable)); |
| 104 | &lea($Htable, &DWP(16, $Htable)); |
| 105 | |
| 106 | # Right-shift xmm2 and xmm3 by 8 bytes. |
| 107 | &movdqa("xmm6", "xmm2"); |
| 108 | &palignr("xmm6", "xmm3", 1); |
| 109 | &movdqa("xmm3", "xmm6"); |
| 110 | &psrldq("xmm2", 1); |
| 111 | |
| 112 | # Load the next table row and index the low and high bits of the input. |
| 113 | # Note the low (respectively, high) half corresponds to more |
| 114 | # (respectively, less) significant coefficients. |
| 115 | &movdqa("xmm5", "xmm4"); |
| 116 | &pshufb("xmm4", "xmm0"); |
| 117 | &pshufb("xmm5", "xmm1"); |
| 118 | |
| 119 | # Add the high half (xmm5) without shifting. |
| 120 | &pxor("xmm2", "xmm5"); |
| 121 | |
| 122 | # Add the low half (xmm4). This must be right-shifted by 4 bits. First, |
| 123 | # add into the carry register (xmm3). |
| 124 | &movdqa("xmm5", "xmm4"); |
| 125 | &psllq("xmm5", 60); |
| 126 | &movdqa("xmm6", "xmm5"); |
| 127 | &pslldq("xmm6", 8); |
| 128 | &pxor("xmm3", "xmm6"); |
| 129 | |
| 130 | # Next, add into xmm2. |
| 131 | &psrldq("xmm5", 8); |
| 132 | &pxor("xmm2", "xmm5"); |
| 133 | &psrlq("xmm4", 4); |
| 134 | &pxor("xmm2", "xmm4"); |
| 135 | |
| 136 | &sub("eax", 1); |
| 137 | &jnz(&label("loop_row_$call_counter")); |
| 138 | |
| 139 | # Reduce the carry register. The reduction polynomial is 1 + x + x^2 + |
| 140 | # x^7, so we shift and XOR four times. |
| 141 | &pxor("xmm2", "xmm3"); # x^0 = 0 |
| 142 | &psrlq("xmm3", 1); |
| 143 | &pxor("xmm2", "xmm3"); # x^1 = x |
| 144 | &psrlq("xmm3", 1); |
| 145 | &pxor("xmm2", "xmm3"); # x^(1+1) = x^2 |
| 146 | &psrlq("xmm3", 5); |
| 147 | &pxor("xmm2", "xmm3"); # x^(1+1+5) = x^7 |
| 148 | &pxor("xmm3", "xmm3"); |
| 149 | ____ |
| 150 | } |
| 151 | |
| 152 | # gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|. |
| 153 | # |Xi| is represented in GHASH's serialized byte representation. |Htable| is |
| 154 | # formatted as described above. |
| 155 | # void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]); |
| 156 | &function_begin("gcm_gmult_ssse3"); |
| 157 | &mov($Xi, &wparam(0)); |
| 158 | &mov($Htable, &wparam(1)); |
| 159 | |
| 160 | &movdqu("xmm0", &QWP(0, $Xi)); |
| 161 | &call(&label("pic_point")); |
| 162 | &set_label("pic_point"); |
| 163 | &blindpop("eax"); |
| 164 | &movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax")); |
| 165 | &movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax")); |
| 166 | |
| 167 | # Reverse input bytes to deserialize. |
| 168 | &pshufb("xmm0", "xmm7"); |
| 169 | |
| 170 | # Split each byte into low (xmm0) and high (xmm1) halves. |
| 171 | &movdqa("xmm1", "xmm2"); |
| 172 | &pandn("xmm1", "xmm0"); |
| 173 | &psrld("xmm1", 4); |
| 174 | &pand("xmm0", "xmm2"); |
| 175 | |
| 176 | # Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note |
| 177 | # that, due to bit reversal, xmm3 contains bits that fall off when |
| 178 | # right-shifting, not left-shifting. |
| 179 | &pxor("xmm2", "xmm2"); |
| 180 | &pxor("xmm3", "xmm3"); |
| 181 | |
| 182 | # We must reduce at least once every 7 rows, so divide into three |
| 183 | # chunks. |
| 184 | &process_rows(5); |
| 185 | &process_rows(5); |
| 186 | &process_rows(6); |
| 187 | |
| 188 | # Store the result. Reverse bytes to serialize. |
| 189 | &pshufb("xmm2", "xmm7"); |
| 190 | &movdqu(&QWP(0, $Xi), "xmm2"); |
| 191 | |
| 192 | # Zero any registers which contain secrets. |
| 193 | &pxor("xmm0", "xmm0"); |
| 194 | &pxor("xmm1", "xmm1"); |
| 195 | &pxor("xmm2", "xmm2"); |
| 196 | &pxor("xmm3", "xmm3"); |
| 197 | &pxor("xmm4", "xmm4"); |
| 198 | &pxor("xmm5", "xmm5"); |
| 199 | &pxor("xmm6", "xmm6"); |
| 200 | &function_end("gcm_gmult_ssse3"); |
| 201 | |
| 202 | # gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as |
| 203 | # the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's |
| 204 | # serialized byte representation. |Htable| is formatted as described above. |
| 205 | # void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, |
| 206 | # size_t len); |
| 207 | &function_begin("gcm_ghash_ssse3"); |
| 208 | &mov($Xi, &wparam(0)); |
| 209 | &mov($Htable, &wparam(1)); |
| 210 | &mov($in, &wparam(2)); |
| 211 | &mov($len, &wparam(3)); |
| 212 | |
| 213 | &movdqu("xmm0", &QWP(0, $Xi)); |
| 214 | &call(&label("pic_point")); |
| 215 | &set_label("pic_point"); |
| 216 | &blindpop("ebx"); |
| 217 | &movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx")); |
| 218 | |
| 219 | # This function only processes whole blocks. |
| 220 | &and($len, -16); |
| 221 | |
| 222 | # Reverse input bytes to deserialize. We maintain the running |
| 223 | # total in xmm0. |
| 224 | &pshufb("xmm0", "xmm7"); |
| 225 | |
| 226 | # Iterate over each block. On entry to each iteration, xmm3 is zero. |
| 227 | &pxor("xmm3", "xmm3"); |
| 228 | &set_label("loop_ghash"); |
| 229 | &movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx")); |
| 230 | |
| 231 | # Incorporate the next block of input. |
| 232 | &movdqu("xmm1", &QWP(0, $in)); |
| 233 | &pshufb("xmm1", "xmm7"); # Reverse bytes. |
| 234 | &pxor("xmm0", "xmm1"); |
| 235 | |
| 236 | # Split each byte into low (xmm0) and high (xmm1) halves. |
| 237 | &movdqa("xmm1", "xmm2"); |
| 238 | &pandn("xmm1", "xmm0"); |
| 239 | &psrld("xmm1", 4); |
| 240 | &pand("xmm0", "xmm2"); |
| 241 | |
| 242 | # Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note |
| 243 | # that, due to bit reversal, xmm3 contains bits that fall off when |
| 244 | # right-shifting, not left-shifting. |
| 245 | &pxor("xmm2", "xmm2"); |
| 246 | # xmm3 is already zero at this point. |
| 247 | |
| 248 | # We must reduce at least once every 7 rows, so divide into three |
| 249 | # chunks. |
| 250 | &process_rows(5); |
| 251 | &process_rows(5); |
| 252 | &process_rows(6); |
| 253 | |
| 254 | &movdqa("xmm0", "xmm2"); |
| 255 | |
| 256 | # Rewind $Htable for the next iteration. |
| 257 | &lea($Htable, &DWP(-256, $Htable)); |
| 258 | |
| 259 | # Advance input and continue. |
| 260 | &lea($in, &DWP(16, $in)); |
| 261 | &sub($len, 16); |
| 262 | &jnz(&label("loop_ghash")); |
| 263 | |
| 264 | # Reverse bytes and store the result. |
| 265 | &pshufb("xmm0", "xmm7"); |
| 266 | &movdqu(&QWP(0, $Xi), "xmm0"); |
| 267 | |
| 268 | # Zero any registers which contain secrets. |
| 269 | &pxor("xmm0", "xmm0"); |
| 270 | &pxor("xmm1", "xmm1"); |
| 271 | &pxor("xmm2", "xmm2"); |
| 272 | &pxor("xmm3", "xmm3"); |
| 273 | &pxor("xmm4", "xmm4"); |
| 274 | &pxor("xmm5", "xmm5"); |
| 275 | &pxor("xmm6", "xmm6"); |
| 276 | &function_end("gcm_ghash_ssse3"); |
| 277 | |
| 278 | # reverse_bytes is a permutation which, if applied with pshufb, reverses the |
| 279 | # bytes in an XMM register. |
| 280 | &set_label("reverse_bytes", 16); |
| 281 | &data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); |
| 282 | # low4_mask is an XMM mask which selects the low four bits of each byte. |
| 283 | &set_label("low4_mask", 16); |
| 284 | &data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f); |
| 285 | |
| 286 | &asm_finish(); |
| 287 | |
Pete Bentley | a5c947b | 2019-08-09 14:24:27 +0000 | [diff] [blame^] | 288 | close STDOUT or die "error closing STDOUT"; |