blob: 1dd251992bd968554f13c9d6dd6e7036627d7945 [file] [log] [blame]
Robert Sloan4c22c5f2019-03-01 15:53:37 -08001#!/usr/bin/env perl
2# Copyright (c) 2019, Google Inc.
3#
4# Permission to use, copy, modify, and/or distribute this software for any
5# purpose with or without fee is hereby granted, provided that the above
6# copyright notice and this permission notice appear in all copies.
7#
8# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
11# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
13# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
14# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
16# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit
17# table-based GHASH implementation. It requires SSSE3 instructions.
18#
19# For background, the table-based strategy is a 4-bit windowed multiplication.
20# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
21# over 4-bit windows of the input and indexes them up into the table. Visually,
22# it multiplies as in the schoolbook multiplication diagram below, but with
23# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
24# it incorporates the terms labeled '1' by indexing the most significant term
25# of X into the table. Then it shifts and repeats for '2' and so on.
26#
27# hhhhhh
28# * xxxxxx
29# ============
30# 666666
31# 555555
32# 444444
33# 333333
34# 222222
35# 111111
36#
37# This implementation changes the order. We treat the table as a 16×16 matrix
38# and transpose it. The first row is then the first byte of each multiple of H,
39# and so on. We then reorder terms as below. Observe that the terms labeled '1'
40# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
41# pshufb instruction, using alternating terms of X in parallel as indices. This
42# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
43# repeat for each row.
44#
45# hhhhhh
46# * xxxxxx
47# ============
48# 224466
49# 113355
50# 224466
51# 113355
52# 224466
53# 113355
54#
55# Next we account for GCM's confusing bit order. The "first" bit is the least
56# significant coefficient, but GCM treats the most sigificant bit within a byte
57# as first. Bytes are little-endian, and bits are big-endian. We reverse the
58# bytes in XMM registers for a consistent bit and byte ordering, but this means
59# the least significant bit is the most significant coefficient and vice versa.
60#
61# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
62# bit ordering within the XMM register, rather than the reversed coefficient
63# ordering. Low bits are less significant bits and more significant
64# coefficients. Right-shifts move from MSB to the LSB and correspond to
65# increasing the power of each coefficient.
66#
67# Note this bit reversal enters into the table's column indices. H*1 is stored
68# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
69# table rows contain more significant coefficients, so we iterate forwards.
70
71use strict;
72
73my $flavour = shift;
74my $output = shift;
75if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
76
77my $win64 = 0;
78$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
79
80$0 =~ m/(.*[\/\\])[^\/\\]+$/;
81my $dir = $1;
82my $xlate;
83( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
84( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
85die "can't locate x86_64-xlate.pl";
86
87open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
88*STDOUT = *OUT;
89
90my ($Xi, $Htable, $in, $len) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") :
91 ("%rdi", "%rsi", "%rdx", "%rcx");
92
93
94my $code = <<____;
95.text
96
97# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
98# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
99# formatted as described above.
100# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
101.type gcm_gmult_ssse3, \@abi-omnipotent
102.globl gcm_gmult_ssse3
103.align 16
104gcm_gmult_ssse3:
105.cfi_startproc
106.Lgmult_seh_begin:
107____
108$code .= <<____ if ($win64);
109 subq \$40, %rsp
110.Lgmult_seh_allocstack:
111 movdqa %xmm6, (%rsp)
112.Lgmult_seh_save_xmm6:
113 movdqa %xmm10, 16(%rsp)
114.Lgmult_seh_save_xmm10:
115.Lgmult_seh_prolog_end:
116____
117$code .= <<____;
118 movdqu ($Xi), %xmm0
119 movdqa .Lreverse_bytes(%rip), %xmm10
120 movdqa .Llow4_mask(%rip), %xmm2
121
122 # Reverse input bytes to deserialize.
123 pshufb %xmm10, %xmm0
124
125 # Split each byte into low (%xmm0) and high (%xmm1) halves.
126 movdqa %xmm2, %xmm1
127 pandn %xmm0, %xmm1
128 psrld \$4, %xmm1
129 pand %xmm2, %xmm0
130
131 # Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
132 # that, due to bit reversal, %xmm3 contains bits that fall off when
133 # right-shifting, not left-shifting.
134 pxor %xmm2, %xmm2
135 pxor %xmm3, %xmm3
136____
137
138my $call_counter = 0;
139# process_rows returns assembly code to process $rows rows of the table. On
140# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the
141# low and high halves of the input. The result so far is passed in %xmm2. %xmm3
142# must be zero. On output, $Htable is advanced to the next row and %xmm2 is
143# updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6.
144sub process_rows {
145 my ($rows) = @_;
146 $call_counter++;
147
148 # Shifting whole XMM registers by bits is complex. psrldq shifts by bytes,
149 # and psrlq shifts the two 64-bit halves separately. Each row produces 8
150 # bits of carry, and the reduction needs an additional 7-bit shift. This
151 # must fit in 64 bits so reduction can use psrlq. This allows up to 7 rows
152 # at a time.
153 die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
154
155 return <<____;
156 movq \$$rows, %rax
157.Loop_row_$call_counter:
158 movdqa ($Htable), %xmm4
159 leaq 16($Htable), $Htable
160
161 # Right-shift %xmm2 and %xmm3 by 8 bytes.
162 movdqa %xmm2, %xmm6
163 palignr \$1, %xmm3, %xmm6
164 movdqa %xmm6, %xmm3
165 psrldq \$1, %xmm2
166
167 # Load the next table row and index the low and high bits of the input.
168 # Note the low (respectively, high) half corresponds to more
169 # (respectively, less) significant coefficients.
170 movdqa %xmm4, %xmm5
171 pshufb %xmm0, %xmm4
172 pshufb %xmm1, %xmm5
173
174 # Add the high half (%xmm5) without shifting.
175 pxor %xmm5, %xmm2
176
177 # Add the low half (%xmm4). This must be right-shifted by 4 bits. First,
178 # add into the carry register (%xmm3).
179 movdqa %xmm4, %xmm5
180 psllq \$60, %xmm5
181 movdqa %xmm5, %xmm6
182 pslldq \$8, %xmm6
183 pxor %xmm6, %xmm3
184
185 # Next, add into %xmm2.
186 psrldq \$8, %xmm5
187 pxor %xmm5, %xmm2
188 psrlq \$4, %xmm4
189 pxor %xmm4, %xmm2
190
191 subq \$1, %rax
192 jnz .Loop_row_$call_counter
193
194 # Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
195 # x^7, so we shift and XOR four times.
196 pxor %xmm3, %xmm2 # x^0 = 0
197 psrlq \$1, %xmm3
198 pxor %xmm3, %xmm2 # x^1 = x
199 psrlq \$1, %xmm3
200 pxor %xmm3, %xmm2 # x^(1+1) = x^2
201 psrlq \$5, %xmm3
202 pxor %xmm3, %xmm2 # x^(1+1+5) = x^7
203 pxor %xmm3, %xmm3
204____
205}
206
207# We must reduce at least once every 7 rows, so divide into three chunks.
208$code .= process_rows(5);
209$code .= process_rows(5);
210$code .= process_rows(6);
211
212$code .= <<____;
213 # Store the result. Reverse bytes to serialize.
214 pshufb %xmm10, %xmm2
215 movdqu %xmm2, ($Xi)
216
217 # Zero any registers which contain secrets.
218 pxor %xmm0, %xmm0
219 pxor %xmm1, %xmm1
220 pxor %xmm2, %xmm2
221 pxor %xmm3, %xmm3
222 pxor %xmm4, %xmm4
223 pxor %xmm5, %xmm5
224 pxor %xmm6, %xmm6
225____
226$code .= <<____ if ($win64);
227 movdqa (%rsp), %xmm6
228 movdqa 16(%rsp), %xmm10
229 addq \$40, %rsp
230____
231$code .= <<____;
232 ret
233.Lgmult_seh_end:
234.cfi_endproc
235.size gcm_gmult_ssse3,.-gcm_gmult_ssse3
236____
237
238$code .= <<____;
239# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
240# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
241# serialized byte representation. |Htable| is formatted as described above.
242# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
243# size_t len);
244.type gcm_ghash_ssse3, \@abi-omnipotent
245.globl gcm_ghash_ssse3
246.align 16
247gcm_ghash_ssse3:
248.Lghash_seh_begin:
249.cfi_startproc
250____
251$code .= <<____ if ($win64);
252 subq \$56, %rsp
253.Lghash_seh_allocstack:
254 movdqa %xmm6, (%rsp)
255.Lghash_seh_save_xmm6:
256 movdqa %xmm10, 16(%rsp)
257.Lghash_seh_save_xmm10:
258 movdqa %xmm11, 32(%rsp)
259.Lghash_seh_save_xmm11:
260.Lghash_seh_prolog_end:
261____
262$code .= <<____;
263 movdqu ($Xi), %xmm0
264 movdqa .Lreverse_bytes(%rip), %xmm10
265 movdqa .Llow4_mask(%rip), %xmm11
266
267 # This function only processes whole blocks.
268 andq \$-16, $len
269
270 # Reverse input bytes to deserialize. We maintain the running
271 # total in %xmm0.
272 pshufb %xmm10, %xmm0
273
274 # Iterate over each block. On entry to each iteration, %xmm3 is zero.
275 pxor %xmm3, %xmm3
276.Loop_ghash:
277 # Incorporate the next block of input.
278 movdqu ($in), %xmm1
279 pshufb %xmm10, %xmm1 # Reverse bytes.
280 pxor %xmm1, %xmm0
281
282 # Split each byte into low (%xmm0) and high (%xmm1) halves.
283 movdqa %xmm11, %xmm1
284 pandn %xmm0, %xmm1
285 psrld \$4, %xmm1
286 pand %xmm11, %xmm0
287
288 # Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
289 # that, due to bit reversal, %xmm3 contains bits that fall off when
290 # right-shifting, not left-shifting.
291 pxor %xmm2, %xmm2
292 # %xmm3 is already zero at this point.
293____
294
295# We must reduce at least once every 7 rows, so divide into three chunks.
296$code .= process_rows(5);
297$code .= process_rows(5);
298$code .= process_rows(6);
299
300$code .= <<____;
301 movdqa %xmm2, %xmm0
302
303 # Rewind $Htable for the next iteration.
304 leaq -256($Htable), $Htable
305
306 # Advance input and continue.
307 leaq 16($in), $in
308 subq \$16, $len
309 jnz .Loop_ghash
310
311 # Reverse bytes and store the result.
312 pshufb %xmm10, %xmm0
313 movdqu %xmm0, ($Xi)
314
315 # Zero any registers which contain secrets.
316 pxor %xmm0, %xmm0
317 pxor %xmm1, %xmm1
318 pxor %xmm2, %xmm2
319 pxor %xmm3, %xmm3
320 pxor %xmm4, %xmm4
321 pxor %xmm5, %xmm5
322 pxor %xmm6, %xmm6
323____
324$code .= <<____ if ($win64);
325 movdqa (%rsp), %xmm6
326 movdqa 16(%rsp), %xmm10
327 movdqa 32(%rsp), %xmm11
328 addq \$56, %rsp
329____
330$code .= <<____;
331 ret
332.Lghash_seh_end:
333.cfi_endproc
334.size gcm_ghash_ssse3,.-gcm_ghash_ssse3
335
336.align 16
337# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the
338# bytes in an XMM register.
339.Lreverse_bytes:
340.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
341# .Llow4_mask is an XMM mask which selects the low four bits of each byte.
342.Llow4_mask:
343.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
344____
345
346if ($win64) {
347 # Add unwind metadata for SEH.
348 #
349 # TODO(davidben): This is all manual right now. Once we've added SEH tests,
350 # add support for emitting these in x86_64-xlate.pl, probably based on MASM
351 # and Yasm's unwind directives, and unify with CFI. Then upstream it to
352 # replace the error-prone and non-standard custom handlers.
353
354 # See https://docs.microsoft.com/en-us/cpp/build/struct-unwind-code?view=vs-2017
355 my $UWOP_ALLOC_SMALL = 2;
356 my $UWOP_SAVE_XMM128 = 8;
357
358 $code .= <<____;
359.section .pdata
360.align 4
361 .rva .Lgmult_seh_begin
362 .rva .Lgmult_seh_end
363 .rva .Lgmult_seh_info
364
365 .rva .Lghash_seh_begin
366 .rva .Lghash_seh_end
367 .rva .Lghash_seh_info
368
369.section .xdata
370.align 8
371.Lgmult_seh_info:
372 .byte 1 # version 1, no flags
373 .byte .Lgmult_seh_prolog_end-.Lgmult_seh_begin
374 .byte 5 # num_slots = 1 + 2 + 2
375 .byte 0 # no frame register
376
377 .byte .Lgmult_seh_save_xmm10-.Lgmult_seh_begin
378 .byte @{[$UWOP_SAVE_XMM128 | (10 << 4)]}
379 .value 1
380
381 .byte .Lgmult_seh_save_xmm6-.Lgmult_seh_begin
382 .byte @{[$UWOP_SAVE_XMM128 | (6 << 4)]}
383 .value 0
384
385 .byte .Lgmult_seh_allocstack-.Lgmult_seh_begin
386 .byte @{[$UWOP_ALLOC_SMALL | (((40 - 8) / 8) << 4)]}
387
388.align 8
389.Lghash_seh_info:
390 .byte 1 # version 1, no flags
391 .byte .Lghash_seh_prolog_end-.Lghash_seh_begin
392 .byte 7 # num_slots = 1 + 2 + 2 + 2
393 .byte 0 # no frame register
394
395 .byte .Lghash_seh_save_xmm11-.Lghash_seh_begin
396 .byte @{[$UWOP_SAVE_XMM128 | (11 << 4)]}
397 .value 2
398
399 .byte .Lghash_seh_save_xmm10-.Lghash_seh_begin
400 .byte @{[$UWOP_SAVE_XMM128 | (10 << 4)]}
401 .value 1
402
403 .byte .Lghash_seh_save_xmm6-.Lghash_seh_begin
404 .byte @{[$UWOP_SAVE_XMM128 | (6 << 4)]}
405 .value 0
406
407 .byte .Lghash_seh_allocstack-.Lghash_seh_begin
408 .byte @{[$UWOP_ALLOC_SMALL | (((56 - 8) / 8) << 4)]}
409____
410}
411
412print $code;
Srinivas Paladugudd42a612019-08-09 19:30:39 +0000413close STDOUT;