Blame - src/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl - platform/external/boringssl

blob: 1dd251992bd968554f13c9d6dd6e7036627d7945 [file] [log] [blame]

Robert Sloan	4c22c5f	2019-03-01 15:53:37 -0800	[diff] [blame]	1	#!/usr/bin/env perl
				2	# Copyright (c) 2019, Google Inc.
				3	#
				4	# Permission to use, copy, modify, and/or distribute this software for any
				5	# purpose with or without fee is hereby granted, provided that the above
				6	# copyright notice and this permission notice appear in all copies.
				7	#
				8	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
				9	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
				10	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
				11	# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				12	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
				13	# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
				14	# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				15
				16	# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit
				17	# table-based GHASH implementation. It requires SSSE3 instructions.
				18	#
				19	# For background, the table-based strategy is a 4-bit windowed multiplication.
				20	# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
				21	# over 4-bit windows of the input and indexes them up into the table. Visually,
				22	# it multiplies as in the schoolbook multiplication diagram below, but with
				23	# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
				24	# it incorporates the terms labeled '1' by indexing the most significant term
				25	# of X into the table. Then it shifts and repeats for '2' and so on.
				26	#
				27	# hhhhhh
				28	# * xxxxxx
				29	# ============
				30	# 666666
				31	# 555555
				32	# 444444
				33	# 333333
				34	# 222222
				35	# 111111
				36	#
				37	# This implementation changes the order. We treat the table as a 16×16 matrix
				38	# and transpose it. The first row is then the first byte of each multiple of H,
				39	# and so on. We then reorder terms as below. Observe that the terms labeled '1'
				40	# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
				41	# pshufb instruction, using alternating terms of X in parallel as indices. This
				42	# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
				43	# repeat for each row.
				44	#
				45	# hhhhhh
				46	# * xxxxxx
				47	# ============
				48	# 224466
				49	# 113355
				50	# 224466
				51	# 113355
				52	# 224466
				53	# 113355
				54	#
				55	# Next we account for GCM's confusing bit order. The "first" bit is the least
				56	# significant coefficient, but GCM treats the most sigificant bit within a byte
				57	# as first. Bytes are little-endian, and bits are big-endian. We reverse the
				58	# bytes in XMM registers for a consistent bit and byte ordering, but this means
				59	# the least significant bit is the most significant coefficient and vice versa.
				60	#
				61	# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
				62	# bit ordering within the XMM register, rather than the reversed coefficient
				63	# ordering. Low bits are less significant bits and more significant
				64	# coefficients. Right-shifts move from MSB to the LSB and correspond to
				65	# increasing the power of each coefficient.
				66	#
				67	# Note this bit reversal enters into the table's column indices. H*1 is stored
				68	# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
				69	# table rows contain more significant coefficients, so we iterate forwards.
				70
				71	use strict;
				72
				73	my $flavour = shift;
				74	my $output = shift;
				75	if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
				76
				77	my $win64 = 0;
				78	$win64 = 1 if ($flavour =~ /[nm]asm\|mingw64/ \|\| $output =~ /\.asm$/);
				79
				80	$0 =~ m/(.*[\/\\])[^\/\\]+$/;
				81	my $dir = $1;
				82	my $xlate;
				83	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
				84	( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
				85	die "can't locate x86_64-xlate.pl";
				86
				87	open OUT, "\| \"$^X\" \"$xlate\" $flavour \"$output\"";
				88	STDOUT = OUT;
				89
				90	my ($Xi, $Htable, $in, $len) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") :
				91	("%rdi", "%rsi", "%rdx", "%rcx");
				92
				93
				94	my $code = <<____;
				95	.text
				96
				97	# gcm_gmult_ssse3 multiplies \|Xi\| by \|Htable\| and writes the result to \|Xi\|.
				98	# \|Xi\| is represented in GHASH's serialized byte representation. \|Htable\| is
				99	# formatted as described above.
				100	# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
				101	.type gcm_gmult_ssse3, \@abi-omnipotent
				102	.globl gcm_gmult_ssse3
				103	.align 16
				104	gcm_gmult_ssse3:
				105	.cfi_startproc
				106	.Lgmult_seh_begin:
				107	____
				108	$code .= <<____ if ($win64);
				109	subq \$40, %rsp
				110	.Lgmult_seh_allocstack:
				111	movdqa %xmm6, (%rsp)
				112	.Lgmult_seh_save_xmm6:
				113	movdqa %xmm10, 16(%rsp)
				114	.Lgmult_seh_save_xmm10:
				115	.Lgmult_seh_prolog_end:
				116	____
				117	$code .= <<____;
				118	movdqu ($Xi), %xmm0
				119	movdqa .Lreverse_bytes(%rip), %xmm10
				120	movdqa .Llow4_mask(%rip), %xmm2
				121
				122	# Reverse input bytes to deserialize.
				123	pshufb %xmm10, %xmm0
				124
				125	# Split each byte into low (%xmm0) and high (%xmm1) halves.
				126	movdqa %xmm2, %xmm1
				127	pandn %xmm0, %xmm1
				128	psrld \$4, %xmm1
				129	pand %xmm2, %xmm0
				130
				131	# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
				132	# that, due to bit reversal, %xmm3 contains bits that fall off when
				133	# right-shifting, not left-shifting.
				134	pxor %xmm2, %xmm2
				135	pxor %xmm3, %xmm3
				136	____
				137
				138	my $call_counter = 0;
				139	# process_rows returns assembly code to process $rows rows of the table. On
				140	# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the
				141	# low and high halves of the input. The result so far is passed in %xmm2. %xmm3
				142	# must be zero. On output, $Htable is advanced to the next row and %xmm2 is
				143	# updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6.
				144	sub process_rows {
				145	my ($rows) = @_;
				146	$call_counter++;
				147
				148	# Shifting whole XMM registers by bits is complex. psrldq shifts by bytes,
				149	# and psrlq shifts the two 64-bit halves separately. Each row produces 8
				150	# bits of carry, and the reduction needs an additional 7-bit shift. This
				151	# must fit in 64 bits so reduction can use psrlq. This allows up to 7 rows
				152	# at a time.
				153	die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
				154
				155	return <<____;
				156	movq \$$rows, %rax
				157	.Loop_row_$call_counter:
				158	movdqa ($Htable), %xmm4
				159	leaq 16($Htable), $Htable
				160
				161	# Right-shift %xmm2 and %xmm3 by 8 bytes.
				162	movdqa %xmm2, %xmm6
				163	palignr \$1, %xmm3, %xmm6
				164	movdqa %xmm6, %xmm3
				165	psrldq \$1, %xmm2
				166
				167	# Load the next table row and index the low and high bits of the input.
				168	# Note the low (respectively, high) half corresponds to more
				169	# (respectively, less) significant coefficients.
				170	movdqa %xmm4, %xmm5
				171	pshufb %xmm0, %xmm4
				172	pshufb %xmm1, %xmm5
				173
				174	# Add the high half (%xmm5) without shifting.
				175	pxor %xmm5, %xmm2
				176
				177	# Add the low half (%xmm4). This must be right-shifted by 4 bits. First,
				178	# add into the carry register (%xmm3).
				179	movdqa %xmm4, %xmm5
				180	psllq \$60, %xmm5
				181	movdqa %xmm5, %xmm6
				182	pslldq \$8, %xmm6
				183	pxor %xmm6, %xmm3
				184
				185	# Next, add into %xmm2.
				186	psrldq \$8, %xmm5
				187	pxor %xmm5, %xmm2
				188	psrlq \$4, %xmm4
				189	pxor %xmm4, %xmm2
				190
				191	subq \$1, %rax
				192	jnz .Loop_row_$call_counter
				193
				194	# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
				195	# x^7, so we shift and XOR four times.
				196	pxor %xmm3, %xmm2 # x^0 = 0
				197	psrlq \$1, %xmm3
				198	pxor %xmm3, %xmm2 # x^1 = x
				199	psrlq \$1, %xmm3
				200	pxor %xmm3, %xmm2 # x^(1+1) = x^2
				201	psrlq \$5, %xmm3
				202	pxor %xmm3, %xmm2 # x^(1+1+5) = x^7
				203	pxor %xmm3, %xmm3
				204	____
				205	}
				206
				207	# We must reduce at least once every 7 rows, so divide into three chunks.
				208	$code .= process_rows(5);
				209	$code .= process_rows(5);
				210	$code .= process_rows(6);
				211
				212	$code .= <<____;
				213	# Store the result. Reverse bytes to serialize.
				214	pshufb %xmm10, %xmm2
				215	movdqu %xmm2, ($Xi)
				216
				217	# Zero any registers which contain secrets.
				218	pxor %xmm0, %xmm0
				219	pxor %xmm1, %xmm1
				220	pxor %xmm2, %xmm2
				221	pxor %xmm3, %xmm3
				222	pxor %xmm4, %xmm4
				223	pxor %xmm5, %xmm5
				224	pxor %xmm6, %xmm6
				225	____
				226	$code .= <<____ if ($win64);
				227	movdqa (%rsp), %xmm6
				228	movdqa 16(%rsp), %xmm10
				229	addq \$40, %rsp
				230	____
				231	$code .= <<____;
				232	ret
				233	.Lgmult_seh_end:
				234	.cfi_endproc
				235	.size gcm_gmult_ssse3,.-gcm_gmult_ssse3
				236	____
				237
				238	$code .= <<____;
				239	# gcm_ghash_ssse3 incorporates \|len\| bytes from \|in\| to \|Xi\|, using \|Htable\| as
				240	# the key. It writes the result back to \|Xi\|. \|Xi\| is represented in GHASH's
				241	# serialized byte representation. \|Htable\| is formatted as described above.
				242	# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
				243	# size_t len);
				244	.type gcm_ghash_ssse3, \@abi-omnipotent
				245	.globl gcm_ghash_ssse3
				246	.align 16
				247	gcm_ghash_ssse3:
				248	.Lghash_seh_begin:
				249	.cfi_startproc
				250	____
				251	$code .= <<____ if ($win64);
				252	subq \$56, %rsp
				253	.Lghash_seh_allocstack:
				254	movdqa %xmm6, (%rsp)
				255	.Lghash_seh_save_xmm6:
				256	movdqa %xmm10, 16(%rsp)
				257	.Lghash_seh_save_xmm10:
				258	movdqa %xmm11, 32(%rsp)
				259	.Lghash_seh_save_xmm11:
				260	.Lghash_seh_prolog_end:
				261	____
				262	$code .= <<____;
				263	movdqu ($Xi), %xmm0
				264	movdqa .Lreverse_bytes(%rip), %xmm10
				265	movdqa .Llow4_mask(%rip), %xmm11
				266
				267	# This function only processes whole blocks.
				268	andq \$-16, $len
				269
				270	# Reverse input bytes to deserialize. We maintain the running
				271	# total in %xmm0.
				272	pshufb %xmm10, %xmm0
				273
				274	# Iterate over each block. On entry to each iteration, %xmm3 is zero.
				275	pxor %xmm3, %xmm3
				276	.Loop_ghash:
				277	# Incorporate the next block of input.
				278	movdqu ($in), %xmm1
				279	pshufb %xmm10, %xmm1 # Reverse bytes.
				280	pxor %xmm1, %xmm0
				281
				282	# Split each byte into low (%xmm0) and high (%xmm1) halves.
				283	movdqa %xmm11, %xmm1
				284	pandn %xmm0, %xmm1
				285	psrld \$4, %xmm1
				286	pand %xmm11, %xmm0
				287
				288	# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
				289	# that, due to bit reversal, %xmm3 contains bits that fall off when
				290	# right-shifting, not left-shifting.
				291	pxor %xmm2, %xmm2
				292	# %xmm3 is already zero at this point.
				293	____
				294
				295	# We must reduce at least once every 7 rows, so divide into three chunks.
				296	$code .= process_rows(5);
				297	$code .= process_rows(5);
				298	$code .= process_rows(6);
				299
				300	$code .= <<____;
				301	movdqa %xmm2, %xmm0
				302
				303	# Rewind $Htable for the next iteration.
				304	leaq -256($Htable), $Htable
				305
				306	# Advance input and continue.
				307	leaq 16($in), $in
				308	subq \$16, $len
				309	jnz .Loop_ghash
				310
				311	# Reverse bytes and store the result.
				312	pshufb %xmm10, %xmm0
				313	movdqu %xmm0, ($Xi)
				314
				315	# Zero any registers which contain secrets.
				316	pxor %xmm0, %xmm0
				317	pxor %xmm1, %xmm1
				318	pxor %xmm2, %xmm2
				319	pxor %xmm3, %xmm3
				320	pxor %xmm4, %xmm4
				321	pxor %xmm5, %xmm5
				322	pxor %xmm6, %xmm6
				323	____
				324	$code .= <<____ if ($win64);
				325	movdqa (%rsp), %xmm6
				326	movdqa 16(%rsp), %xmm10
				327	movdqa 32(%rsp), %xmm11
				328	addq \$56, %rsp
				329	____
				330	$code .= <<____;
				331	ret
				332	.Lghash_seh_end:
				333	.cfi_endproc
				334	.size gcm_ghash_ssse3,.-gcm_ghash_ssse3
				335
				336	.align 16
				337	# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the
				338	# bytes in an XMM register.
				339	.Lreverse_bytes:
				340	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
				341	# .Llow4_mask is an XMM mask which selects the low four bits of each byte.
				342	.Llow4_mask:
				343	.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
				344	____
				345
				346	if ($win64) {
				347	# Add unwind metadata for SEH.
				348	#
				349	# TODO(davidben): This is all manual right now. Once we've added SEH tests,
				350	# add support for emitting these in x86_64-xlate.pl, probably based on MASM
				351	# and Yasm's unwind directives, and unify with CFI. Then upstream it to
				352	# replace the error-prone and non-standard custom handlers.
				353
				354	# See https://docs.microsoft.com/en-us/cpp/build/struct-unwind-code?view=vs-2017
				355	my $UWOP_ALLOC_SMALL = 2;
				356	my $UWOP_SAVE_XMM128 = 8;
				357
				358	$code .= <<____;
				359	.section .pdata
				360	.align 4
				361	.rva .Lgmult_seh_begin
				362	.rva .Lgmult_seh_end
				363	.rva .Lgmult_seh_info
				364
				365	.rva .Lghash_seh_begin
				366	.rva .Lghash_seh_end
				367	.rva .Lghash_seh_info
				368
				369	.section .xdata
				370	.align 8
				371	.Lgmult_seh_info:
				372	.byte 1 # version 1, no flags
				373	.byte .Lgmult_seh_prolog_end-.Lgmult_seh_begin
				374	.byte 5 # num_slots = 1 + 2 + 2
				375	.byte 0 # no frame register
				376
				377	.byte .Lgmult_seh_save_xmm10-.Lgmult_seh_begin
				378	.byte @{[$UWOP_SAVE_XMM128 \| (10 << 4)]}
				379	.value 1
				380
				381	.byte .Lgmult_seh_save_xmm6-.Lgmult_seh_begin
				382	.byte @{[$UWOP_SAVE_XMM128 \| (6 << 4)]}
				383	.value 0
				384
				385	.byte .Lgmult_seh_allocstack-.Lgmult_seh_begin
				386	.byte @{[$UWOP_ALLOC_SMALL \| (((40 - 8) / 8) << 4)]}
				387
				388	.align 8
				389	.Lghash_seh_info:
				390	.byte 1 # version 1, no flags
				391	.byte .Lghash_seh_prolog_end-.Lghash_seh_begin
				392	.byte 7 # num_slots = 1 + 2 + 2 + 2
				393	.byte 0 # no frame register
				394
				395	.byte .Lghash_seh_save_xmm11-.Lghash_seh_begin
				396	.byte @{[$UWOP_SAVE_XMM128 \| (11 << 4)]}
				397	.value 2
				398
				399	.byte .Lghash_seh_save_xmm10-.Lghash_seh_begin
				400	.byte @{[$UWOP_SAVE_XMM128 \| (10 << 4)]}
				401	.value 1
				402
				403	.byte .Lghash_seh_save_xmm6-.Lghash_seh_begin
				404	.byte @{[$UWOP_SAVE_XMM128 \| (6 << 4)]}
				405	.value 0
				406
				407	.byte .Lghash_seh_allocstack-.Lghash_seh_begin
				408	.byte @{[$UWOP_ALLOC_SMALL \| (((56 - 8) / 8) << 4)]}
				409	____
				410	}
				411
				412	print $code;
Srinivas Paladugu	dd42a61	2019-08-09 19:30:39 +0000	[diff] [blame^]	413	close STDOUT;