#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# The OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp., who verified that it reduces shuffle
# pressure with a notable relative improvement, achieving 1.0 cycle per
# byte processed with a 128-bit key on Haswell, 0.74 on Broadwell and
# 0.63 on Skylake. [Mentioned results are raw profiled measurements
# for a favourable packet size, one divisible by 96. Applications
# using the EVP interface will observe a few percent worse
# performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate ) or
die "can't locate x86_64-xlate.pl";

# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will
# be computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# The upstream code uses the condition |$avx>1| even though no AVX2
# instructions are used, because it assumes MOVBE is supported by the assembler
# if and only if AVX2 is also supported by the assembler; see
# https://marc.info/?l=openssl-dev&m=146567589526984&w=2.
$avx = 2;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# See the comment above regarding why the condition is ($avx>1) when there are
# no AVX2 instructions being used.
if ($avx>1) {{{

($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");

$code=<<___;
.text

.type _aesni_ctr32_ghash_6x,\@abi-omnipotent
.align 32
_aesni_ctr32_ghash_6x:
.cfi_startproc
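	# Stitched core loop. Each iteration of .Loop6x below encrypts six
	# counter blocks with AES-NI while folding the previous six
	# ciphertext blocks (stored byte-swapped on the stack) into the
	# GHASH accumulator $Xi, keeping the AES and PCLMULQDQ units busy
	# in parallel.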
	vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
	sub \$6,$len
	vpxor $Z0,$Z0,$Z0 # $Z0 = 0
	vmovdqu 0x00-0x80($key),$rndkey
	vpaddb $T2,$T1,$inout1
	vpaddb $T2,$inout1,$inout2
	vpaddb $T2,$inout2,$inout3
	vpaddb $T2,$inout3,$inout4
	vpaddb $T2,$inout4,$inout5
	vpxor $rndkey,$T1,$inout0
	vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0
	jmp .Loop6x

.align 32
.Loop6x:
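	# $counter holds the last four IV bytes loaded as a little-endian
	# word, so the low byte of the big-endian GCM counter lands in bits
	# 24-31. Adding 6<<24 advances the counter by 6; a carry out of bit
	# 31 means the low byte wrapped, in which case the byte-wise vpaddb
	# increments used on the fast path would be wrong, and we take
	# .Lhandle_ctr32 to redo the increments with full 32-bit adds.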
	add \$`6<<24`,$counter
	jc .Lhandle_ctr32 # discard $inout[1-5]?
	vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
	vpaddb $T2,$inout5,$T1 # next counter value
	vpxor $rndkey,$inout1,$inout1
	vpxor $rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu $T1,($ivp) # save next counter value
	vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
	vpxor $rndkey,$inout3,$inout3
	vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
	vpclmulqdq \$0x01,$Hkey,$Z3,$Z2

	# At this point, the current block of 96 (0x60) bytes has already been
	# loaded into registers. Concurrently with processing it, we want to
	# load the next 96 bytes of input for the next round. Obviously, we can
	# only do this if there are at least 96 more bytes of input beyond the
	# input we're currently processing, or else we'd read past the end of
	# the input buffer. Here, we set |%r12| to 96 if there are at least 96
	# bytes of input beyond the 96 bytes we're already processing, and we
	# set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
	# we'll read in the next block so that it is in registers for the next
	# loop iteration. In the case where we set |%r12| to 0, we'll re-read
	# the current block and then ignore what we re-read.
	#
	# At this point, |$in0| points to the current (already read into
	# registers) block, and |$end0| points to 2*96 bytes before the end of
	# the input. Thus, |$in0| > |$end0| means that we do not have the next
	# 96-byte block to read in, and |$in0| <= |$end0| means we do.
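	#
	# A sketch of the branchless selection performed by the scattered
	# xor/cmp/setnc/neg/and/lea sequence below, in C-like pseudocode
	# (for illustration only):
	#
	#	r12 = (in0 <= end0) ? 0x60 : 0;
	#	in0 += r12;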
	xor %r12,%r12
	cmp $in0,$end0

	vaesenc $T2,$inout0,$inout0
	vmovdqu 0x30+8(%rsp),$Ii # I[4]
	vpxor $rndkey,$inout4,$inout4
	vpclmulqdq \$0x00,$Hkey,$Z3,$T1
	vaesenc $T2,$inout1,$inout1
	vpxor $rndkey,$inout5,$inout5
	setnc %r12b
	vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
	vaesenc $T2,$inout2,$inout2
	vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2
	neg %r12
	vaesenc $T2,$inout3,$inout3
	vpxor $Z1,$Z2,$Z2
	vpclmulqdq \$0x00,$Hkey,$Ii,$Z1
	vpxor $Z0,$Xi,$Xi # modulo-scheduled
	vaesenc $T2,$inout4,$inout4
	vpxor $Z1,$T1,$Z0
	and \$0x60,%r12
	vmovups 0x20-0x80($key),$rndkey
	vpclmulqdq \$0x10,$Hkey,$Ii,$T1
	vaesenc $T2,$inout5,$inout5

	vpclmulqdq \$0x01,$Hkey,$Ii,$T2
	lea ($in0,%r12),$in0
	vaesenc $rndkey,$inout0,$inout0
	vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey
	vmovdqu 0x40+8(%rsp),$Ii # I[3]
	vaesenc $rndkey,$inout1,$inout1
	movbe 0x58($in0),%r13
	vaesenc $rndkey,$inout2,$inout2
	movbe 0x50($in0),%r12
	vaesenc $rndkey,$inout3,$inout3
	mov %r13,0x20+8(%rsp)
	vaesenc $rndkey,$inout4,$inout4
	mov %r12,0x28+8(%rsp)
	vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3
	vaesenc $rndkey,$inout5,$inout5

	vmovups 0x30-0x80($key),$rndkey
	vpxor $T1,$Z2,$Z2
	vpclmulqdq \$0x00,$Z1,$Ii,$T1
	vaesenc $rndkey,$inout0,$inout0
	vpxor $T2,$Z2,$Z2
	vpclmulqdq \$0x10,$Z1,$Ii,$T2
	vaesenc $rndkey,$inout1,$inout1
	vpxor $Hkey,$Z3,$Z3
	vpclmulqdq \$0x01,$Z1,$Ii,$Hkey
	vaesenc $rndkey,$inout2,$inout2
	vpclmulqdq \$0x11,$Z1,$Ii,$Z1
	vmovdqu 0x50+8(%rsp),$Ii # I[2]
	vaesenc $rndkey,$inout3,$inout3
	vaesenc $rndkey,$inout4,$inout4
	vpxor $T1,$Z0,$Z0
	vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4
	vaesenc $rndkey,$inout5,$inout5

	vmovups 0x40-0x80($key),$rndkey
	vpxor $T2,$Z2,$Z2
	vpclmulqdq \$0x00,$T1,$Ii,$T2
	vaesenc $rndkey,$inout0,$inout0
	vpxor $Hkey,$Z2,$Z2
	vpclmulqdq \$0x10,$T1,$Ii,$Hkey
	vaesenc $rndkey,$inout1,$inout1
	movbe 0x48($in0),%r13
	vpxor $Z1,$Z3,$Z3
	vpclmulqdq \$0x01,$T1,$Ii,$Z1
	vaesenc $rndkey,$inout2,$inout2
	movbe 0x40($in0),%r12
	vpclmulqdq \$0x11,$T1,$Ii,$T1
	vmovdqu 0x60+8(%rsp),$Ii # I[1]
	vaesenc $rndkey,$inout3,$inout3
	mov %r13,0x30+8(%rsp)
	vaesenc $rndkey,$inout4,$inout4
	mov %r12,0x38+8(%rsp)
	vpxor $T2,$Z0,$Z0
	vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5
	vaesenc $rndkey,$inout5,$inout5

	vmovups 0x50-0x80($key),$rndkey
	vpxor $Hkey,$Z2,$Z2
	vpclmulqdq \$0x00,$T2,$Ii,$Hkey
	vaesenc $rndkey,$inout0,$inout0
	vpxor $Z1,$Z2,$Z2
	vpclmulqdq \$0x10,$T2,$Ii,$Z1
	vaesenc $rndkey,$inout1,$inout1
	movbe 0x38($in0),%r13
	vpxor $T1,$Z3,$Z3
	vpclmulqdq \$0x01,$T2,$Ii,$T1
	vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0]
	vaesenc $rndkey,$inout2,$inout2
	movbe 0x30($in0),%r12
	vpclmulqdq \$0x11,$T2,$Ii,$T2
	vaesenc $rndkey,$inout3,$inout3
	mov %r13,0x40+8(%rsp)
	vaesenc $rndkey,$inout4,$inout4
	mov %r12,0x48+8(%rsp)
	vpxor $Hkey,$Z0,$Z0
	vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6
	vaesenc $rndkey,$inout5,$inout5

	vmovups 0x60-0x80($key),$rndkey
	vpxor $Z1,$Z2,$Z2
	vpclmulqdq \$0x10,$Hkey,$Xi,$Z1
	vaesenc $rndkey,$inout0,$inout0
	vpxor $T1,$Z2,$Z2
	vpclmulqdq \$0x01,$Hkey,$Xi,$T1
	vaesenc $rndkey,$inout1,$inout1
	movbe 0x28($in0),%r13
	vpxor $T2,$Z3,$Z3
	vpclmulqdq \$0x00,$Hkey,$Xi,$T2
	vaesenc $rndkey,$inout2,$inout2
	movbe 0x20($in0),%r12
	vpclmulqdq \$0x11,$Hkey,$Xi,$Xi
	vaesenc $rndkey,$inout3,$inout3
	mov %r13,0x50+8(%rsp)
	vaesenc $rndkey,$inout4,$inout4
	mov %r12,0x58+8(%rsp)
	vpxor $Z1,$Z2,$Z2
	vaesenc $rndkey,$inout5,$inout5
	vpxor $T1,$Z2,$Z2

	vmovups 0x70-0x80($key),$rndkey
	vpslldq \$8,$Z2,$Z1
	vpxor $T2,$Z0,$Z0
	vmovdqu 0x10($const),$Hkey # .Lpoly

	vaesenc $rndkey,$inout0,$inout0
	vpxor $Xi,$Z3,$Z3
	vaesenc $rndkey,$inout1,$inout1
	vpxor $Z1,$Z0,$Z0
	movbe 0x18($in0),%r13
	vaesenc $rndkey,$inout2,$inout2
	movbe 0x10($in0),%r12
	vpalignr \$8,$Z0,$Z0,$Ii # 1st phase
	vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
	mov %r13,0x60+8(%rsp)
	vaesenc $rndkey,$inout3,$inout3
	mov %r12,0x68+8(%rsp)
	vaesenc $rndkey,$inout4,$inout4
	vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey
	vaesenc $rndkey,$inout5,$inout5

	vaesenc $T1,$inout0,$inout0
	vmovups 0x90-0x80($key),$rndkey
	vaesenc $T1,$inout1,$inout1
	vpsrldq \$8,$Z2,$Z2
	vaesenc $T1,$inout2,$inout2
	vpxor $Z2,$Z3,$Z3
	vaesenc $T1,$inout3,$inout3
	vpxor $Ii,$Z0,$Z0
	movbe 0x08($in0),%r13
	vaesenc $T1,$inout4,$inout4
	movbe 0x00($in0),%r12
	vaesenc $T1,$inout5,$inout5
	vmovups 0xa0-0x80($key),$T1
	cmp \$11,$rounds
	jb .Lenc_tail # 128-bit key

	vaesenc $rndkey,$inout0,$inout0
	vaesenc $rndkey,$inout1,$inout1
	vaesenc $rndkey,$inout2,$inout2
	vaesenc $rndkey,$inout3,$inout3
	vaesenc $rndkey,$inout4,$inout4
	vaesenc $rndkey,$inout5,$inout5

	vaesenc $T1,$inout0,$inout0
	vaesenc $T1,$inout1,$inout1
	vaesenc $T1,$inout2,$inout2
	vaesenc $T1,$inout3,$inout3
	vaesenc $T1,$inout4,$inout4
	vmovups 0xb0-0x80($key),$rndkey
	vaesenc $T1,$inout5,$inout5
	vmovups 0xc0-0x80($key),$T1
	je .Lenc_tail # 192-bit key

	vaesenc $rndkey,$inout0,$inout0
	vaesenc $rndkey,$inout1,$inout1
	vaesenc $rndkey,$inout2,$inout2
	vaesenc $rndkey,$inout3,$inout3
	vaesenc $rndkey,$inout4,$inout4
	vaesenc $rndkey,$inout5,$inout5

	vaesenc $T1,$inout0,$inout0
	vaesenc $T1,$inout1,$inout1
	vaesenc $T1,$inout2,$inout2
	vaesenc $T1,$inout3,$inout3
	vaesenc $T1,$inout4,$inout4
	vmovups 0xd0-0x80($key),$rndkey
	vaesenc $T1,$inout5,$inout5
	vmovups 0xe0-0x80($key),$T1
	jmp .Lenc_tail # 256-bit key

.align 32
.Lhandle_ctr32:
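	# Slow path: the counter's low byte is about to wrap. Byte-swap the
	# counter into little-endian form, derive the next five counter
	# values with full 32-bit adds (vpaddd), and byte-swap each result
	# back before use.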
	vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
	vpshufb $Ii,$T1,$Z2 # byte-swap counter
	vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
	vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
	vpaddd $Z1,$Z2,$inout2
	vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
	vpaddd $Z1,$inout1,$inout3
	vpshufb $Ii,$inout1,$inout1
	vpaddd $Z1,$inout2,$inout4
	vpshufb $Ii,$inout2,$inout2
	vpxor $rndkey,$inout1,$inout1
	vpaddd $Z1,$inout3,$inout5
	vpshufb $Ii,$inout3,$inout3
	vpxor $rndkey,$inout2,$inout2
	vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
	vpshufb $Ii,$inout4,$inout4
	vpshufb $Ii,$inout5,$inout5
	vpshufb $Ii,$T1,$T1 # next counter value
	jmp .Lresume_ctr32

.align 32
.Lenc_tail:
	vaesenc $rndkey,$inout0,$inout0
	vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi
	vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase
	vaesenc $rndkey,$inout1,$inout1
	vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
	vpxor 0x00($inp),$T1,$T2
	vaesenc $rndkey,$inout2,$inout2
	vpxor 0x10($inp),$T1,$Ii
	vaesenc $rndkey,$inout3,$inout3
	vpxor 0x20($inp),$T1,$Z1
	vaesenc $rndkey,$inout4,$inout4
	vpxor 0x30($inp),$T1,$Z2
	vaesenc $rndkey,$inout5,$inout5
	vpxor 0x40($inp),$T1,$Z3
	vpxor 0x50($inp),$T1,$Hkey
	vmovdqu ($ivp),$T1 # load next counter value

	vaesenclast $T2,$inout0,$inout0
	vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
	vaesenclast $Ii,$inout1,$inout1
	vpaddb $T2,$T1,$Ii
	mov %r13,0x70+8(%rsp)
	lea 0x60($inp),$inp
	vaesenclast $Z1,$inout2,$inout2
	vpaddb $T2,$Ii,$Z1
	mov %r12,0x78+8(%rsp)
	lea 0x60($out),$out
	vmovdqu 0x00-0x80($key),$rndkey
	vaesenclast $Z2,$inout3,$inout3
	vpaddb $T2,$Z1,$Z2
	vaesenclast $Z3,$inout4,$inout4
	vpaddb $T2,$Z2,$Z3
	vaesenclast $Hkey,$inout5,$inout5
	vpaddb $T2,$Z3,$Hkey

	add \$0x60,$ret
	sub \$0x6,$len
	jc .L6x_done

	vmovups $inout0,-0x60($out) # save output
	vpxor $rndkey,$T1,$inout0
	vmovups $inout1,-0x50($out)
	vmovdqa $Ii,$inout1 # 0 latency
	vmovups $inout2,-0x40($out)
	vmovdqa $Z1,$inout2 # 0 latency
	vmovups $inout3,-0x30($out)
	vmovdqa $Z2,$inout3 # 0 latency
	vmovups $inout4,-0x20($out)
	vmovdqa $Z3,$inout4 # 0 latency
	vmovups $inout5,-0x10($out)
	vmovdqa $Hkey,$inout5 # 0 latency
	vmovdqu 0x20+8(%rsp),$Z3 # I[5]
	jmp .Loop6x

.L6x_done:
	vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled
	vpxor $Z0,$Xi,$Xi # modulo-scheduled

	ret
.cfi_endproc
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
#   const AES_KEY *key, unsigned char iv[16],
#   struct { u128 Xi,H,Htbl[9]; } *Xip);
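#
# Both functions return the number of input bytes they processed, which
# here is always a multiple of 96; the caller is expected to finish any
# remaining tail with generic CTR/GHASH code. A hedged sketch of the
# intended call pattern (|aes_key|, |gcm| and the tail handling are
# illustrative, not names defined in this file):
#
#	size_t done = aesni_gcm_encrypt(in, out, len, &aes_key, iv, &gcm->Xi);
#	if (done < len) {
#		/* encrypt and hash the remaining len - done bytes generically */
#	}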
$code.=<<___;
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,\@function,6
.align 32
aesni_gcm_decrypt:
.cfi_startproc
	xor $ret,$ret

	# We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
	# bytes of input.
	cmp \$0x60,$len # minimal accepted length
	jb .Lgcm_dec_abort

	lea (%rsp),%rax # save stack pointer
.cfi_def_cfa_register %rax
	push %rbx
.cfi_push %rbx
	push %rbp
.cfi_push %rbp
	push %r12
.cfi_push %r12
	push %r13
.cfi_push %r13
	push %r14
.cfi_push %r14
	push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
	lea -0xa8(%rsp),%rsp
	movaps %xmm6,-0xd8(%rax)
	movaps %xmm7,-0xc8(%rax)
	movaps %xmm8,-0xb8(%rax)
	movaps %xmm9,-0xa8(%rax)
	movaps %xmm10,-0x98(%rax)
	movaps %xmm11,-0x88(%rax)
	movaps %xmm12,-0x78(%rax)
	movaps %xmm13,-0x68(%rax)
	movaps %xmm14,-0x58(%rax)
	movaps %xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu ($ivp),$T1 # input counter value
	add \$-128,%rsp
	mov 12($ivp),$counter
	lea .Lbswap_mask(%rip),$const
	lea -0x80($key),$in0 # borrow $in0
	mov \$0xf80,$end0 # borrow $end0
	vmovdqu ($Xip),$Xi # load Xi
	and \$-128,%rsp # ensure stack alignment
	vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
	lea 0x80($key),$key # size optimization
	lea 0x20+0x20($Xip),$Xip # size optimization
	mov 0xf0-0x80($key),$rounds
	vpshufb $Ii,$Xi,$Xi

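	# $in0 and $end0 are borrowed here to compare the 4K-page offsets
	# (the 0xf80 mask above, at 128-byte granularity) of the key schedule
	# and the stack frame. If the two offsets fall within 768 bytes of
	# each other, %rsp is lowered by the difference; this is the "avoid
	# aliasing with key" adjustment noted below, guarding against 4K
	# address aliasing between stack scratch stores and round-key loads.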
	and $end0,$in0
	and %rsp,$end0
	sub $in0,$end0
	jc .Ldec_no_key_aliasing
	cmp \$768,$end0
	jnc .Ldec_no_key_aliasing
	sub $end0,%rsp # avoid aliasing with key
.Ldec_no_key_aliasing:

	vmovdqu 0x50($inp),$Z3 # I[5]
	lea ($inp),$in0
	vmovdqu 0x40($inp),$Z0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. XXX: This
	# seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
	# not be near the very beginning of the address space when |$len| < 2*96
	# (0xc0).
	lea -0xc0($inp,$len),$end0

	vmovdqu 0x30($inp),$Z1
	shr \$4,$len
	xor $ret,$ret
	vmovdqu 0x20($inp),$Z2
	vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x
	vmovdqu 0x10($inp),$T2
	vpshufb $Ii,$Z0,$Z0
	vmovdqu ($inp),$Hkey
	vpshufb $Ii,$Z1,$Z1
	vmovdqu $Z0,0x30(%rsp)
	vpshufb $Ii,$Z2,$Z2
	vmovdqu $Z1,0x40(%rsp)
	vpshufb $Ii,$T2,$T2
	vmovdqu $Z2,0x50(%rsp)
	vpshufb $Ii,$Hkey,$Hkey
	vmovdqu $T2,0x60(%rsp)
	vmovdqu $Hkey,0x70(%rsp)

	call _aesni_ctr32_ghash_6x

	vmovups $inout0,-0x60($out) # save output
	vmovups $inout1,-0x50($out)
	vmovups $inout2,-0x40($out)
	vmovups $inout3,-0x30($out)
	vmovups $inout4,-0x20($out)
	vmovups $inout5,-0x10($out)

	vpshufb ($const),$Xi,$Xi # .Lbswap_mask
	vmovdqu $Xi,-0x40($Xip) # output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps -0xd8(%rax),%xmm6
	movaps -0xc8(%rax),%xmm7
	movaps -0xb8(%rax),%xmm8
	movaps -0xa8(%rax),%xmm9
	movaps -0x98(%rax),%xmm10
	movaps -0x88(%rax),%xmm11
	movaps -0x78(%rax),%xmm12
	movaps -0x68(%rax),%xmm13
	movaps -0x58(%rax),%xmm14
	movaps -0x48(%rax),%xmm15
___
$code.=<<___;
	mov -48(%rax),%r15
.cfi_restore %r15
	mov -40(%rax),%r14
.cfi_restore %r14
	mov -32(%rax),%r13
.cfi_restore %r13
	mov -24(%rax),%r12
.cfi_restore %r12
	mov -16(%rax),%rbp
.cfi_restore %rbp
	mov -8(%rax),%rbx
.cfi_restore %rbx
	lea (%rax),%rsp # restore %rsp
.cfi_def_cfa_register %rsp
.Lgcm_dec_abort:
	mov $ret,%rax # return value
	ret
.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
___

$code.=<<___;
.type _aesni_ctr32_6x,\@abi-omnipotent
.align 32
_aesni_ctr32_6x:
.cfi_startproc
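	# Plain 6x CTR encryption, with no GHASH work: encrypts six counter
	# blocks and XORs them with 96 bytes of input. aesni_gcm_encrypt
	# calls this twice to produce the first twelve ciphertext blocks
	# before entering the stitched loop, which then hashes them while
	# generating further output.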
	vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey
	vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
	lea -1($rounds),%r13
	vmovups 0x10-0x80($key),$rndkey
	lea 0x20-0x80($key),%r12
	vpxor $Z0,$T1,$inout0
	add \$`6<<24`,$counter
	jc .Lhandle_ctr32_2
	vpaddb $T2,$T1,$inout1
	vpaddb $T2,$inout1,$inout2
	vpxor $Z0,$inout1,$inout1
	vpaddb $T2,$inout2,$inout3
	vpxor $Z0,$inout2,$inout2
	vpaddb $T2,$inout3,$inout4
	vpxor $Z0,$inout3,$inout3
	vpaddb $T2,$inout4,$inout5
	vpxor $Z0,$inout4,$inout4
	vpaddb $T2,$inout5,$T1
	vpxor $Z0,$inout5,$inout5
	jmp .Loop_ctr32

.align 16
.Loop_ctr32:
	vaesenc $rndkey,$inout0,$inout0
	vaesenc $rndkey,$inout1,$inout1
	vaesenc $rndkey,$inout2,$inout2
	vaesenc $rndkey,$inout3,$inout3
	vaesenc $rndkey,$inout4,$inout4
	vaesenc $rndkey,$inout5,$inout5
	vmovups (%r12),$rndkey
	lea 0x10(%r12),%r12
	dec %r13d
	jnz .Loop_ctr32

	vmovdqu (%r12),$Hkey # last round key
	vaesenc $rndkey,$inout0,$inout0
	vpxor 0x00($inp),$Hkey,$Z0
	vaesenc $rndkey,$inout1,$inout1
	vpxor 0x10($inp),$Hkey,$Z1
	vaesenc $rndkey,$inout2,$inout2
	vpxor 0x20($inp),$Hkey,$Z2
	vaesenc $rndkey,$inout3,$inout3
	vpxor 0x30($inp),$Hkey,$Xi
	vaesenc $rndkey,$inout4,$inout4
	vpxor 0x40($inp),$Hkey,$T2
	vaesenc $rndkey,$inout5,$inout5
	vpxor 0x50($inp),$Hkey,$Hkey
	lea 0x60($inp),$inp

	vaesenclast $Z0,$inout0,$inout0
	vaesenclast $Z1,$inout1,$inout1
	vaesenclast $Z2,$inout2,$inout2
	vaesenclast $Xi,$inout3,$inout3
	vaesenclast $T2,$inout4,$inout4
	vaesenclast $Hkey,$inout5,$inout5
	vmovups $inout0,0x00($out)
	vmovups $inout1,0x10($out)
	vmovups $inout2,0x20($out)
	vmovups $inout3,0x30($out)
	vmovups $inout4,0x40($out)
	vmovups $inout5,0x50($out)
	lea 0x60($out),$out

	ret
.align 32
.Lhandle_ctr32_2:
	vpshufb $Ii,$T1,$Z2 # byte-swap counter
	vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
	vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
	vpaddd $Z1,$Z2,$inout2
	vpaddd $Z1,$inout1,$inout3
	vpshufb $Ii,$inout1,$inout1
	vpaddd $Z1,$inout2,$inout4
	vpshufb $Ii,$inout2,$inout2
	vpxor $Z0,$inout1,$inout1
	vpaddd $Z1,$inout3,$inout5
	vpshufb $Ii,$inout3,$inout3
	vpxor $Z0,$inout2,$inout2
	vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
	vpshufb $Ii,$inout4,$inout4
	vpxor $Z0,$inout3,$inout3
	vpshufb $Ii,$inout5,$inout5
	vpxor $Z0,$inout4,$inout4
	vpshufb $Ii,$T1,$T1 # next counter value
	vpxor $Z0,$inout5,$inout5
	jmp .Loop_ctr32
.cfi_endproc
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,\@function,6
.align 32
aesni_gcm_encrypt:
.cfi_startproc
#ifndef NDEBUG
#ifndef BORINGSSL_FIPS
.extern BORINGSSL_function_hit
	movb \$1,BORINGSSL_function_hit+2(%rip)
#endif
#endif
	xor $ret,$ret

	# We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
	# input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
	# least 96 more bytes of input.
	cmp \$0x60*3,$len # minimal accepted length
	jb .Lgcm_enc_abort

	lea (%rsp),%rax # save stack pointer
.cfi_def_cfa_register %rax
	push %rbx
.cfi_push %rbx
	push %rbp
.cfi_push %rbp
	push %r12
.cfi_push %r12
	push %r13
.cfi_push %r13
	push %r14
.cfi_push %r14
	push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
	lea -0xa8(%rsp),%rsp
	movaps %xmm6,-0xd8(%rax)
	movaps %xmm7,-0xc8(%rax)
	movaps %xmm8,-0xb8(%rax)
	movaps %xmm9,-0xa8(%rax)
	movaps %xmm10,-0x98(%rax)
	movaps %xmm11,-0x88(%rax)
	movaps %xmm12,-0x78(%rax)
	movaps %xmm13,-0x68(%rax)
	movaps %xmm14,-0x58(%rax)
	movaps %xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu ($ivp),$T1 # input counter value
	add \$-128,%rsp
	mov 12($ivp),$counter
	lea .Lbswap_mask(%rip),$const
	lea -0x80($key),$in0 # borrow $in0
	mov \$0xf80,$end0 # borrow $end0
	lea 0x80($key),$key # size optimization
	vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
	and \$-128,%rsp # ensure stack alignment
	mov 0xf0-0x80($key),$rounds

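	# Same page-offset check as in aesni_gcm_decrypt above; see the
	# comment there.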
	and $end0,$in0
	and %rsp,$end0
	sub $in0,$end0
	jc .Lenc_no_key_aliasing
	cmp \$768,$end0
	jnc .Lenc_no_key_aliasing
	sub $end0,%rsp # avoid aliasing with key
.Lenc_no_key_aliasing:

	lea ($out),$in0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. Unlike in
	# the decryption case, there's no caveat that |$out| must not be near
	# the very beginning of the address space, because we know that
	# |$len| >= 3*96 from the check above, and so we know
	# |$out| + |$len| >= 2*96 (0xc0).
	lea -0xc0($out,$len),$end0

	shr \$4,$len

	call _aesni_ctr32_6x
	vpshufb $Ii,$inout0,$Xi # save bswapped output on stack
	vpshufb $Ii,$inout1,$T2
	vmovdqu $Xi,0x70(%rsp)
	vpshufb $Ii,$inout2,$Z0
	vmovdqu $T2,0x60(%rsp)
	vpshufb $Ii,$inout3,$Z1
	vmovdqu $Z0,0x50(%rsp)
	vpshufb $Ii,$inout4,$Z2
	vmovdqu $Z1,0x40(%rsp)
	vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x
	vmovdqu $Z2,0x30(%rsp)

	call _aesni_ctr32_6x

	vmovdqu ($Xip),$Xi # load Xi
	lea 0x20+0x20($Xip),$Xip # size optimization
	sub \$12,$len
	mov \$0x60*2,$ret
	vpshufb $Ii,$Xi,$Xi

	call _aesni_ctr32_ghash_6x
	vmovdqu 0x20(%rsp),$Z3 # I[5]
	vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
	vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
	vpunpckhqdq $Z3,$Z3,$T1
	vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
	vmovups $inout0,-0x60($out) # save output
	vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy
	vpxor $Z3,$T1,$T1
	vmovups $inout1,-0x50($out)
	vpshufb $Ii,$inout1,$inout1
	vmovups $inout2,-0x40($out)
	vpshufb $Ii,$inout2,$inout2
	vmovups $inout3,-0x30($out)
	vpshufb $Ii,$inout3,$inout3
	vmovups $inout4,-0x20($out)
	vpshufb $Ii,$inout4,$inout4
	vmovups $inout5,-0x10($out)
	vpshufb $Ii,$inout5,$inout5
	vmovdqu $inout0,0x10(%rsp) # free $inout0
___
{ my ($HK,$T3)=($rndkey,$inout0);

$code.=<<___;
	vmovdqu 0x30(%rsp),$Z2 # I[4]
	vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
	vpunpckhqdq $Z2,$Z2,$T2
	vpclmulqdq \$0x00,$Hkey,$Z3,$Z1
	vpxor $Z2,$T2,$T2
	vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
	vpclmulqdq \$0x00,$HK,$T1,$T1

	vmovdqu 0x40(%rsp),$T3 # I[3]
	vpclmulqdq \$0x00,$Ii,$Z2,$Z0
	vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
	vpxor $Z1,$Z0,$Z0
	vpunpckhqdq $T3,$T3,$Z1
	vpclmulqdq \$0x11,$Ii,$Z2,$Z2
	vpxor $T3,$Z1,$Z1
	vpxor $Z3,$Z2,$Z2
	vpclmulqdq \$0x10,$HK,$T2,$T2
	vmovdqu 0x50-0x20($Xip),$HK
	vpxor $T1,$T2,$T2

	vmovdqu 0x50(%rsp),$T1 # I[2]
	vpclmulqdq \$0x00,$Hkey,$T3,$Z3
	vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
	vpxor $Z0,$Z3,$Z3
	vpunpckhqdq $T1,$T1,$Z0
	vpclmulqdq \$0x11,$Hkey,$T3,$T3
	vpxor $T1,$Z0,$Z0
	vpxor $Z2,$T3,$T3
	vpclmulqdq \$0x00,$HK,$Z1,$Z1
	vpxor $T2,$Z1,$Z1

	vmovdqu 0x60(%rsp),$T2 # I[1]
	vpclmulqdq \$0x00,$Ii,$T1,$Z2
	vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
	vpxor $Z3,$Z2,$Z2
	vpunpckhqdq $T2,$T2,$Z3
	vpclmulqdq \$0x11,$Ii,$T1,$T1
	vpxor $T2,$Z3,$Z3
	vpxor $T3,$T1,$T1
	vpclmulqdq \$0x10,$HK,$Z0,$Z0
	vmovdqu 0x80-0x20($Xip),$HK
	vpxor $Z1,$Z0,$Z0

	vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0]
	vpclmulqdq \$0x00,$Hkey,$T2,$Z1
	vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
	vpunpckhqdq $Xi,$Xi,$T3
	vpxor $Z2,$Z1,$Z1
	vpclmulqdq \$0x11,$Hkey,$T2,$T2
	vpxor $Xi,$T3,$T3
	vpxor $T1,$T2,$T2
	vpclmulqdq \$0x00,$HK,$Z3,$Z3
	vpxor $Z0,$Z3,$Z0

	vpclmulqdq \$0x00,$Ii,$Xi,$Z2
	vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
	vpunpckhqdq $inout5,$inout5,$T1
	vpclmulqdq \$0x11,$Ii,$Xi,$Xi
	vpxor $inout5,$T1,$T1
	vpxor $Z1,$Z2,$Z1
	vpclmulqdq \$0x10,$HK,$T3,$T3
	vmovdqu 0x20-0x20($Xip),$HK
	vpxor $T2,$Xi,$Z3
	vpxor $Z0,$T3,$Z2

	vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
	vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing
	vpclmulqdq \$0x00,$Hkey,$inout5,$Z0
	vpxor $T3,$Z2,$Z2
	vpunpckhqdq $inout4,$inout4,$T2
	vpclmulqdq \$0x11,$Hkey,$inout5,$inout5
	vpxor $inout4,$T2,$T2
	vpslldq \$8,$Z2,$T3
	vpclmulqdq \$0x00,$HK,$T1,$T1
	vpxor $T3,$Z1,$Xi
	vpsrldq \$8,$Z2,$Z2
	vpxor $Z2,$Z3,$Z3

	vpclmulqdq \$0x00,$Ii,$inout4,$Z1
	vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
	vpxor $Z0,$Z1,$Z1
	vpunpckhqdq $inout3,$inout3,$T3
	vpclmulqdq \$0x11,$Ii,$inout4,$inout4
	vpxor $inout3,$T3,$T3
	vpxor $inout5,$inout4,$inout4
	vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase
	vpclmulqdq \$0x10,$HK,$T2,$T2
	vmovdqu 0x50-0x20($Xip),$HK
	vpxor $T1,$T2,$T2

	vpclmulqdq \$0x00,$Hkey,$inout3,$Z0
	vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
	vpxor $Z1,$Z0,$Z0
	vpunpckhqdq $inout2,$inout2,$T1
	vpclmulqdq \$0x11,$Hkey,$inout3,$inout3
	vpxor $inout2,$T1,$T1
	vpxor $inout4,$inout3,$inout3
	vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0
	vpclmulqdq \$0x00,$HK,$T3,$T3
	vpxor $T2,$T3,$T3

	vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
	vxorps $inout5,$Xi,$Xi

	vpclmulqdq \$0x00,$Ii,$inout2,$Z1
	vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
	vpxor $Z0,$Z1,$Z1
	vpunpckhqdq $inout1,$inout1,$T2
	vpclmulqdq \$0x11,$Ii,$inout2,$inout2
	vpxor $inout1,$T2,$T2
	vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase
	vpxor $inout3,$inout2,$inout2
	vpclmulqdq \$0x10,$HK,$T1,$T1
	vmovdqu 0x80-0x20($Xip),$HK
	vpxor $T3,$T1,$T1

	vxorps $Z3,$inout5,$inout5
	vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
	vxorps $inout5,$Xi,$Xi

	vpclmulqdq \$0x00,$Hkey,$inout1,$Z0
	vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
	vpxor $Z1,$Z0,$Z0
	vpunpckhqdq $Xi,$Xi,$T3
	vpclmulqdq \$0x11,$Hkey,$inout1,$inout1
	vpxor $Xi,$T3,$T3
	vpxor $inout2,$inout1,$inout1
	vpclmulqdq \$0x00,$HK,$T2,$T2
	vpxor $T1,$T2,$T2

	vpclmulqdq \$0x00,$Ii,$Xi,$Z1
	vpclmulqdq \$0x11,$Ii,$Xi,$Z3
	vpxor $Z0,$Z1,$Z1
	vpclmulqdq \$0x10,$HK,$T3,$Z2
	vpxor $inout1,$Z3,$Z3
	vpxor $T2,$Z2,$Z2

	vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing
	vpxor $Z0,$Z2,$Z2
	vpslldq \$8,$Z2,$T1
	vmovdqu 0x10($const),$Hkey # .Lpoly
	vpsrldq \$8,$Z2,$Z2
	vpxor $T1,$Z1,$Xi
	vpxor $Z2,$Z3,$Z3

	vpalignr \$8,$Xi,$Xi,$T2 # 1st phase
	vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
	vpxor $T2,$Xi,$Xi

	vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase
	vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
	vpxor $Z3,$T2,$T2
	vpxor $T2,$Xi,$Xi
___
}
$code.=<<___;
	vpshufb ($const),$Xi,$Xi # .Lbswap_mask
	vmovdqu $Xi,-0x40($Xip) # output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps -0xd8(%rax),%xmm6
	movaps -0xc8(%rax),%xmm7
	movaps -0xb8(%rax),%xmm8
	movaps -0xa8(%rax),%xmm9
	movaps -0x98(%rax),%xmm10
	movaps -0x88(%rax),%xmm11
	movaps -0x78(%rax),%xmm12
	movaps -0x68(%rax),%xmm13
	movaps -0x58(%rax),%xmm14
	movaps -0x48(%rax),%xmm15
___
$code.=<<___;
	mov -48(%rax),%r15
.cfi_restore %r15
	mov -40(%rax),%r14
.cfi_restore %r14
	mov -32(%rax),%r13
.cfi_restore %r13
	mov -24(%rax),%r12
.cfi_restore %r12
	mov -16(%rax),%rbp
.cfi_restore %rbp
	mov -8(%rax),%rbx
.cfi_restore %rbx
	lea (%rax),%rsp # restore %rsp
.cfi_def_cfa_register %rsp
.Lgcm_enc_abort:
	mov $ret,%rax # return value
	ret
.cfi_endproc
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
___

$code.=<<___;
.align 64
.Lbswap_mask:
	.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
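	# 0xc2 in the top byte gives the reduction constant used by the
	# "1st phase"/"2nd phase" steps above; it encodes the GCM polynomial
	# x^128 + x^7 + x^2 + x + 1 in the bit-reflected form that
	# PCLMULQDQ-based GHASH code works in.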
.Lone_msb:
	.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern __imp_RtlVirtualUnwind
.type gcm_se_handler,\@abi-omnipotent
.align 16
gcm_se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp

	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip

	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData

	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<prologue label
	jb .Lcommon_seh_tail

	mov 152($context),%rax # pull context->Rsp

	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lcommon_seh_tail

	mov 120($context),%rax # pull context->Rax

	mov -48(%rax),%r15
	mov -40(%rax),%r14
	mov -32(%rax),%r13
	mov -24(%rax),%r12
	mov -16(%rax),%rbp
	mov -8(%rax),%rbx
	mov %r15,240($context)
	mov %r14,232($context)
	mov %r13,224($context)
	mov %r12,216($context)
	mov %rbp,160($context)
	mov %rbx,144($context)

	lea -0xd8(%rax),%rsi # %xmm save area
	lea 512($context),%rdi # & context.Xmm6
	mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
	.long 0xa548f3fc # cld; rep movsq

.Lcommon_seh_tail:
	mov 8(%rax),%rdi
	mov 16(%rax),%rsi
	mov %rax,152($context) # restore context->Rsp
	mov %rsi,168($context) # restore context->Rsi
	mov %rdi,176($context) # restore context->Rdi

	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$154,%ecx # sizeof(CONTEXT)/sizeof(%rax)
	.long 0xa548f3fc # cld; rep movsq

	mov $disp,%rsi
	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)

	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size gcm_se_handler,.-gcm_se_handler

.section .pdata
.align 4
	.rva .LSEH_begin_aesni_gcm_decrypt
	.rva .LSEH_end_aesni_gcm_decrypt
	.rva .LSEH_gcm_dec_info

	.rva .LSEH_begin_aesni_gcm_encrypt
	.rva .LSEH_end_aesni_gcm_encrypt
	.rva .LSEH_gcm_enc_info
.section .xdata
.align 8
.LSEH_gcm_dec_info:
	.byte 9,0,0,0
	.rva gcm_se_handler
	.rva .Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
	.byte 9,0,0,0
	.rva gcm_se_handler
	.rva .Lgcm_enc_body,.Lgcm_enc_abort
___
}
}}} else {{{
$code=<<___; # assembler is too old
.text

.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
	xor %eax,%eax
	ret
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
	xor %eax,%eax
	ret
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
}}}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;