/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record in 'hash' for an even number
 * ('num_blocks') of consecutive 64-byte blocks:
 *
 * extern "C" void sha1_transform_avx2(
 *	int *hash, const char *input, size_t num_blocks );
 */
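
/*
 * For reference only (nothing in this comment is assembled): a scalar C
 * sketch of the block transform that the AVX2 code below computes, per the
 * standard SHA-1 definition. The name 'sha1_blocks_ref' and the exact types
 * are illustrative, not the kernel's API; the real entry point is the
 * prototype shown above.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static uint32_t rol32(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	// 'hash' holds the five 32-bit state words; 'input' points at
 *	// 'num_blocks' consecutive 64-byte blocks.
 *	static void sha1_blocks_ref(uint32_t hash[5],
 *				    const unsigned char *input,
 *				    size_t num_blocks)
 *	{
 *		static const uint32_t k[4] = {
 *			0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
 *		};
 *		uint32_t w[80];
 *
 *		for (; num_blocks; num_blocks--, input += 64) {
 *			uint32_t a = hash[0], b = hash[1], c = hash[2];
 *			uint32_t d = hash[3], e = hash[4];
 *			int t;
 *
 *			for (t = 0; t < 16; t++)	// big-endian message load
 *				w[t] = (uint32_t)input[4*t] << 24 |
 *				       (uint32_t)input[4*t + 1] << 16 |
 *				       (uint32_t)input[4*t + 2] << 8 |
 *				       (uint32_t)input[4*t + 3];
 *			for (t = 16; t < 80; t++)
 *				w[t] = rol32(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1);
 *
 *			for (t = 0; t < 80; t++) {
 *				uint32_t f;
 *
 *				if (t < 20)
 *					f = (b & c) ^ (~b & d);
 *				else if (t < 40 || t >= 60)
 *					f = b ^ c ^ d;
 *				else
 *					f = (b & c) | (d & (b | c));
 *
 *				f += rol32(a, 5) + e + w[t] + k[t / 20];
 *				e = d; d = c; c = rol32(b, 30); b = a; a = f;
 *			}
 *
 *			hash[0] += a; hash[1] += b; hash[2] += c;
 *			hash[3] += d; hash[4] += e;
 *		}
 *	}
 */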

#include <linux/linkage.h>

#define CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%eax
#define REG_E	%edx
#define REG_TB	%ebx
#define REG_TA	%r12d
#define REG_RA	%rcx
#define REG_RB	%rsi
#define REG_RC	%rdi
#define REG_RD	%rax
#define REG_RE	%rdx
#define REG_RTA	%r12
#define REG_RTB	%rbx
#define REG_T1	%ebp
#define xmm_mov		vmovups
#define avx2_zeroupper	vzeroupper
#define RND_F1	1
#define RND_F2	2
#define RND_F3	3

.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13
#define BUFFER_END	%r11

#define PRECALC_BUF	%r14
#define WK_BUF		%r15

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

# AVX2 variables
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 + 16)

#define WK(t)	((((t) % 80) / 4)*32 + ((t) % 4)*4 + ((t)/80)*16)(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
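
/*
 * Layout note (my reading of the macros above, not normative): each group of
 * four consecutive rounds gets one 32-byte slot in WK_BUF; the low 16 bytes
 * of a slot hold K+W for the first block of the interleaved pair and the
 * high 16 bytes hold the second block (hence the "(t)/80" term). A
 * hypothetical C helper mirroring the WK(t) byte-offset computation:
 *
 *	static inline unsigned int wk_offset(unsigned int t)
 *	{
 *		return ((t % 80) / 4) * 32	// 32-byte slot per 4-round group
 *		     + (t % 4) * 4		// dword within the group
 *		     + (t / 80) * 16;		// 0 = first block, 16 = second
 *	}
 */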

.macro UPDATE_HASH hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm

.macro PRECALC_00_15
	.if (i == 0) # Initialize and rotate registers
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu	((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd	K_XMM(K_BASE), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP		/* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd	K_XMM(K_BASE), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC_32_79
	/*
	 * in SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we compute the equivalent:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * which allows more efficient vectorization,
	 * since the w[i]->w[i-3] dependency is broken
	 */
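	/*
	 * Why the two forms agree (a short derivation; valid for i >= 32, so
	 * that every substituted term itself satisfies the recurrence):
	 *
	 *   w[i-3]  = (w[i-6]  ^ w[i-11] ^ w[i-17] ^ w[i-19]) rol 1
	 *   w[i-8]  = (w[i-11] ^ w[i-16] ^ w[i-22] ^ w[i-24]) rol 1
	 *   w[i-14] = (w[i-17] ^ w[i-22] ^ w[i-28] ^ w[i-30]) rol 1
	 *   w[i-16] = (w[i-19] ^ w[i-24] ^ w[i-30] ^ w[i-32]) rol 1
	 *
	 * XOR-ing the four right-hand sides cancels every term that appears
	 * twice (w[i-11], w[i-17], w[i-19], w[i-22], w[i-24], w[i-30]), so
	 *
	 *   w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]
	 *	= (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 1
	 *
	 * and the outer "rol 1" of the original recurrence gives the rol 2 form.
	 */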

	.if ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd	K_XMM(K_BASE), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC r, s
	.set i, \r

	/*
	 * Select the K constant for this round group. 'i' advances at twice
	 * the round number (each 8-step group covers 4 rounds of both
	 * interleaved blocks), so the usual boundaries 20/40/60 appear here
	 * as 40/80/120; each constant fills one 32-byte slot of K_XMM_AR.
	 */
	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i < 32)
		PRECALC_00_15	\s
	.elseif (i < 64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm

.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm

/*
 * RND_FUN dispatches on ROUND_FUNC, which the RR macro below sets and
 * updates at the F1/F2/F3 boundaries
 */

.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)	/* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B	/* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ((\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm

.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1		/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round
	 * (b & c) ^ andn[b, d]
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm

.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm

.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/*
	 * Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A >>> 5 */

.endm
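
/*
 * For reference, the three SHA-1 round functions computed by ROUND_F1/F2/F3
 * above, written as plain C expressions (illustrative only, nothing here is
 * assembled):
 *
 *	f1(b, c, d) = (b & c) ^ (~b & d);	// rounds  0-19, "Ch"
 *	f2(b, c, d) = b ^ c ^ d;		// rounds 20-39 and 60-79, parity
 *	f3(b, c, d) = (b & c) | (d & (b | c));	// rounds 40-59, "Maj"
 *
 * RR switches ROUND_FUNC after round 18/38/58 rather than at 20/40/60
 * because each round also pre-computes F for the following round, so rounds
 * 19/39/59 must already run the next variant's F computation.
 */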

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	# Precalc WK for first 2 blocks
	PRECALC_OFFSET = 0
	.set i, 0
	.rept 160
		PRECALC i
		.set i, i + 1
	.endr
	PRECALC_OFFSET = 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
_loop:
	/*
	 * code loops through more than one block
	 * we use K_BASE value as a sentinel for the last block;
	 * it is set below by: cmovae K_BASE, BUFFER_PTR
	 */
	cmp	K_BASE, BUFFER_PTR
	jne	_begin
	.align 32
	jmp	_end
	.align 32
_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	jmp	_loop0
_loop0:

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	add	$(2*64), BUFFER_PTR	/* move to next odd-64-byte block */
	cmp	BUFFER_END, BUFFER_PTR	/* is current block the last one? */
	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */

	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	cmp	K_BASE, BUFFER_PTR	/* is current block the last one? */
	je	_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */

	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop1
_loop1:
	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop2
_loop2:

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	add	$(2*64), BUFFER_PTR2	/* move to next even-64-byte block */

	cmp	BUFFER_END, BUFFER_PTR2	/* is current block the last one? */
	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */

	jmp	_loop3
_loop3:

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	_loop

	.align 32
_end:

.endm
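
/*
 * Rough shape of SHA1_PIPELINED_MAIN_BODY in C-like pseudocode, only to make
 * the software pipelining easier to follow (a sketch, not normative; the
 * block numbering is illustrative):
 *
 *	precalc K+W for the first two blocks;	// the .rept 160 above
 *	while (BUFFER_PTR != K_BASE) {		// K_BASE doubles as a "done" sentinel
 *		run rounds 0..79 of the odd block;	// precalc for the next pair
 *							// runs inside ROUND_F1/F2/F3
 *		update hash;
 *		if (BUFFER_PTR hit the sentinel)
 *			continue;		// loop exits at the top-of-loop check
 *		run rounds 0..79 of the even block;
 *		update hash;
 *		swap WK_BUF and PRECALC_BUF;	// double-buffered K+W storage
 *	}
 */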
/*
 * macro implements the SHA-1 function body for several 64-byte blocks
 * param: function name
 */
.macro SHA1_VECTOR_ASM name
ENTRY(\name)

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK = (W_SIZE*4 + 8+24)

	/* Align stack */
	mov	%rsp, %rbx
	and	$~(0x20-1), %rsp
	push	%rbx
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	lea	K_XMM_AR(%rip), K_BASE

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR
	lea	64(BUF), BUFFER_PTR2

	shl	$6, CNT			/* mul by 64 */
	add	BUF, CNT
	add	$64, CNT
	mov	CNT, BUFFER_END

	cmp	BUFFER_END, BUFFER_PTR2
	cmovae	K_BASE, BUFFER_PTR2

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	add	$RESERVE_STACK, %rsp
	pop	%rsp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx

	ret

ENDPROC(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

/* shuffle mask: byte-swap each dword, for the big-endian message word load */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM	sha1_transform_avx2