########################################################################
# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
#
# Copyright (c) 2013, Intel Corporation
#
# Authors:
#     Erdinc Ozturk <erdinc.ozturk@intel.com>
#     Vinodh Gopal <vinodh.gopal@intel.com>
#     James Guilford <james.guilford@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
# Function API:
#       UINT16 crc_t10dif_pcl(
#               UINT16 init_crc,          //initial CRC value, 16 bits
#               const unsigned char *buf, //buffer pointer to calculate CRC on
#               UINT64 len                //buffer length in bytes (64-bit data)
#       );
#
# Reference paper titled "Fast CRC Computation for Generic
# Polynomials Using PCLMULQDQ Instruction"
# URL: http://www.intel.com/content/dam/www/public/us/en/documents
#      /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#
#
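# A minimal caller sketch (C, illustrative only; the wrapper below is a
# hypothetical example and not part of this file), showing how the routine
# declared above would be invoked with an initial CRC of 0:
#
#       #include <stdint.h>
#
#       extern uint16_t crc_t10dif_pcl(uint16_t init_crc,
#                                      const unsigned char *buf,
#                                      uint64_t len);
#
#       uint16_t compute_t10dif(const unsigned char *data, uint64_t len)
#       {
#               return crc_t10dif_pcl(0, data, len);
#       }
#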

#include <linux/linkage.h>

.text

#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx

#define arg1_low32 %edi

ENTRY(crc_t10dif_pcl)
.align 16

        # adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     $16, arg1_low32

        # Allocate Stack Space
        mov     %rsp, %rcx
        sub     $16*2, %rsp
        # align stack to 16 byte boundary
        and     $~(0x10 - 1), %rsp

        # check if the buffer is smaller than 256 bytes
        cmp     $256, arg3

        # for sizes less than 256, we can't fold 128 bytes at a time
        jl      _less_than_128


        # load the initial crc value
        movd    arg1_low32, %xmm10      # initial crc

        # the crc value does not need to be byte-reflected, but it needs
        # to be moved to the high part of the register, because the data
        # will be byte-reflected and will then line up with the initial
        # crc in the correct place.
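        # (after the shl above and the pslldq below, the 16-bit seed ends
        #  up in bits 112..127 of xmm10, which is exactly where the first
        #  two byte-reflected message bytes will land, so the pxor into
        #  xmm0 below mixes the seed into the start of the message.)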
        pslldq  $12, %xmm10

        movdqa  SHUF_MASK(%rip), %xmm11
        # receive the initial 128B data, xor the initial crc value
        movdqu  16*0(arg2), %xmm0
        movdqu  16*1(arg2), %xmm1
        movdqu  16*2(arg2), %xmm2
        movdqu  16*3(arg2), %xmm3
        movdqu  16*4(arg2), %xmm4
        movdqu  16*5(arg2), %xmm5
        movdqu  16*6(arg2), %xmm6
        movdqu  16*7(arg2), %xmm7

        pshufb  %xmm11, %xmm0
        # XOR the initial_crc value
        pxor    %xmm10, %xmm0
        pshufb  %xmm11, %xmm1
        pshufb  %xmm11, %xmm2
        pshufb  %xmm11, %xmm3
        pshufb  %xmm11, %xmm4
        pshufb  %xmm11, %xmm5
        pshufb  %xmm11, %xmm6
        pshufb  %xmm11, %xmm7

        movdqa  rk3(%rip), %xmm10       #xmm10 has rk3 and rk4
                                        #imm value of pclmulqdq instruction
                                        #will determine which constant to use

        #################################################################
        # we subtract 256 instead of 128 to save one instruction from the loop
        sub     $256, arg3

        # at this point there are 128*x+y (0 <= y < 128) bytes of buffer
        # left. The _fold_64_B_loop below folds 128 bytes at a time until
        # only 128+y bytes of buffer remain.
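        # (a rough C-intrinsics sketch of one lane of the fold below; the
        #  helper name fold_lane is hypothetical and only illustrates the
        #  per-register recurrence
        #       new = (acc.lo64 x rk3) ^ (acc.hi64 x rk4) ^ next_data
        #  where x denotes carry-less GF(2) multiplication:
        #
        #       #include <immintrin.h>  /* PCLMULQDQ/SSE intrinsics */
        #
        #       static __m128i fold_lane(__m128i acc, __m128i next_data,
        #                                __m128i rk3_rk4)
        #       {
        #               /* low qword of acc times rk3 (low half of constant pair) */
        #               __m128i lo = _mm_clmulepi64_si128(acc, rk3_rk4, 0x00);
        #               /* high qword of acc times rk4 (high half of constant pair) */
        #               __m128i hi = _mm_clmulepi64_si128(acc, rk3_rk4, 0x11);
        #               return _mm_xor_si128(_mm_xor_si128(lo, hi), next_data);
        #       }
        # )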


        # fold 128 bytes at a time. This section of the code folds 8 xmm
        # registers in parallel
_fold_64_B_loop:

        # update the buffer pointer
        add     $128, arg2              # buf += 128

        movdqu  16*0(arg2), %xmm9
        movdqu  16*1(arg2), %xmm12
        pshufb  %xmm11, %xmm9
        pshufb  %xmm11, %xmm12
        movdqa  %xmm0, %xmm8
        movdqa  %xmm1, %xmm13
        pclmulqdq       $0x0 , %xmm10, %xmm0
        pclmulqdq       $0x11, %xmm10, %xmm8
        pclmulqdq       $0x0 , %xmm10, %xmm1
        pclmulqdq       $0x11, %xmm10, %xmm13
        pxor    %xmm9 , %xmm0
        xorps   %xmm8 , %xmm0
        pxor    %xmm12, %xmm1
        xorps   %xmm13, %xmm1

        movdqu  16*2(arg2), %xmm9
        movdqu  16*3(arg2), %xmm12
        pshufb  %xmm11, %xmm9
        pshufb  %xmm11, %xmm12
        movdqa  %xmm2, %xmm8
        movdqa  %xmm3, %xmm13
        pclmulqdq       $0x0, %xmm10, %xmm2
        pclmulqdq       $0x11, %xmm10, %xmm8
        pclmulqdq       $0x0, %xmm10, %xmm3
        pclmulqdq       $0x11, %xmm10, %xmm13
        pxor    %xmm9 , %xmm2
        xorps   %xmm8 , %xmm2
        pxor    %xmm12, %xmm3
        xorps   %xmm13, %xmm3

        movdqu  16*4(arg2), %xmm9
        movdqu  16*5(arg2), %xmm12
        pshufb  %xmm11, %xmm9
        pshufb  %xmm11, %xmm12
        movdqa  %xmm4, %xmm8
        movdqa  %xmm5, %xmm13
        pclmulqdq       $0x0, %xmm10, %xmm4
        pclmulqdq       $0x11, %xmm10, %xmm8
        pclmulqdq       $0x0, %xmm10, %xmm5
        pclmulqdq       $0x11, %xmm10, %xmm13
        pxor    %xmm9 , %xmm4
        xorps   %xmm8 , %xmm4
        pxor    %xmm12, %xmm5
        xorps   %xmm13, %xmm5

        movdqu  16*6(arg2), %xmm9
        movdqu  16*7(arg2), %xmm12
        pshufb  %xmm11, %xmm9
        pshufb  %xmm11, %xmm12
        movdqa  %xmm6 , %xmm8
        movdqa  %xmm7 , %xmm13
        pclmulqdq       $0x0 , %xmm10, %xmm6
        pclmulqdq       $0x11, %xmm10, %xmm8
        pclmulqdq       $0x0 , %xmm10, %xmm7
        pclmulqdq       $0x11, %xmm10, %xmm13
        pxor    %xmm9 , %xmm6
        xorps   %xmm8 , %xmm6
        pxor    %xmm12, %xmm7
        xorps   %xmm13, %xmm7

        sub     $128, arg3

        # check if there are another 128 bytes in the buffer to be able to fold
        jge     _fold_64_B_loop
        ##################################################################


        add     $128, arg2
        # at this point, the buffer pointer is pointing at the last y bytes
        # of the buffer, and the 128 bytes of folded data are in 8 of the
        # xmm registers: xmm0 through xmm7


        # fold the 8 xmm registers to 1 xmm register with different constants

        movdqa  rk9(%rip), %xmm10
        movdqa  %xmm0, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm0
        pclmulqdq       $0x0 , %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        xorps   %xmm0, %xmm7

        movdqa  rk11(%rip), %xmm10
        movdqa  %xmm1, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm1
        pclmulqdq       $0x0 , %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        xorps   %xmm1, %xmm7

        movdqa  rk13(%rip), %xmm10
        movdqa  %xmm2, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm2
        pclmulqdq       $0x0 , %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        pxor    %xmm2, %xmm7

        movdqa  rk15(%rip), %xmm10
        movdqa  %xmm3, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm3
        pclmulqdq       $0x0 , %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        xorps   %xmm3, %xmm7

        movdqa  rk17(%rip), %xmm10
        movdqa  %xmm4, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm4
        pclmulqdq       $0x0 , %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        pxor    %xmm4, %xmm7

        movdqa  rk19(%rip), %xmm10
        movdqa  %xmm5, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm5
        pclmulqdq       $0x0 , %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        xorps   %xmm5, %xmm7

        movdqa  rk1(%rip), %xmm10       #xmm10 has rk1 and rk2
                                        #imm value of pclmulqdq instruction
                                        #will determine which constant to use
        movdqa  %xmm6, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm6
        pclmulqdq       $0x0 , %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        pxor    %xmm6, %xmm7


        # instead of adding 128, we add 128-16 to the loop counter to save
        # one instruction from the loop; instead of a cmp instruction, we
        # use the sign flag with the jl instruction
        add     $128-16, arg3
        jl      _final_reduction_for_128

        # now we have 16+y bytes left to reduce. 16 bytes are in register
        # xmm7 and the rest is in memory. We can fold 16 bytes at a time
        # if y >= 16, so continue folding 16 bytes at a time.
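        # (each pass of the loop below computes, analogously to the
        #  128-byte fold above:
        #       xmm7 = (xmm7.lo64 x rk1) ^ (xmm7.hi64 x rk2)
        #              ^ (next 16 byte-reflected input bytes)
        #  with x denoting carry-less multiplication.)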

_16B_reduction_loop:
        movdqa  %xmm7, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm7
        pclmulqdq       $0x0 , %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        movdqu  (arg2), %xmm0
        pshufb  %xmm11, %xmm0
        pxor    %xmm0 , %xmm7
        add     $16, arg2
        sub     $16, arg3
        # instead of a cmp instruction, we utilize the flags set by the
        # sub above; the jge is equivalent to checking that at least 16
        # more bytes remain in the buffer to be folded
        jge     _16B_reduction_loop

        # now we have 16+z bytes left to reduce, where 0 <= z < 16.
        # first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        # check if any more data to fold. If not, compute the CRC of
        # the final 128 bits
        add     $16, arg3
        je      _128_done

        # here we handle the remaining data, which is less than 16 bytes.
        # since we know that there was data before the pointer, we can
        # offset the input pointer back before the current position to
        # load exactly 16 bytes. After that, the registers need to be
        # adjusted.
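        # (worked example, assuming z = 2 trailing bytes remain: the movdqu
        #  below re-reads the last 16 bytes of the buffer, i.e. 14 bytes
        #  that were already folded plus the 2 new ones; xmm7 and that
        #  reload are then shifted and blended via pshufb_shf_table so the
        #  pair together represents exactly the final 16+2 bytes before
        #  the last fold.)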
_get_last_two_xmms:
        movdqa  %xmm7, %xmm2

        movdqu  -16(arg2, arg3), %xmm1
        pshufb  %xmm11, %xmm1

        # get rid of the extra data that was loaded before
        # load the shift constant
        lea     pshufb_shf_table+16(%rip), %rax
        sub     arg3, %rax
        movdqu  (%rax), %xmm0

        # shift xmm2 to the left by arg3 bytes
        pshufb  %xmm0, %xmm2

        # shift xmm7 to the right by 16-arg3 bytes
        pxor    mask1(%rip), %xmm0
        pshufb  %xmm0, %xmm7
        pblendvb        %xmm2, %xmm1    #xmm0 is implicit

        # fold 16 Bytes
        movdqa  %xmm1, %xmm2
        movdqa  %xmm7, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm7
        pclmulqdq       $0x0 , %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        pxor    %xmm2, %xmm7

_128_done:
        # compute crc of a 128-bit value
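        # (rough outline: the 64b fold below carry-less multiplies the
        #  upper qword of xmm7 by rk5 and xors in the lower qword shifted
        #  up by 64 bits; the 32b fold then multiplies the topmost dword
        #  of that result by rk6 and xors it back into the low 96 bits,
        #  leaving a remainder small enough for the Barrett reduction
        #  that follows.)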
        movdqa  rk5(%rip), %xmm10       # rk5 and rk6 in xmm10
        movdqa  %xmm7, %xmm0

        #64b fold
        pclmulqdq       $0x1, %xmm10, %xmm7
        pslldq  $8 , %xmm0
        pxor    %xmm0, %xmm7

        #32b fold
        movdqa  %xmm7, %xmm0

        pand    mask2(%rip), %xmm0

        psrldq  $12, %xmm7
        pclmulqdq       $0x10, %xmm10, %xmm7
        pxor    %xmm0, %xmm7

        #barrett reduction
_barrett:
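        # (per the Barrett reduction described in the referenced Intel
        #  paper, the next few instructions compute, roughly:
        #       T1  = floor(R / x^32) * rk7     where rk7 = floor(x^64/Q)
        #       T2  = floor(T1 / x^32) * rk8    where rk8 = Q
        #       crc = (R xor T2) mod x^32
        #  all in GF(2); the pextrd then picks the 32-bit result out of
        #  xmm7.)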
        movdqa  rk7(%rip), %xmm10       # rk7 and rk8 in xmm10
        movdqa  %xmm7, %xmm0
        pclmulqdq       $0x01, %xmm10, %xmm7
        pslldq  $4, %xmm7
        pclmulqdq       $0x11, %xmm10, %xmm7

        pslldq  $4, %xmm7
        pxor    %xmm0, %xmm7
        pextrd  $1, %xmm7, %eax

_cleanup:
        # scale the result back to 16 bits
        shr     $16, %eax
        mov     %rcx, %rsp
        ret

########################################################################

.align 16
_less_than_128:

        # check if there is enough buffer to be able to fold 16B at a time
        cmp     $32, arg3
        jl      _less_than_32
        movdqa  SHUF_MASK(%rip), %xmm11

        # now if there is, load the constants
        movdqa  rk1(%rip), %xmm10       # rk1 and rk2 in xmm10

        movd    arg1_low32, %xmm0       # get the initial crc value
        pslldq  $12, %xmm0              # align it to its correct place
        movdqu  (arg2), %xmm7           # load the plaintext
        pshufb  %xmm11, %xmm7           # byte-reflect the plaintext
        pxor    %xmm0, %xmm7


        # update the buffer pointer
        add     $16, arg2

        # update the counter. subtract 32 instead of 16 to save one
        # instruction from the loop
        sub     $32, arg3

        jmp     _16B_reduction_loop


.align 16
_less_than_32:
        # mov initial crc to the return value. this is necessary for
        # zero-length buffers.
        mov     arg1_low32, %eax
        test    arg3, arg3
        je      _cleanup

        movdqa  SHUF_MASK(%rip), %xmm11

        movd    arg1_low32, %xmm0       # get the initial crc value
        pslldq  $12, %xmm0              # align it to its correct place

        cmp     $16, arg3
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  (arg2), %xmm7           # load the plaintext
        pshufb  %xmm11, %xmm7           # byte-reflect the plaintext
        pxor    %xmm0 , %xmm7           # xor the initial crc value
        add     $16, arg2
        sub     $16, arg3
        movdqa  rk1(%rip), %xmm10       # rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


.align 16
_less_than_16_left:
        # use stack space to load data less than 16 bytes, zero-out
        # the 16B in memory first.
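        # (worked example: for a 5-byte tail, the code below copies 4 bytes
        #  with the dword move and the remaining 1 byte with the byte move,
        #  while the rest of the zeroed 16-byte stack slot stays 0, so the
        #  tail can then be processed as a full 16-byte block.)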

        pxor    %xmm1, %xmm1
        mov     %rsp, %r11
        movdqa  %xmm1, (%r11)

        cmp     $4, arg3
        jl      _only_less_than_4

        # backup the counter value
        mov     arg3, %r9
        cmp     $8, arg3
        jl      _less_than_8_left

        # load 8 Bytes
        mov     (arg2), %rax
        mov     %rax, (%r11)
        add     $8, %r11
        sub     $8, arg3
        add     $8, arg2
_less_than_8_left:

        cmp     $4, arg3
        jl      _less_than_4_left

        # load 4 Bytes
        mov     (arg2), %eax
        mov     %eax, (%r11)
        add     $4, %r11
        sub     $4, arg3
        add     $4, arg2
_less_than_4_left:

        cmp     $2, arg3
        jl      _less_than_2_left

        # load 2 Bytes
        mov     (arg2), %ax
        mov     %ax, (%r11)
        add     $2, %r11
        sub     $2, arg3
        add     $2, arg2
_less_than_2_left:
        cmp     $1, arg3
        jl      _zero_left

        # load 1 Byte
        mov     (arg2), %al
        mov     %al, (%r11)
_zero_left:
        movdqa  (%rsp), %xmm7
        pshufb  %xmm11, %xmm7
        pxor    %xmm0 , %xmm7   # xor the initial crc value

        # shl r9, 4
        lea     pshufb_shf_table+16(%rip), %rax
        sub     %r9, %rax
        movdqu  (%rax), %xmm0
        pxor    mask1(%rip), %xmm0

        pshufb  %xmm0, %xmm7
        jmp     _128_done

.align 16
_exact_16_left:
        movdqu  (arg2), %xmm7
        pshufb  %xmm11, %xmm7
        pxor    %xmm0 , %xmm7   # xor the initial crc value

        jmp     _128_done

_only_less_than_4:
        cmp     $3, arg3
        jl      _only_less_than_3

        # load 3 Bytes
        mov     (arg2), %al
        mov     %al, (%r11)

        mov     1(arg2), %al
        mov     %al, 1(%r11)

        mov     2(arg2), %al
        mov     %al, 2(%r11)

        movdqa  (%rsp), %xmm7
        pshufb  %xmm11, %xmm7
        pxor    %xmm0 , %xmm7   # xor the initial crc value

        psrldq  $5, %xmm7

        jmp     _barrett
_only_less_than_3:
        cmp     $2, arg3
        jl      _only_less_than_2

        # load 2 Bytes
        mov     (arg2), %al
        mov     %al, (%r11)

        mov     1(arg2), %al
        mov     %al, 1(%r11)

        movdqa  (%rsp), %xmm7
        pshufb  %xmm11, %xmm7
        pxor    %xmm0 , %xmm7   # xor the initial crc value

        psrldq  $6, %xmm7

        jmp     _barrett
_only_less_than_2:

        # load 1 Byte
        mov     (arg2), %al
        mov     %al, (%r11)

        movdqa  (%rsp), %xmm7
        pshufb  %xmm11, %xmm7
        pxor    %xmm0 , %xmm7   # xor the initial crc value

        psrldq  $7, %xmm7

        jmp     _barrett

ENDPROC(crc_t10dif_pcl)

.section .rodata, "a", @progbits
.align 16
# precomputed constants
# these constants are precomputed from the poly:
# 0x8bb70000 (0x8bb7 scaled to 32 bits)
# Q = 0x18BB70000
# rk1 = 2^(32*3) mod Q << 32
# rk2 = 2^(32*5) mod Q << 32
# rk3 = 2^(32*15) mod Q << 32
# rk4 = 2^(32*17) mod Q << 32
# rk5 = 2^(32*3) mod Q << 32
# rk6 = 2^(32*2) mod Q << 32
# rk7 = floor(2^64/Q)
# rk8 = Q
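# (a hedged C sketch of how constants of the form "2^n mod Q << 32" could
#  be derived; xpow_mod_q is a hypothetical helper, shown only to document
#  the meaning of the formulas above, and is not used by this file:
#
#       #include <stdint.h>
#
#       /* x^n mod Q over GF(2), with Q = 0x18BB70000 (degree 32) */
#       static uint64_t xpow_mod_q(unsigned int n)
#       {
#               uint64_t r = 1;                 /* the polynomial "1" */
#               while (n--) {
#                       r <<= 1;                /* multiply by x */
#                       if (r & (1ULL << 32))   /* degree hit 32: reduce */
#                               r ^= 0x18BB70000ULL;
#               }
#               return r;
#       }
#
#       /* e.g. rk1 would then be xpow_mod_q(32*3) << 32 */
# )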
rk1:
.quad 0x2d56000000000000
rk2:
.quad 0x06df000000000000
rk3:
.quad 0x9d9d000000000000
rk4:
.quad 0x7cf5000000000000
rk5:
.quad 0x2d56000000000000
rk6:
.quad 0x1368000000000000
rk7:
.quad 0x00000001f65a57f8
rk8:
.quad 0x000000018bb70000

rk9:
.quad 0xceae000000000000
rk10:
.quad 0xbfd6000000000000
rk11:
.quad 0x1e16000000000000
rk12:
.quad 0x713c000000000000
rk13:
.quad 0xf7f9000000000000
rk14:
.quad 0x80a6000000000000
rk15:
.quad 0x044c000000000000
rk16:
.quad 0xe658000000000000
rk17:
.quad 0xad18000000000000
rk18:
.quad 0xa497000000000000
rk19:
.quad 0x6ee3000000000000
rk20:
.quad 0xe7b5000000000000



.section .rodata.cst16.mask1, "aM", @progbits, 16
.align 16
mask1:
.octa 0x80808080808080808080808080808080

.section .rodata.cst16.mask2, "aM", @progbits, 16
.align 16
mask2:
.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF

.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:
.octa 0x000102030405060708090A0B0C0D0E0F

.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
.align 32
pshufb_shf_table:
# use these values for shift constants for the pshufb instruction
# different alignments result in values as shown:
#       DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
#       DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
#       DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
#       DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
#       DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
#       DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
#       DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
#       DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
#       DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
#       DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
#       DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
#       DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
#       DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
#       DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
#       DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
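# (crc_t10dif_pcl indexes this table at pshufb_shf_table + 16 - len, where
#  len is the number of trailing bytes, so the 16-byte mask it loads
#  straddles the two octawords below and encodes the matching shift.)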
.octa 0x8f8e8d8c8b8a89888786858483828100
.octa 0x000e0d0c0b0a09080706050403020100