#!/usr/bin/env perl

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumptions
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#               aes-586.pl          vpaes-x86.pl
#
# Core 2(**)    28.1/41.4/18.3      21.9/25.2(***)
# Nehalem       27.9/40.4/18.1      10.2/11.9
# Atom          70.7/92.1/60.1      61.1/75.4(***)
# Silvermont    45.4/62.9/24.1      49.2/61.1(***)
#
# (*)   "Hyper-threading" in the context refers rather to cache shared
#       among multiple cores, than to specifically Intel HTT. As vast
#       majority of contemporary cores share cache, slower code path
#       is commonplace. In other words "with-hyper-threading-off"
#       results are presented mostly for reference purposes.
#
# (**)  "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
#       pshufb, yet it's respectable +28%/64% improvement on Core 2
#       and +15% on Atom (as implied, over "hyper-threading-safe"
#       code path).
#
#                                               <appro@openssl.org>

# Derive the directory part of this script's path so that x86asm.pl can
# be found relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  STDOUT is then
# aliased to that handle, so anything printed to STDOUT lands in the file.
# Fail loudly instead of silently generating nothing when the file cannot
# be created.
$output = pop;
die "no output file specified" unless defined $output;
open OUT,'>',$output or die "can't open $output: $!";
*STDOUT=*OUT;

&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");

$PREFIX="vpaes";

# Symbolic names for the general-purpose registers used throughout.
my ($round, $base, $magic, $key, $const, $inp, $out)=
   ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");

&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

##
##  Constant tables.
##
##  The tables are emitted back to back starting at _vpaes_consts.  Each
##  $k_* variable is the byte offset of a table relative to
##  _vpaes_consts+0x30, which is the address the public entry points
##  compute into $const; hence the first two tables have negative offsets.
##
&set_label("_vpaes_consts",64);
$k_inv=-0x30;           # inv, inva
    &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
    &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;           # s0F
    &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;            # input transform (lo, hi)
    &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
    &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;            # sb1u, sb1t
    &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
    &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;            # sb2u, sb2t
    &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
    &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;            # sbou, sbot
    &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
    &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;     # mc_forward
    &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
    &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
    &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
    &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;    # mc_backward
    &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
    &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
    &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
    &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;            # sr
    &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
    &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
    &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
    &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;          # rcon
    &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;           # s63: all equal to 0x63 transformed
    &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;           # output transform
    &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
    &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;        # deskew tables: inverts the sbox's "skew"
    &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
    &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
##  Decryption stuff
##  Key schedule constants
##
$k_dksd=0x1a0;          # decryption key schedule: invskew x*D
    &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
    &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;          # decryption key schedule: invskew x*B
    &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
    &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;          # decryption key schedule: invskew x*E + 0x63
    &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
    &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;          # decryption key schedule: invskew x*9
    &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
    &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
##  Decryption stuff
##  Round function constants
##
$k_dipt=0x220;          # decryption input transform
    &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
    &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);

$k_dsb9=0x240;          # decryption sbox output *9*u, *9*t
    &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
    &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;          # decryption sbox output *D*u, *D*t
    &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
    &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;          # decryption sbox output *B*u, *B*t
    &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
    &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;          # decryption sbox output *E*u, *E*t
    &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
    &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;          # decryption sbox final output
    &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
    &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz  ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align  (64);

##
##  _vpaes_preheat
##
##  Completes the position-independent address of the constant tables and
##  loads the two constants the encrypt/decrypt cores keep in registers.
##
##  Inputs:
##     %ebp   = _vpaes_consts+0x30-pic_point (set by the caller's lea)
##     (%esp) = return address; every caller places its pic_point label
##              directly after the call, so this equals pic_point
##
##  Outputs:
##     %ebp   = _vpaes_consts+0x30
##     %xmm7  = table at $k_inv
##     %xmm6  = table at $k_s0F
##
&function_begin_B("_vpaes_preheat");
    &add    ($const,&DWP(0,"esp"));         # add pic_point to the biased offset
    &movdqa ("xmm7",&QWP($k_inv,$const));
    &movdqa ("xmm6",&QWP($k_s0F,$const));
    &ret    ();
&function_end_B("_vpaes_preheat");

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  The loop branches (jnz) on the flags left by "sub $round,1"; only SSE
##  instructions are emitted between that sub and the branch.
##
&function_begin_B("_vpaes_encrypt_core");
    &mov    ($magic,16);
    &mov    ($round,&DWP(240,$key));
    &movdqa ("xmm1","xmm6");
    &movdqa ("xmm2",&QWP($k_ipt,$const));
    &pandn  ("xmm1","xmm0");
    &pand   ("xmm0","xmm6");
    &movdqu ("xmm5",&QWP(0,$key));
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP($k_ipt+16,$const));
    &pxor   ("xmm2","xmm5");
    &psrld  ("xmm1",4);
    &add    ($key,16);
    &pshufb ("xmm0","xmm1");
    &lea    ($base,&DWP($k_mc_backward,$const));
    &pxor   ("xmm0","xmm2");
    &jmp    (&label("enc_entry"));


&set_label("enc_loop",16);
    # middle of middle round
    &movdqa ("xmm4",&QWP($k_sb1,$const));       # 4 : sb1u
    &movdqa ("xmm0",&QWP($k_sb1+16,$const));    # 0 : sb1t
    &pshufb ("xmm4","xmm2");                    # 4 = sb1u
    &pshufb ("xmm0","xmm3");                    # 0 = sb1t
    &pxor   ("xmm4","xmm5");                    # 4 = sb1u + k
    &movdqa ("xmm5",&QWP($k_sb2,$const));       # 5 : sb2u
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &movdqa ("xmm1",&QWP(-0x40,$base,$magic));  # .Lk_mc_forward[]
    &pshufb ("xmm5","xmm2");                    # 5 = sb2u
    &movdqa ("xmm2",&QWP($k_sb2+16,$const));    # 2 : sb2t
    &movdqa ("xmm4",&QWP(0,$base,$magic));      # .Lk_mc_backward[]
    &pshufb ("xmm2","xmm3");                    # 2 = sb2t
    &movdqa ("xmm3","xmm0");                    # 3 = A
    &pxor   ("xmm2","xmm5");                    # 2 = 2A
    &pshufb ("xmm0","xmm1");                    # 0 = B
    &add    ($key,16);                          # next key
    &pxor   ("xmm0","xmm2");                    # 0 = 2A+B
    &pshufb ("xmm3","xmm4");                    # 3 = D
    &add    ($magic,16);                        # next mc
    &pxor   ("xmm3","xmm0");                    # 3 = 2A+B+D
    &pshufb ("xmm0","xmm1");                    # 0 = 2B+C
    &and    ($magic,0x30);                      # ... mod 4
    &sub    ($round,1);                         # nr--
    &pxor   ("xmm0","xmm3");                    # 0 = 2A+3B+C+D

&set_label("enc_entry");
    # top of round
    &movdqa ("xmm1","xmm6");                    # 1 : i
    &movdqa ("xmm5",&QWP($k_inv+16,$const));    # 5 : a/k
    &pandn  ("xmm1","xmm0");                    # 1 = i<<4
    &psrld  ("xmm1",4);                         # 1 = i
    &pand   ("xmm0","xmm6");                    # 0 = k
    &pshufb ("xmm5","xmm0");                    # 5 = a/k
    &movdqa ("xmm3","xmm7");                    # 3 : 1/i
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &movdqa ("xmm4","xmm7");                    # 4 : 1/j
    &pxor   ("xmm3","xmm5");                    # 3 = iak = 1/i + a/k
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &movdqa ("xmm2","xmm7");                    # 2 : 1/iak
    &pxor   ("xmm4","xmm5");                    # 4 = jak = 1/j + a/k
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &movdqa ("xmm3","xmm7");                    # 3 : 1/jak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &movdqu ("xmm5",&QWP(0,$key));              # 5 = round key
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &jnz    (&label("enc_loop"));

    # middle of last round
    &movdqa ("xmm4",&QWP($k_sbo,$const));       # 4 : sbou  .Lk_sbo
    &movdqa ("xmm0",&QWP($k_sbo+16,$const));    # 0 : sbot  .Lk_sbo+16
    &pshufb ("xmm4","xmm2");                    # 4 = sbou
    &pxor   ("xmm4","xmm5");                    # 4 = sbou + k
    &pshufb ("xmm0","xmm3");                    # 0 = sbot
    &movdqa ("xmm1",&QWP(0x40,$base,$magic));   # .Lk_sr[]
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &pshufb ("xmm0","xmm1");
    &ret    ();
&function_end_B("_vpaes_encrypt_core");

##
##  Decryption core
##
##  Same API as encryption core.
##
##  $base points at the $k_dsbd table, so the other decryption sbox tables
##  are addressed relative to it: -0x20 = dsb9, 0x20 = dsbb, 0x40 = dsbe,
##  0x60 = dsbo.  $magic ends up holding the address of the $k_sr entry
##  used for the final permutation.
##
&function_begin_B("_vpaes_decrypt_core");
    &lea    ($base,&DWP($k_dsbd,$const));
    &mov    ($round,&DWP(240,$key));
    &movdqa ("xmm1","xmm6");
    &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
    &pandn  ("xmm1","xmm0");
    &mov    ($magic,$round);
    &psrld  ("xmm1",4);
    &movdqu ("xmm5",&QWP(0,$key));
    &shl    ($magic,4);
    &pand   ("xmm0","xmm6");
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
    &xor    ($magic,0x30);
    &pshufb ("xmm0","xmm1");
    &and    ($magic,0x30);
    &pxor   ("xmm2","xmm5");
    &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
    &pxor   ("xmm0","xmm2");
    &add    ($key,16);
    &lea    ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
    &jmp    (&label("dec_entry"));

&set_label("dec_loop",16);
##
##  Inverse mix columns
##
    &movdqa ("xmm4",&QWP(-0x20,$base));         # 4 : sb9u
    &movdqa ("xmm1",&QWP(-0x10,$base));         # 1 : sb9t
    &pshufb ("xmm4","xmm2");                    # 4 = sb9u
    &pshufb ("xmm1","xmm3");                    # 1 = sb9t
    &pxor   ("xmm0","xmm4");
    &movdqa ("xmm4",&QWP(0,$base));             # 4 : sbdu
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &movdqa ("xmm1",&QWP(0x10,$base));          # 1 : sbdt

    &pshufb ("xmm4","xmm2");                    # 4 = sbdu
    &pshufb ("xmm0","xmm5");                    # MC ch
    &pshufb ("xmm1","xmm3");                    # 1 = sbdt
    &pxor   ("xmm0","xmm4");                    # 0 = ch
    &movdqa ("xmm4",&QWP(0x20,$base));          # 4 : sbbu
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &movdqa ("xmm1",&QWP(0x30,$base));          # 1 : sbbt

    &pshufb ("xmm4","xmm2");                    # 4 = sbbu
    &pshufb ("xmm0","xmm5");                    # MC ch
    &pshufb ("xmm1","xmm3");                    # 1 = sbbt
    &pxor   ("xmm0","xmm4");                    # 0 = ch
    &movdqa ("xmm4",&QWP(0x40,$base));          # 4 : sbeu
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &movdqa ("xmm1",&QWP(0x50,$base));          # 1 : sbet

    &pshufb ("xmm4","xmm2");                    # 4 = sbeu
    &pshufb ("xmm0","xmm5");                    # MC ch
    &pshufb ("xmm1","xmm3");                    # 1 = sbet
    &pxor   ("xmm0","xmm4");                    # 0 = ch
    &add    ($key,16);                          # next round key
    &palignr("xmm5","xmm5",12);
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &sub    ($round,1);                         # nr--

&set_label("dec_entry");
    # top of round
    &movdqa ("xmm1","xmm6");                    # 1 : i
    &movdqa ("xmm2",&QWP($k_inv+16,$const));    # 2 : a/k
    &pandn  ("xmm1","xmm0");                    # 1 = i<<4
    &pand   ("xmm0","xmm6");                    # 0 = k
    &psrld  ("xmm1",4);                         # 1 = i
    &pshufb ("xmm2","xmm0");                    # 2 = a/k
    &movdqa ("xmm3","xmm7");                    # 3 : 1/i
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &movdqa ("xmm4","xmm7");                    # 4 : 1/j
    &pxor   ("xmm3","xmm2");                    # 3 = iak = 1/i + a/k
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &pxor   ("xmm4","xmm2");                    # 4 = jak = 1/j + a/k
    &movdqa ("xmm2","xmm7");                    # 2 : 1/iak
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &movdqa ("xmm3","xmm7");                    # 3 : 1/jak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &movdqu ("xmm0",&QWP(0,$key));              # 0 = round key
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &jnz    (&label("dec_loop"));               # flags from "sub $round,1"

    # middle of last round
    &movdqa ("xmm4",&QWP(0x60,$base));          # 4 : sbou
    &pshufb ("xmm4","xmm2");                    # 4 = sbou
    &pxor   ("xmm4","xmm0");                    # 4 = sbou + k
    &movdqa ("xmm0",&QWP(0x70,$base));          # 0 : sbot
    &movdqa ("xmm2",&QWP(0,$magic));            # sr entry selected above
    &pshufb ("xmm0","xmm3");                    # 0 = sbot
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &pshufb ("xmm0","xmm2");
    &ret    ();
&function_end_B("_vpaes_decrypt_core");

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  _vpaes_schedule_core
##
##  Inputs (as set up by the set_*_key entry points):
##     %esi   = user key
##     %eax   = key length in bits
##     %edx   = where to write the key schedule
##     %ecx   = offset into the $k_sr table
##     %edi   = zero when encrypting, non-zero when decrypting
##     %ebp   = _vpaes_consts+0x30-pic_point; the return address is added
##              on entry to obtain the absolute table address
##
##  The 32-bit port has no %xmm8, so the rcon value lives in a stack slot
##  (4(%esp) here, 8(%esp) inside the subroutines called from here), and
##  20(%esp) is used as scratch for %xmm7 in the 256-bit schedule.
##
&function_begin_B("_vpaes_schedule_core");
    &add    ($const,&DWP(0,"esp"));
    &movdqu ("xmm0",&QWP(0,$inp));              # load key (unaligned)
    &movdqa ("xmm2",&QWP($k_rcon,$const));      # load rcon

    # input transform
    &movdqa ("xmm3","xmm0");
    &lea    ($base,&DWP($k_ipt,$const));
    &movdqa (&QWP(4,"esp"),"xmm2");             # xmm8
    &call   ("_vpaes_schedule_transform");
    &movdqa ("xmm7","xmm0");

    &test   ($out,$out);
    &jnz    (&label("schedule_am_decrypting"));

    # encrypting, output zeroth round key after transform
    &movdqu (&QWP(0,$key),"xmm0");
    &jmp    (&label("schedule_go"));

&set_label("schedule_am_decrypting");
    # decrypting, output zeroth round key after shiftrows
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm3","xmm1");
    &movdqu (&QWP(0,$key),"xmm3");
    &xor    ($magic,0x30);

&set_label("schedule_go");
    &cmp    ($round,192);
    &ja     (&label("schedule_256"));
    &je     (&label("schedule_192"));
    # 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
&set_label("schedule_128");
    &mov    ($round,10);

&set_label("loop_schedule_128");
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");         # write output
    &jmp    (&label("loop_schedule_128"));

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
&set_label("schedule_192",16);
    &movdqu ("xmm0",&QWP(8,$inp));              # load key part 2 (very unaligned)
    &call   ("_vpaes_schedule_transform");      # input transform
    &movdqa ("xmm6","xmm0");                    # save short part
    &pxor   ("xmm4","xmm4");                    # clear 4
    &movhlps("xmm6","xmm4");                    # clobber low side with zeros
    &mov    ($round,4);

&set_label("loop_schedule_192");
    &call   ("_vpaes_schedule_round");
    &palignr("xmm0","xmm6",8);
    &call   ("_vpaes_schedule_mangle");         # save key n
    &call   ("_vpaes_schedule_192_smear");
    &call   ("_vpaes_schedule_mangle");         # save key n+1
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");         # save key n+2
    &call   ("_vpaes_schedule_192_smear");
    &jmp    (&label("loop_schedule_192"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
    &movdqu ("xmm0",&QWP(16,$inp));             # load key part 2 (unaligned)
    &call   ("_vpaes_schedule_transform");      # input transform
    &mov    ($round,7);

&set_label("loop_schedule_256");
    &call   ("_vpaes_schedule_mangle");         # output low result
    &movdqa ("xmm6","xmm0");                    # save cur_lo in xmm6

    # high round
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");

    # low round. swap xmm7 and xmm6
    &pshufd ("xmm0","xmm0",0xFF);
    &movdqa (&QWP(20,"esp"),"xmm7");
    &movdqa ("xmm7","xmm6");
    &call   ("_vpaes_schedule_low_round");
    &movdqa ("xmm7",&QWP(20,"esp"));

    &jmp    (&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
    # schedule last round key from xmm0
    &lea    ($base,&DWP($k_deskew,$const));
    &test   ($out,$out);
    &jnz    (&label("schedule_mangle_last_dec"));

    # encrypting
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm0","xmm1");                    # output permute
    &lea    ($base,&DWP($k_opt,$const));        # prepare to output transform
    &add    ($key,32);

&set_label("schedule_mangle_last_dec");
    &add    ($key,-16);
    &pxor   ("xmm0",&QWP($k_s63,$const));
    &call   ("_vpaes_schedule_transform");      # output transform
    &movdqu (&QWP(0,$key),"xmm0");              # save last key

    # cleanup: zero every xmm register before returning
    &pxor   ("xmm0","xmm0");
    &pxor   ("xmm1","xmm1");
    &pxor   ("xmm2","xmm2");
    &pxor   ("xmm3","xmm3");
    &pxor   ("xmm4","xmm4");
    &pxor   ("xmm5","xmm5");
    &pxor   ("xmm6","xmm6");
    &pxor   ("xmm7","xmm7");
    &ret    ();
&function_end_B("_vpaes_schedule_core");

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
##  Clobbers %xmm1 (zeroed here and used to clear the low half of %xmm6).
##
&function_begin_B("_vpaes_schedule_192_smear");
    &pshufd ("xmm1","xmm6",0x80);               # d c 0 0 -> c 0 0 0
    &pshufd ("xmm0","xmm7",0xFE);               # b a _ _ -> b b b a
    &pxor   ("xmm6","xmm1");                    # -> c+d c 0 0
    &pxor   ("xmm1","xmm1");
    &pxor   ("xmm6","xmm0");                    # -> b+c+d b+c b a
    &movdqa ("xmm0","xmm6");
    &movhlps("xmm6","xmm1");                    # clobber low side with zeros
    &ret    ();
&function_end_B("_vpaes_schedule_192_smear");

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from the low byte of the "xmm8" stack slot (8(%esp) as seen
##  from inside this function), then rotates that slot for the next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
##  _vpaes_schedule_low_round is a second entry point into the same body
##  that skips the rcon and rotation steps.
##
&function_begin_B("_vpaes_schedule_round");
    # extract rcon from xmm8
    &movdqa ("xmm2",&QWP(8,"esp"));             # xmm8
    &pxor   ("xmm1","xmm1");
    &palignr("xmm1","xmm2",15);
    &palignr("xmm2","xmm2",15);
    &pxor   ("xmm7","xmm1");

    # rotate
    &pshufd ("xmm0","xmm0",0xFF);
    &palignr("xmm0","xmm0",1);

    # fall through...
    &movdqa (&QWP(8,"esp"),"xmm2");             # xmm8

    # low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
    # smear xmm7
    &movdqa ("xmm1","xmm7");
    &pslldq ("xmm7",4);
    &pxor   ("xmm7","xmm1");
    &movdqa ("xmm1","xmm7");
    &pslldq ("xmm7",8);
    &pxor   ("xmm7","xmm1");
    &pxor   ("xmm7",&QWP($k_s63,$const));

    # subbyte
    &movdqa ("xmm4",&QWP($k_s0F,$const));       # 4 : s0F
    &movdqa ("xmm5",&QWP($k_inv,$const));       # 5 : inv
    &movdqa ("xmm1","xmm4");
    &pandn  ("xmm1","xmm0");
    &psrld  ("xmm1",4);                         # 1 = i
    &pand   ("xmm0","xmm4");                    # 0 = k
    &movdqa ("xmm2",&QWP($k_inv+16,$const));    # 2 : a/k
    &pshufb ("xmm2","xmm0");                    # 2 = a/k
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &movdqa ("xmm3","xmm5");                    # 3 : 1/i
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &pxor   ("xmm3","xmm2");                    # 3 = iak = 1/i + a/k
    &movdqa ("xmm4","xmm5");                    # 4 : 1/j
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &pxor   ("xmm4","xmm2");                    # 4 = jak = 1/j + a/k
    &movdqa ("xmm2","xmm5");                    # 2 : 1/iak
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &movdqa ("xmm3","xmm5");                    # 3 : 1/jak
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &movdqa ("xmm4",&QWP($k_sb1,$const));       # 4 : sb1u
    &pshufb ("xmm4","xmm2");                    # 4 = sb1u
    &movdqa ("xmm0",&QWP($k_sb1+16,$const));    # 0 : sb1t
    &pshufb ("xmm0","xmm3");                    # 0 = sb1t
    &pxor   ("xmm0","xmm4");                    # 0 = sbox output

    # add in smeared stuff
    &pxor   ("xmm0","xmm7");
    &movdqa ("xmm7","xmm0");
    &ret    ();
&function_end_B("_vpaes_schedule_round");

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%ebx)
##
##  The low nibble of each byte indexes the table at 0(%ebx), the high
##  nibble the table at 16(%ebx); the two lookups are xored together.
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
    &movdqa ("xmm2",&QWP($k_s0F,$const));
    &movdqa ("xmm1","xmm2");
    &pandn  ("xmm1","xmm0");
    &psrld  ("xmm1",4);                         # 1 = high nibbles
    &pand   ("xmm0","xmm2");                    # 0 = low nibbles
    &movdqa ("xmm2",&QWP(0,$base));
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP(16,$base));
    &pshufb ("xmm0","xmm1");
    &pxor   ("xmm0","xmm2");
    &ret    ();
&function_end_B("_vpaes_schedule_transform");

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%edx), and increments or decrements it
##  Keeps track of round number mod 4 in %ecx
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##  The decrypt path also overwrites %esi with the address of the $k_dksd
##  table.
##
&function_begin_B("_vpaes_schedule_mangle");
    &movdqa ("xmm4","xmm0");                    # save xmm0 for later
    &movdqa ("xmm5",&QWP($k_mc_forward,$const));
    &test   ($out,$out);
    &jnz    (&label("schedule_mangle_dec"));

    # encrypting
    &add    ($key,16);
    &pxor   ("xmm4",&QWP($k_s63,$const));
    &pshufb ("xmm4","xmm5");
    &movdqa ("xmm3","xmm4");
    &pshufb ("xmm4","xmm5");
    &pxor   ("xmm3","xmm4");
    &pshufb ("xmm4","xmm5");
    &pxor   ("xmm3","xmm4");

    &jmp    (&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec",16);
    # inverse mix columns
    &movdqa ("xmm2",&QWP($k_s0F,$const));
    &lea    ($inp,&DWP($k_dksd,$const));
    &movdqa ("xmm1","xmm2");
    &pandn  ("xmm1","xmm4");
    &psrld  ("xmm1",4);                         # 1 = hi
    &pand   ("xmm4","xmm2");                    # 4 = lo

    &movdqa ("xmm2",&QWP(0,$inp));
    &pshufb ("xmm2","xmm4");
    &movdqa ("xmm3",&QWP(0x10,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x20,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x30,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x40,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x50,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x60,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x70,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");

    &add    ($key,-16);

&set_label("schedule_mangle_both");
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm3","xmm1");
    &add    ($magic,-16);
    &and    ($magic,0x30);
    &movdqu (&QWP(0,$key),"xmm3");
    &ret    ();
&function_end_B("_vpaes_schedule_mangle");

#
# Interface to OpenSSL
#

##
##  ${PREFIX}_set_encrypt_key(inp, bits, key)
##
##  Expands the user key at inp (bits long) into an encryption key
##  schedule at key.  Stores bits/32+5 at offset 240 of key and returns
##  0 in %eax.
##
##  A 16-byte aligned scratch frame is carved out below the incoming
##  stack pointer; the original %esp is kept at 48(%esp) and restored
##  before returning.
##
&function_begin("${PREFIX}_set_encrypt_key");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($round,&wparam(1));                # bits
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);

    &mov    ($base,$round);
    &shr    ($base,5);
    &add    ($base,5);
    &mov    (&DWP(240,$key),$base);             # AES_KEY->rounds = nbits/32+5;
    &mov    ($magic,0x30);
    &mov    ($out,0);                           # 0 = encrypting

    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_schedule_core");
&set_label("pic_point");

    &mov    ("esp",&DWP(48,"esp"));
    &xor    ("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");

##
##  ${PREFIX}_set_decrypt_key(inp, bits, key)
##
##  Expands the user key at inp (bits long) into a decryption key
##  schedule at key.  Stores bits/32+5 at offset 240 of key, then hands
##  _vpaes_schedule_core a write pointer of key+16+16*(bits/32+5) with
##  the decrypt flag set.  Returns 0 in %eax.
##
##  Uses the same aligned scratch frame as ${PREFIX}_set_encrypt_key: the
##  original %esp is kept at 48(%esp) and restored before returning.
##
&function_begin("${PREFIX}_set_decrypt_key");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($round,&wparam(1));                # bits
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);

    &mov    ($base,$round);
    &shr    ($base,5);
    &add    ($base,5);
    &mov    (&DWP(240,$key),$base);             # AES_KEY->rounds = nbits/32+5;
    &shl    ($base,4);
    &lea    ($key,&DWP(16,$key,$base));

    &mov    ($out,1);                           # 1 = decrypting
    &mov    ($magic,$round);
    &shr    ($magic,1);
    &and    ($magic,32);
    &xor    ($magic,32);                        # nbits==192?0:32;

    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_schedule_core");
&set_label("pic_point");

    &mov    ("esp",&DWP(48,"esp"));
    &xor    ("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");

##
##  ${PREFIX}_encrypt(inp, out, key)
##
##  Encrypts the 16 bytes at inp with the key schedule at key and writes
##  the 16-byte result to out.  Both buffers are accessed with unaligned
##  loads/stores.
##
&function_begin("${PREFIX}_encrypt");
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($out,&wparam(1));                  # out
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);             # remember original %esp

    &movdqu ("xmm0",&QWP(0,$inp));
    &call   ("_vpaes_encrypt_core");
    &movdqu (&QWP(0,$out),"xmm0");

    &mov    ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");

##
##  ${PREFIX}_decrypt(inp, out, key)
##
##  Decrypts the 16 bytes at inp with the key schedule at key and writes
##  the 16-byte result to out.  Both buffers are accessed with unaligned
##  loads/stores.
##
&function_begin("${PREFIX}_decrypt");
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($out,&wparam(1));                  # out
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);             # remember original %esp

    &movdqu ("xmm0",&QWP(0,$inp));
    &call   ("_vpaes_decrypt_core");
    &movdqu (&QWP(0,$out),"xmm0");

    &mov    ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");

##
##  ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc)
##
##  CBC-processes whole 16-byte blocks from inp to out; enc==0 selects
##  decryption, anything else encryption.  Returns without touching
##  anything when len < 16.  The loops stop once fewer than 16 bytes
##  remain, so a trailing partial block is not processed.  The final
##  chaining value is written back to ivp.
##
##  Scratch frame layout (relative to the aligned %esp):
##      0  out - inp, so that (slot + inp) is the current output address
##      4  key
##      8  ivp
##     16  current IV        (decrypt only)
##     32  next IV           (decrypt only)
##     48  original %esp
##
&function_begin("${PREFIX}_cbc_encrypt");
    &mov    ($inp,&wparam(0));                  # inp
    &mov    ($out,&wparam(1));                  # out
    &mov    ($round,&wparam(2));                # len
    &mov    ($key,&wparam(3));                  # key
    &sub    ($round,16);
    &jc     (&label("cbc_abort"));
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($const,&wparam(4));                # ivp
    &and    ($base,-16);
    &mov    ($magic,&wparam(5));                # enc
    &xchg   ($base,"esp");                      # alloca
    &movdqu ("xmm1",&QWP(0,$const));            # load IV
    &sub    ($out,$inp);
    &mov    (&DWP(48,"esp"),$base);

    &mov    (&DWP(0,"esp"),$out);               # save out
    &mov    (&DWP(4,"esp"),$key);               # save key
    &mov    (&DWP(8,"esp"),$const);             # save ivp
    &mov    ($out,$round);                      # $out works as $len

    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &cmp    ($magic,0);
    &je     (&label("cbc_dec_loop"));
    &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
    &movdqu ("xmm0",&QWP(0,$inp));              # load input
    &pxor   ("xmm0","xmm1");                    # inp^=iv
    &call   ("_vpaes_encrypt_core");
    &mov    ($base,&DWP(0,"esp"));              # restore out
    &mov    ($key,&DWP(4,"esp"));               # restore key
    &movdqa ("xmm1","xmm0");
    &movdqu (&QWP(0,$base,$inp),"xmm0");        # write output
    &lea    ($inp,&DWP(16,$inp));
    &sub    ($out,16);
    &jnc    (&label("cbc_enc_loop"));
    &jmp    (&label("cbc_done"));

&set_label("cbc_dec_loop",16);
    &movdqu ("xmm0",&QWP(0,$inp));              # load input
    &movdqa (&QWP(16,"esp"),"xmm1");            # save IV
    &movdqa (&QWP(32,"esp"),"xmm0");            # save future IV
    &call   ("_vpaes_decrypt_core");
    &mov    ($base,&DWP(0,"esp"));              # restore out
    &mov    ($key,&DWP(4,"esp"));               # restore key
    &pxor   ("xmm0",&QWP(16,"esp"));            # out^=iv
    &movdqa ("xmm1",&QWP(32,"esp"));            # load next IV
    &movdqu (&QWP(0,$base,$inp),"xmm0");        # write output
    &lea    ($inp,&DWP(16,$inp));
    &sub    ($out,16);
    &jnc    (&label("cbc_dec_loop"));

&set_label("cbc_done");
    &mov    ($base,&DWP(8,"esp"));              # restore ivp
    &mov    ("esp",&DWP(48,"esp"));
    &movdqu (&QWP(0,$base),"xmm1");             # write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

&asm_finish();

# Buffered write errors only surface when the handle is closed, so a failed
# close must abort instead of leaving a truncated output file unnoticed.
close STDOUT or die "error closing STDOUT: $!";