/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast6_s1
.extern cast6_s2
.extern cast6_s3
.extern cast6_s4

/* structure of crypto context */
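/*
 * km: 48 32-bit masking keys (one per round); kr: 48 8-bit rotation keys
 * (12 quad-rounds of 4 rounds each), as laid out by the generic CAST-256
 * (cast6) key schedule.
 */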
#define km	0
#define kr	(12*4*4)

/* s-boxes */
#define s1	cast6_s1
#define s2	cast6_s2
#define s3	cast6_s3
#define s4	cast6_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %rdi

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX  %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rbp
#define RID1d %ebp
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


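/*
 * lookup_32bit() computes one 32-bit word of the round function output in a
 * general-purpose register: the four S-box entries selected by the low 32
 * bits of 'src' are combined into 'dst' with op1/op2/op3.  'interleave_op'
 * lets the caller slip in an extra instruction (e.g. shr_next) so the upper
 * 32 bits of 'src' are ready for a following call.
 */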
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;

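/*
 * F_head() is the first half of the round function: combine the input words
 * with the masking key RKM (op0 is vpaddd/vpxor/vpsubd for f1/f2/f3), rotate
 * each 32-bit word by the per-round rotation amount (RKRF/RKRR), and move the
 * four rotated words into the general-purpose registers gi1/gi2 for the
 * S-box lookups.
 */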
#define F_head(a, x, gi1, gi2, op0) \
	op0	a, RKM, x; \
	vpslld	RKRF, x, RTMP; \
	vpsrld	RKRR, x, x; \
	vpor	RTMP, x, x; \
	\
	vmovq		x, gi1; \
	vpextrq $1,	x, gi2;

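/*
 * F_tail() is the second half of the round function: perform the S-box
 * lookups for all four 32-bit words held in gi1/gi2 and reassemble the
 * results into the 128-bit register x.
 */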
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32,	RFS2; \
	orq		RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32,	RFS1; \
	orq		RFS1, RFS3; \
	\
	vmovq		RFS2, x; \
	vpinsrq $1,	RFS3, x, x;

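/*
 * F_2() applies one keyed round function to both 4-block groups at once:
 * a1 ^= F(b1) and a2 ^= F(b2), with op0..op3 selecting the f1/f2/f3 variant.
 */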
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor	a1, RX, a1; \
	vpxor	a2, RTMP, a2;

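/*
 * The three CAST-256 round function types (cf. RFC 2612), roughly:
 *
 *	f1: a ^= ((S1[Ia] ^ S2[Ib]) - S3[Ic]) + S4[Id],  I = rol32(Km + b, Kr)
 *	f2: a ^= ((S1[Ia] - S2[Ib]) + S3[Ic]) ^ S4[Id],  I = rol32(Km ^ b, Kr)
 *	f3: a ^= ((S1[Ia] + S2[Ib]) ^ S3[Ic]) - S4[Id],  I = rol32(Km - b, Kr)
 *
 * where Ia..Id are the bytes of I (the byte order differs here because the
 * blocks are loaded byte-swapped).
 */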
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

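/*
 * get_round_keys() broadcasts the 32-bit masking key Km[nn] into RKM and
 * extracts the next 5-bit rotation key from the low byte of RKR into RKRF
 * (with 32 - Kr in RKRR), then shifts RKR down a byte so the next round sees
 * its key.
 */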
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM; \
	vpand		R1ST, RKR, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	vpsrldq $1,	RKR, RKR;

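/*
 * Q(n) is one forward quad-round (rounds 4n..4n+3) on all eight blocks:
 * C ^= f1(D); B ^= f2(C); A ^= f3(B); D ^= f1(A).
 */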
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

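/*
 * preload_rkr() loads the 16 rotation keys used by the next four quad-rounds
 * into RKR (one byte per round) and optionally reorders them with a byte
 * shuffle so they are consumed in the order the following Q()/QBAR() calls
 * expect (see the .Lrkr_* masks below).
 */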
#define shuffle(mask) \
	vpshufb		mask, RKR, RKR;

#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask, RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);

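/* 4x4 transpose of 32-bit words across four xmm registers. */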
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

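/*
 * inpack_blocks()/outunpack_blocks() byte-swap the 32-bit words of each block
 * and transpose them so that the A/B/C/D registers each hold one word
 * position from four different blocks (and back again on the way out).
 */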
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	vpshufb rmask,	x2, x2; \
	vpshufb rmask,	x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	vpshufb rmask,	x2, x2; \
	vpshufb rmask,	x3, x3;

.data

.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.L16_mask:
	.byte 16, 16, 16, 16
.L32_mask:
	.byte 32, 0, 0, 0
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

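/*
 * Encrypt eight 16-byte blocks in parallel: CAST-256 is six forward
 * quad-rounds followed by six reverse quad-rounds.
 */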
.align 8
.type __cast6_enc_blk8,@function;

__cast6_enc_blk8:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	pushq %rbp;
	pushq %rbx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %rbp;

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;

.align 8
.type __cast6_dec_blk8,@function;

__cast6_dec_blk8:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	pushq %rbp;
	pushq %rbx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %rbp;

	vmovdqa .Lbswap_mask, RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;

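/*
 * Externally visible entry points, called from the C glue code.  The
 * load_8way/store_8way/store_cbc_8way/load_ctr_8way/store_ctr_8way macros
 * come from the glue_helper-asm-avx.S include.
 */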
.align 8
.global cast6_ecb_enc_8way
.type cast6_ecb_enc_8way,@function;

cast6_ecb_enc_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;

.align 8
.global cast6_ecb_dec_8way
.type cast6_ecb_dec_8way,@function;

cast6_ecb_dec_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;

.align 8
.global cast6_cbc_dec_8way
.type cast6_cbc_dec_8way,@function;

cast6_cbc_dec_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	ret;

.align 8
.global cast6_ctr_8way
.type cast6_ctr_8way,@function;

cast6_ctr_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */

	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	ret;