/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables, defined outside this file.  xor2ror16 indexes each of
 * them with a zero-extended byte and a scale of 8 (8-byte entries).
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases for the tables declared above. */
#define sp10011110	camellia_sp10011110
#define sp22000222	camellia_sp22000222
#define sp03303033	camellia_sp03303033
#define sp00444404	camellia_sp00444404
#define sp02220222	camellia_sp02220222
#define sp30333033	camellia_sp30333033
#define sp44044404	camellia_sp44044404
#define sp11101110	camellia_sp11101110

#define CAMELLIA_TABLE_BYTE_LEN	272

/* struct camellia_ctx: byte offsets of the fields used here */
#define key_table	0
#define key_length	CAMELLIA_TABLE_BYTE_LEN

/* register macros */

/* Context pointer and I/O pointer.  RIO shares %rsi with RT0 below. */
#define CTX	%rdi
#define RIO	%rsi
#define RIOd	%esi

/*
 * Block state.  The digit selects the block (0, or 1 in the 2-way code);
 * the d / bl / bh suffixes name the 32-bit, low-byte and second-byte views
 * of the same register.
 */
#define RAB0	%rax
#define RAB0d	%eax
#define RAB0bl	%al
#define RAB0bh	%ah

#define RCD0	%rcx
#define RCD0d	%ecx
#define RCD0bl	%cl
#define RCD0bh	%ch

#define RAB1	%rbx
#define RAB1d	%ebx
#define RAB1bl	%bl
#define RAB1bh	%bh

#define RCD1	%rdx
#define RCD1d	%edx
#define RCD1bl	%dl
#define RCD1bh	%dh

/* Temporaries used inside the round macros. */
#define RT0	%rsi
#define RT0d	%esi

#define RT1	%rbp
#define RT1d	%ebp

#define RT2	%r8
#define RT2d	%r8d
#define RT2bl	%r8b

/*
 * Values kept across the rounds by the entry points:
 *   RXOR - xor flag in the encrypt routines; reused for other purposes
 *          by the decrypt routines
 *   RRBP - copy of the incoming %rbp, since RT1 is %rbp
 *   RDST - destination pointer, since RIO/RT0 are %rsi
 */
#define RXOR	%r9
#define RXORd	%r9d
#define RXORbl	%r9b

#define RRBP	%r10
#define RDST	%r11
91
/*
 * xor2ror16(tab_lo, tab_hi, idx_hi, idx_lo, src, acc)
 *
 *   acc ^= tab_lo[bits 7:0 of src] ^ tab_hi[bits 15:8 of src];
 *   src  = src rotated right by 16 bits;
 *
 * Table entries are 8 bytes wide.  Both index bytes are extracted before
 * src is rotated.  idx_hi and idx_lo are scratch registers and are
 * overwritten.
 */
#define xor2ror16(tab_lo, tab_hi, idx_hi, idx_lo, src, acc) \
	movzbl src ## bl, idx_lo ## d;	/* index for tab_lo */ \
	movzbl src ## bh, idx_hi ## d;	/* index for tab_hi */ \
	rorq $16, src;			/* bring the next two bytes down */ \
	xorq tab_lo(, idx_lo, 8), acc; \
	xorq tab_hi(, idx_hi, 8), acc;
98
/**********************************************************************
  1-way camellia
 **********************************************************************/

/*
 * roundsm(src, subkey, dst): one round on block 0.
 *
 * RT2 starts out as the 8-byte subkey at index subkey.  Four xor2ror16
 * steps then consume all eight bytes of src ## 0 (the four 16-bit rotations
 * add up to 64, so src ## 0 ends with its original value), XORing the
 * table entries alternately into dst ## 0 and RT2.  Finally RT2 is folded
 * into dst ## 0, so in total
 *
 *   dst ## 0 ^= subkey ^ (all eight table lookups)
 *
 * Clobbers RT0, RT1 and RT2.
 */
#define roundsm(src, subkey, dst) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, src ## 0, dst ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, src ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, src ## 0, dst ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, src ## 0, RT2); \
	\
	xorq RT2, dst ## 0;
111
/*
 * fls(left, right, kl, kr): key-dependent mixing of left ## 0 and
 * right ## 0 with the 8-byte subkeys at indexes kl and kr.
 *
 * With .lo / .hi denoting the low / high 32 bits of a 64-bit value, the
 * four steps are, in order (each one sees the result of the previous):
 *
 *   left.hi  ^= rol32(left.lo  & kl.lo, 1);
 *   right.lo ^= right.hi | kr.hi;
 *   left.lo  ^= left.hi  | kl.hi;
 *   right.hi ^= rol32(right.lo & kr.lo, 1);
 *
 * Clobbers RT0, RT1 and RT2.
 */
#define fls(left, right, kl, kr) \
	/* left.hi ^= rol32(left.lo & kl.lo, 1) */ \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl left ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, left ## 0; \
	/* right.lo ^= right.hi | kr.hi */ \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq right ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, right ## 0; \
	\
	/* left.lo ^= left.hi | kl.hi */ \
	movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
	orq left ## 0, RT2; \
	shrq $32, RT2; \
	xorq RT2, left ## 0; \
	/* right.hi ^= rol32(right.lo & kr.lo, 1) */ \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
	andl right ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, right ## 0;
132
/*
 * enc_rounds(base): six rounds using subkeys base + 2 .. base + 7 in
 * ascending order, alternating which half is read and which is updated.
 */
#define enc_rounds(base) \
	roundsm(RAB, base + 2, RCD); \
	roundsm(RCD, base + 3, RAB); \
	roundsm(RAB, base + 4, RCD); \
	roundsm(RCD, base + 5, RAB); \
	roundsm(RAB, base + 6, RCD); \
	roundsm(RCD, base + 7, RAB);

/* enc_fls(base): fls with kl = subkey base, kr = subkey base + 1. */
#define enc_fls(base) \
	fls(RAB, RCD, base + 0, base + 1);

/*
 * enc_inpack(): load the 16-byte block at (RIO) into RAB0 / RCD0.
 * bswapq followed by a rotate by 32 leaves each 32-bit word of the input
 * byte-swapped in place.  RAB0 is then XORed with subkey 0.
 */
#define enc_inpack() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX), RAB0;	/* subkey 0 */

/*
 * enc_outunpack(op, maxreg): XOR the subkey at index maxreg (a register,
 * scaled by 8) into RCD0, undo the inpack byte order and emit the block:
 * RCD0 goes to bytes 0-7 and RAB0 to bytes 8-15 of (RIO).  op is pasted
 * with q to form the instruction: mov stores, xor XORs into memory.
 */
#define enc_outunpack(op, maxreg) \
	xorq key_table(CTX, maxreg, 8), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO);
161
/*
 * dec_rounds(base): six rounds using subkeys base + 7 .. base + 2 in
 * descending order, i.e. the subkey order of enc_rounds reversed.
 */
#define dec_rounds(base) \
	roundsm(RAB, base + 7, RCD); \
	roundsm(RCD, base + 6, RAB); \
	roundsm(RAB, base + 5, RCD); \
	roundsm(RCD, base + 4, RAB); \
	roundsm(RAB, base + 3, RCD); \
	roundsm(RCD, base + 2, RAB);

/* dec_fls(base): fls with kl = subkey base + 1, kr = subkey base. */
#define dec_fls(base) \
	fls(RAB, RCD, base + 1, base + 0);

/*
 * dec_inpack(maxreg): load the 16-byte block at (RIO) into RAB0 / RCD0
 * with each 32-bit word byte-swapped, then XOR RAB0 with the subkey at
 * index maxreg (a register, scaled by 8).
 */
#define dec_inpack(maxreg) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX, maxreg, 8), RAB0;

/*
 * dec_outunpack(): XOR subkey 0 into RCD0, undo the inpack byte order and
 * store RCD0 to bytes 0-7 and RAB0 to bytes 8-15 of (RIO).
 */
#define dec_outunpack() \
	xorq key_table(CTX), RCD0;	/* subkey 0 */ \
	rorq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO);
190
.global __camellia_enc_blk;
.type __camellia_enc_blk,@function;

/*
 * __camellia_enc_blk: process one 16-byte block with the encryption
 * subkey order.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool xor (only the low byte is tested); when non-zero the
 *	      result is XORed into dst instead of being stored
 *
 * Uses %rax, %rcx, %rsi and %r8-%r11 as scratch.  %rbp is also used as a
 * temporary (RT1) but is restored before returning.  No stack is used.
 *
 * "max" is the 8-byte index of the last subkey: 24 when key_length is 16,
 * otherwise 32 (one extra fls + six rounds are run in that case).
 *
 * NOTE(review): key_length is tested with a byte compare here while the
 * decrypt entry points use cmpl on the same field; the two agree on x86
 * as long as the stored value is below 256 - confirm the field width.
 */
__camellia_enc_blk:
	movq %rbp, RRBP;	/* RT1 is %rbp: keep the caller value */

	movq %rcx, RXOR;	/* RCD0 is %rcx */
	movq %rsi, RDST;	/* RIO and RT0 are %rsi */
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	cmpb $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* mov leaves the flags from testb intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RRBP, %rbp;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RRBP, %rbp;
	ret;
.size __camellia_enc_blk, .-__camellia_enc_blk;
239
.global camellia_dec_blk;
.type camellia_dec_blk,@function;

/*
 * camellia_dec_blk: process one 16-byte block with the decryption
 * (reversed) subkey order and store the result to dst.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *
 * Uses %rax, %rcx, %rsi and %r8-%r11 as scratch.  %rbp is also used as a
 * temporary (RT1) but is restored before returning.  No stack is used.
 *
 * "max" is the 8-byte index of the subkey XORed in first: 24 when
 * key_length is 16, otherwise 32.  With max == 24 the rounds that use
 * subkeys 24 and up are skipped.
 */
camellia_dec_blk:
	cmpl $16, key_length(CTX);
	movl $32, RT2d;		/* the movl pair does not touch the flags */
	movl $24, RXORd;	/* RXOR is free here: no xor argument */
	cmovel RXORd, RT2d; /* max */

	movq %rbp, RRBP;	/* RT1 is %rbp: keep the caller value */
	movq %rsi, RDST;	/* RIO and RT0 are %rsi */
	movq %rdx, RIO;

	dec_inpack(RT2);

	cmpb $24, RT2bl;	/* must precede the rounds: they clobber RT2 */
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RRBP, %rbp;
	ret;
.size camellia_dec_blk, .-camellia_dec_blk;
279
/**********************************************************************
  2-way camellia
 **********************************************************************/

/*
 * roundsm2(src, subkey, dst): one round on block 0 and block 1, with the
 * two instruction streams interleaved.
 *
 * Block 1: the subkey is XORed into dst ## 1 up front and all eight table
 * lookups on src ## 1 accumulate directly into dst ## 1.
 *
 * Block 0: same scheme as the 1-way round - lookups on src ## 0 alternate
 * between dst ## 0 and RT2 (which starts as the subkey), and RT2 is folded
 * into dst ## 0 afterwards.  That fold is placed after the first block 1
 * step; RT2 is not modified by the block 1 steps.
 *
 * Clobbers RT0, RT1 and RT2.
 */
#define roundsm2(src, subkey, dst) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	xorq RT2, dst ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, src ## 0, dst ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, src ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, src ## 0, dst ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, src ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, src ## 1, dst ## 1); \
		xorq RT2, dst ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, src ## 1, dst ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, src ## 1, dst ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, src ## 1, dst ## 1);
297
/*
 * fls2(left, right, kl, kr): the 1-way fls mixing applied to both blocks.
 *
 * For each block n in {0, 1}, with .lo / .hi the low / high 32 bits:
 *
 *   left_n.hi  ^= rol32(left_n.lo  & kl.lo, 1);
 *   right_n.lo ^= right_n.hi | kr.hi;
 *   left_n.lo  ^= left_n.hi  | kl.hi;
 *   right_n.hi ^= rol32(right_n.lo & kr.lo, 1);
 *
 * The first two steps are done for block 0, then block 1, followed by the
 * last two steps for block 0, then block 1.  The temporaries rotate
 * through RT0, RT1 and RT2; all three are clobbered.
 */
#define fls2(left, right, kl, kr) \
	/* block 0, steps 1-2 */ \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl left ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, left ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq right ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, right ## 0; \
	\
		/* block 1, steps 1-2 */ \
		movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
		andl left ## 1d, RT2d; \
		roll $1, RT2d; \
		shlq $32, RT2; \
		xorq RT2, left ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
		orq right ## 1, RT0; \
		shrq $32, RT0; \
		xorq RT0, right ## 1; \
	\
	/* block 0, steps 3-4 */ \
	movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
	orq left ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, left ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
	andl right ## 0d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, right ## 0; \
	\
		/* block 1, steps 3-4 */ \
		movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
		orq left ## 1, RT0; \
		shrq $32, RT0; \
		xorq RT0, left ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
		andl right ## 1d, RT1d; \
		roll $1, RT1d; \
		shlq $32, RT1; \
		xorq RT1, right ## 1;
338
/*
 * enc_rounds2(base): six 2-way rounds using subkeys base + 2 .. base + 7
 * in ascending order, alternating which half is read and which is updated.
 */
#define enc_rounds2(base) \
	roundsm2(RAB, base + 2, RCD); \
	roundsm2(RCD, base + 3, RAB); \
	roundsm2(RAB, base + 4, RCD); \
	roundsm2(RCD, base + 5, RAB); \
	roundsm2(RAB, base + 6, RCD); \
	roundsm2(RCD, base + 7, RAB);

/* enc_fls2(base): fls2 with kl = subkey base, kr = subkey base + 1. */
#define enc_fls2(base) \
	fls2(RAB, RCD, base + 0, base + 1);

/*
 * enc_inpack2(): load two consecutive 16-byte blocks from (RIO).
 * Bytes 0-15 go to RAB0 / RCD0 and bytes 16-31 to RAB1 / RCD1, each 32-bit
 * word byte-swapped in place (bswapq plus a rotate by 32).  Subkey 0 is
 * XORed into RAB0 and RAB1.
 */
#define enc_inpack2() \
	/* block 0 */ \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX), RAB0; \
	\
		/* block 1 */ \
		movq 8*2(RIO), RAB1; \
		bswapq RAB1; \
		rorq $32, RAB1; \
		movq 12*2(RIO), RCD1; \
		bswapq RCD1; \
		rolq $32, RCD1; \
		xorq key_table(CTX), RAB1;

/*
 * enc_outunpack2(op, maxreg): for each block, XOR the subkey at index
 * maxreg (a register, scaled by 8) into the CD half, undo the inpack byte
 * order and emit CD first, AB second.  Block 0 lands at bytes 0-15 and
 * block 1 at bytes 16-31 of (RIO).  op is pasted with q to form the
 * instruction: mov stores, xor XORs into memory.
 */
#define enc_outunpack2(op, maxreg) \
	/* block 0 */ \
	xorq key_table(CTX, maxreg, 8), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO); \
	\
		/* block 1 */ \
		xorq key_table(CTX, maxreg, 8), RCD1; \
		rolq $32, RCD1; \
		bswapq RCD1; \
		op ## q RCD1, 8*2(RIO); \
		rorq $32, RAB1; \
		bswapq RAB1; \
		op ## q RAB1, 12*2(RIO);
383
/*
 * dec_rounds2(base): six 2-way rounds using subkeys base + 7 .. base + 2
 * in descending order, i.e. the subkey order of enc_rounds2 reversed.
 */
#define dec_rounds2(base) \
	roundsm2(RAB, base + 7, RCD); \
	roundsm2(RCD, base + 6, RAB); \
	roundsm2(RAB, base + 5, RCD); \
	roundsm2(RCD, base + 4, RAB); \
	roundsm2(RAB, base + 3, RCD); \
	roundsm2(RCD, base + 2, RAB);

/* dec_fls2(base): fls2 with kl = subkey base + 1, kr = subkey base. */
#define dec_fls2(base) \
	fls2(RAB, RCD, base + 1, base + 0);

/*
 * dec_inpack2(maxreg): load two consecutive 16-byte blocks from (RIO)
 * into RAB0 / RCD0 and RAB1 / RCD1 with each 32-bit word byte-swapped,
 * then XOR the subkey at index maxreg (a register, scaled by 8) into
 * RAB0 and RAB1.
 */
#define dec_inpack2(maxreg) \
	/* block 0 */ \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX, maxreg, 8), RAB0; \
	\
		/* block 1 */ \
		movq 8*2(RIO), RAB1; \
		bswapq RAB1; \
		rorq $32, RAB1; \
		movq 12*2(RIO), RCD1; \
		bswapq RCD1; \
		rolq $32, RCD1; \
		xorq key_table(CTX, maxreg, 8), RAB1;

/*
 * dec_outunpack2(): for each block, XOR subkey 0 into the CD half, undo
 * the inpack byte order and store CD first, AB second.  Block 0 lands at
 * bytes 0-15 and block 1 at bytes 16-31 of (RIO).
 */
#define dec_outunpack2() \
	/* block 0 */ \
	xorq key_table(CTX), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO); \
	\
		/* block 1 */ \
		xorq key_table(CTX), RCD1; \
		rolq $32, RCD1; \
		bswapq RCD1; \
		movq RCD1, 8*2(RIO); \
		rorq $32, RAB1; \
		bswapq RAB1; \
		movq RAB1, 12*2(RIO);
428
.global __camellia_enc_blk_2way;
.type __camellia_enc_blk_2way,@function;

/*
 * __camellia_enc_blk_2way: process two consecutive 16-byte blocks (32
 * bytes at src, 32 bytes at dst) with the encryption subkey order.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool xor (only the low byte is tested); when non-zero the
 *	      result is XORed into dst instead of being stored
 *
 * Uses %rax, %rcx, %rdx, %rsi and %r8-%r11 as scratch.  %rbx (RAB1) is
 * saved on the stack and %rbp (RT1) in RRBP; both are restored before
 * returning.
 *
 * "max" is the 8-byte index of the last subkey: 24 when key_length is 16,
 * otherwise 32 (one extra fls2 + six rounds are run in that case).
 *
 * NOTE(review): key_length is tested with a byte compare here while the
 * decrypt entry points use cmpl on the same field; the two agree on x86
 * as long as the stored value is below 256 - confirm the field width.
 */
__camellia_enc_blk_2way:
	pushq %rbx;		/* RAB1 is %rbx; RXOR is taken by the flag */

	movq %rbp, RRBP;	/* RT1 is %rbp: keep the caller value */
	movq %rcx, RXOR;	/* RCD0 is %rcx */
	movq %rsi, RDST;	/* RIO and RT0 are %rsi */
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	cmpb $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* mov leaves the flags from testb intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;
.size __camellia_enc_blk_2way, .-__camellia_enc_blk_2way;
479
.global camellia_dec_blk_2way;
.type camellia_dec_blk_2way,@function;

/*
 * camellia_dec_blk_2way: process two consecutive 16-byte blocks (32 bytes
 * at src, 32 bytes at dst) with the decryption (reversed) subkey order and
 * store the result to dst.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *
 * Uses %rax, %rcx, %rdx, %rsi and %r8-%r11 as scratch.  %rbx (RAB1) is
 * kept in RXOR and %rbp (RT1) in RRBP; both are restored before returning.
 * No stack is used.
 *
 * "max" is the 8-byte index of the subkey XORed in first: 24 when
 * key_length is 16, otherwise 32.  With max == 24 the rounds that use
 * subkeys 24 and up are skipped.
 */
camellia_dec_blk_2way:
	cmpl $16, key_length(CTX);
	movl $32, RT2d;		/* the movl pair does not touch the flags */
	movl $24, RXORd;	/* RXOR is a plain temporary at this point */
	cmovel RXORd, RT2d; /* max */

	movq %rbx, RXOR;	/* no xor argument here, so RXOR holds %rbx */
	movq %rbp, RRBP;	/* RT1 is %rbp: keep the caller value */
	movq %rsi, RDST;	/* RIO and RT0 are %rsi */
	movq %rdx, RIO;

	dec_inpack2(RT2);

	cmpb $24, RT2bl;	/* must precede the rounds: they clobber RT2 */
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RRBP, %rbp;
	movq RXOR, %rbx;
	ret;
.size camellia_dec_blk_2way, .-camellia_dec_blk_2way;
520 ret;