/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

/* ENTRY()/ENDPROC() used below are presumably provided by this header. */
#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * Structure of the crypto context: byte offsets relative to CTX.
 *
 * p starts at offset 0.  s0 starts after (16 + 2) 4-byte entries, and
 * s1..s3 each start 256 4-byte entries after the previous table.  The
 * s-tables are indexed in F()/F4() with a byte value scaled by 4.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * Register macros.
 *
 * CTX and RIO are the first two argument registers.  RT1 is the SAME
 * register as RIO (%rsi): F() and F4() overwrite it, so every entry point
 * copies dst into a spare register first and reloads RIO before storing
 * the result.
 */
#define CTX %rdi
#define RIO %rsi

/* Block registers: the 1-way code uses RX0 only, the 4-way code RX0..RX3. */
#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

/* 32-bit views of the block registers */
#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

/* Bits 0..7 of the block registers */
#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

/* Bits 8..15 of the block registers */
#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

/*
 * Temporaries.  RT0 (%rbp) and RX1 (%rbx) are callee-saved in the
 * System V AMD64 ABI, so the entry points save and restore them.
 *
 * NOTE(review): %rbp holds table data while a block is being processed;
 * frame-pointer based unwinding through these functions presumably does
 * not work during that window - confirm whether that matters here.
 */
#define RT0 %rbp
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

#define RT0d %ebp
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/*
 * Preloaded round key, used by the 4-way macros only.  The 1-way entry
 * points use %r10 to hold dst instead.
 */
#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/
/*
 * F(): one Feistel step on the block held in RX0.
 *
 * The four bytes of the low 32 bits of RX0 index s0..s3 (bits 24..31 go
 * to s0, bits 0..7 to s3).  The lookups are combined in 32-bit arithmetic
 * as ((s0 + s1) ^ s2) + s3.  The two 32-bit halves of RX0 are then swapped
 * and the result is XORed into the new low half; the upper 32 bits of RT0
 * are zero after the 32-bit load, so the new high half is left untouched.
 *
 * Clobbers: RT0, RT1, RT2, flags.
 */
#define F() \
	rorq $16, RX0;			/* bits 16..31 -> RX0bh/RX0bl */ \
	movzbl RX0bh, RT0d; \
	movzbl RX0bl, RT1d; \
	rolq $16, RX0;			/* undo the rotate */ \
	movl s0(CTX,RT0,4), RT0d; \
	addl s1(CTX,RT1,4), RT0d; \
	movzbl RX0bh, RT1d; \
	movzbl RX0bl, RT2d; \
	rolq $32, RX0;			/* swap the 32-bit halves */ \
	xorl s2(CTX,RT1,4), RT0d; \
	addl s3(CTX,RT2,4), RT0d; \
	xorq RT0, RX0;

/*
 * XOR the 8 bytes at p + 4*(n), i.e. 4-byte entries n and n+1 of p,
 * into RX0.
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX), RX0;

/*
 * Encryption double round: one 64-bit key addition covering entries n and
 * n+1 of p, followed by two F() steps.  Clobbers whatever F() clobbers.
 */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Decryption key addition: load the 8 bytes at p + 4*((n)-1), i.e. 4-byte
 * entries n-1 and n of p, swap the two 32-bit halves and XOR the result
 * into RX0.
 *
 * The argument is parenthesised (as in preload_roundkey_dec) so that an
 * expression argument cannot change the offset arithmetic.
 *
 * Clobbers: RT0, flags.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX), RT0; \
	rorq $32, RT0; \
	xorq RT0, RX0;

/*
 * Decryption double round: one 64-bit key addition covering entries n-1
 * and n of p (halves swapped), followed by two F() steps.
 *
 * The last line deliberately has no trailing backslash, so the macro
 * body cannot absorb whatever line follows the definition.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * Load one 8-byte block from (RIO) into RX0.
 *
 * After the rotate and the full byte swap, the low 32 bits of RX0 hold the
 * first four bytes read as a big-endian word and the high 32 bits hold the
 * next four bytes read as a big-endian word.
 */
#define read_block() \
	movq (RIO), RX0; \
	rorq $32, RX0; \
	bswapq RX0;

/*
 * Store RX0 to (RIO).  bswapq reverses all eight bytes, so the high 32
 * bits of RX0 land big-endian at offset 0 and the low 32 bits big-endian
 * at offset 4.  RX0 is left byte-swapped.
 */
#define write_block() \
	bswapq RX0; \
	movq RX0, (RIO);

/*
 * Same byte order as write_block(), but XOR the 8 bytes into the memory
 * at (RIO) instead of overwriting it.  RX0 is left byte-swapped.
 */
#define xor_block() \
	bswapq RX0; \
	xorq RX0, (RIO);

ENTRY(__blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * Encrypts one 8-byte block from src and either stores it to dst or
	 * XORs it into dst.  Only RX0 (%rax) is used as block register, so
	 * %rcx still holds the flag when it is tested after the rounds.
	 */
	movq %rbp, %r11;	/* RT0 is %rbp: keep the caller's value */

	movq %rsi, %r10;	/* RT1 is %rsi: keep dst for later */
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq %r11, %rbp;	/* RT0 no longer needed */

	movq %r10, RIO;		/* RIO = dst */
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	ret;
.L__enc_xor:
	xor_block();
	ret;
ENDPROC(__blowfish_enc_blk)

ENTRY(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Decrypts one 8-byte block from src and stores it to dst.  The
	 * round keys are applied in descending order, 17 down to 0.
	 */
	movq %rbp, %r11;	/* RT0 is %rbp: keep the caller's value */

	movq %rsi, %r10;	/* RT1 is %rsi: keep dst for later */
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;		/* RIO = dst */
	write_block();

	movq %r11, %rbp;

	ret;
ENDPROC(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/*
 * F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x must be one of RX0..RX3, because the x##bh / x##bl byte views are
 * needed.  All four index bytes of the low 32 bits of x are extracted
 * first; the two 16-bit rotates together swap the 32-bit halves of x.
 * The lookups are combined as ((s0 + s1) ^ s2) + s3 in 32-bit arithmetic
 * and XORed into the new low half of x.
 *
 * Clobbers: RT0, RT1, RT2, RT3, flags.
 */
#define F4(x) \
	movzbl x ## bh, RT1d;		/* bits  8..15 -> s2 index */ \
	movzbl x ## bl, RT3d;		/* bits  0..7  -> s3 index */ \
	rorq $16, x; \
	movzbl x ## bh, RT0d;		/* bits 24..31 -> s0 index */ \
	movzbl x ## bl, RT2d;		/* bits 16..23 -> s1 index */ \
	rorq $16, x; \
	movl s0(CTX,RT0,4), RT0d; \
	addl s1(CTX,RT2,4), RT0d; \
	xorl s2(CTX,RT1,4), RT0d; \
	addl s3(CTX,RT3,4), RT0d; \
	xorq RT0, x;

/* XOR the round key currently held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq RKEY, RX0; \
	xorq RKEY, RX1; \
	xorq RKEY, RX2; \
	xorq RKEY, RX3;

/* Load the 8 bytes at p + 4*(n), i.e. entries n and n+1 of p, into RKEY. */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX), RKEY;

/*
 * Apply the round key already sitting in RKEY to RX0..RX3, then preload
 * the key for round n + 2.  n itself is not used for the addition: the
 * key for round n must have been preloaded beforehand.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/*
 * 4-way encryption double round: key addition on all four blocks (which
 * also preloads the next key), then two passes of F4() over RX0..RX3.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Load the 8 bytes at p + 4*((n)-1), i.e. entries n-1 and n of p, into
 * RKEY and swap its two 32-bit halves.  Clobbers flags.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX), RKEY; \
	rorq $32, RKEY;

/*
 * Apply the round key already sitting in RKEY to RX0..RX3, then preload
 * the key for round n - 2.  n itself is not used for the addition: the
 * key for round n must have been preloaded beforehand.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/*
 * 4-way decryption double round: key addition on all four blocks (which
 * also preloads the next key), then two passes of F4() over RX0..RX3.
 */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3.  Each
 * block gets the same rotate + byte swap as in read_block(): the first
 * four bytes of a block end up, read big-endian, in the low half of its
 * register and the next four bytes in the high half.
 */
#define read_block4() \
	movq (RIO), RX0; \
	rorq $32, RX0; \
	bswapq RX0; \
	\
	movq 8(RIO), RX1; \
	rorq $32, RX1; \
	bswapq RX1; \
	\
	movq 16(RIO), RX2; \
	rorq $32, RX2; \
	bswapq RX2; \
	\
	movq 24(RIO), RX3; \
	rorq $32, RX3; \
	bswapq RX3;

/*
 * Byte-swap RX0..RX3 and store them as four consecutive 8-byte blocks at
 * (RIO).  The registers are left byte-swapped.
 */
#define write_block4() \
	bswapq RX0; \
	movq RX0, (RIO); \
	\
	bswapq RX1; \
	movq RX1, 8(RIO); \
	\
	bswapq RX2; \
	movq RX2, 16(RIO); \
	\
	bswapq RX3; \
	movq RX3, 24(RIO);

/*
 * Byte-swap RX0..RX3 and XOR them into the four consecutive 8-byte blocks
 * at (RIO).  The registers are left byte-swapped.
 */
#define xor_block4() \
	bswapq RX0; \
	xorq RX0, (RIO); \
	\
	bswapq RX1; \
	xorq RX1, 8(RIO); \
	\
	bswapq RX2; \
	xorq RX2, 16(RIO); \
	\
	bswapq RX3; \
	xorq RX3, 24(RIO);

ENTRY(__blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * Encrypts four consecutive 8-byte blocks from src and either stores
	 * them to dst or XORs them into dst.
	 */
	pushq %rbp;		/* RT0 */
	pushq %rbx;		/* RX1 */
	pushq %rcx;		/* xor flag: %rcx is reused as RX2 below */

	preload_roundkey_enc(0);

	movq %rsi, %r11;	/* RT1 is %rsi: keep dst for later */
	movq %rdx, RIO;

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	popq %rbp;		/* the saved xor flag; RT0 is free again */
	movq %r11, RIO;		/* RIO = dst */

	test %bpl, %bpl;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %rbp;
	ret;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %rbp;
	ret;
ENDPROC(__blowfish_enc_blk_4way)

ENTRY(blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Decrypts four consecutive 8-byte blocks from src and stores them to
	 * dst.  The round keys are applied in descending order.
	 */
	pushq %rbp;		/* RT0 */
	pushq %rbx;		/* RX1 */
	preload_roundkey_dec(17);

	movq %rsi, %r11;	/* RT1 is %rsi: keep dst for later */
	movq %rdx, RIO;

	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;		/* RIO = dst */
	write_block4();

	popq %rbx;
	popq %rbp;

	ret;
ENDPROC(blowfish_dec_blk_4way)