blob: 391d245dc0867f94ae37184ffb3e6f66619e1e84 [file] [log] [blame]
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */
22
.file "blowfish-x86_64-asm.S"
.text

/*
 * Byte offsets into the crypto context that CTX points to.
 *
 *   p       : 16 + 2 round-key words of 4 bytes each, starting at offset 0
 *   s0..s3  : four lookup tables of 256 4-byte words each, laid out back to
 *             back directly after p
 *
 * The tables are always indexed as sN(CTX,index,4), so these are plain
 * byte displacements.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	(s0 + (1 * 256 * 4))
#define s2	(s0 + (2 * 256 * 4))
#define s3	(s0 + (3 * 256 * 4))
32
/*
 * Register allocation.
 *
 * CTX   context pointer (first argument register, never modified)
 * RIO   pointer used by the read/write/xor block macros
 * RXn   64-bit block registers; the d/bl/bh suffixes name the 32-bit,
 *       low-byte and second-byte views of the same register
 * RTn   temporaries for the table lookups
 * RKEY  preloaded round key (used by the 4-way code only)
 *
 * Things to keep in mind when touching this table:
 *  - RIO and RT1 are both %rsi.  F() and F4() therefore destroy RIO; the
 *    entry points stash the destination pointer in another register and
 *    reload RIO before producing output.
 *  - RT0 is %rbp and RX1 is %rbx, both callee-saved in the SysV AMD64 ABI;
 *    every entry point that uses them saves and restores them.
 *  - The bh views (%ah, %bh, %ch, %dh) cannot be encoded in an instruction
 *    that also names %r8-%r15, so they must only be combined with RT0d/RT1d.
 */
#define CTX	%rdi
#define RIO	%rsi

/* block register 0 */
#define RX0	%rax
#define RX0d	%eax
#define RX0bl	%al
#define RX0bh	%ah

/* block register 1 */
#define RX1	%rbx
#define RX1d	%ebx
#define RX1bl	%bl
#define RX1bh	%bh

/* block register 2 */
#define RX2	%rcx
#define RX2d	%ecx
#define RX2bl	%cl
#define RX2bh	%ch

/* block register 3 */
#define RX3	%rdx
#define RX3d	%edx
#define RX3bl	%dl
#define RX3bh	%dh

/* temporaries */
#define RT0	%rbp
#define RT0d	%ebp
#define RT1	%rsi
#define RT1d	%esi
#define RT2	%r8
#define RT2d	%r8d
#define RT3	%r9
#define RT3d	%r9d

/* preloaded round key */
#define RKEY	%r10
Jussi Kivilinna64b94ce2011-09-02 01:45:22 +030068
/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one Feistel step on the block in RX0.
 *
 * With a, b, c, d being the four bytes of the low 32 bits of RX0 (a = most
 * significant), this computes
 *
 *	t = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]		(32-bit arithmetic)
 *
 * swaps the two 32-bit halves of RX0 and XORs t into the new low half.
 * The net rotation is -16 + 16 + 32 = 32 bits, so calling F() twice puts
 * the halves back where they started.
 *
 * Every write to RT0d is a 32-bit operation, which leaves the upper half
 * of RT0 zero; the final 64-bit xorq therefore only changes the low half
 * of RX0.
 *
 * Clobbers: RT0, RT1 (which is also RIO), RT2, flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;
Jussi Kivilinna64b94ce2011-09-02 01:45:22 +030085
/*
 * XOR the 8 bytes at p + 4*n, i.e. round-key words n (low half) and
 * n + 1 (high half), into RX0.
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/*
 * Encryption double round: add key words n and n + 1 in one go, then run
 * two F() steps.  After the second F() the halves of RX0 are back in their
 * original positions, ready for the next round_enc().
 */
#define round_enc(n) \
	add_roundkey_enc(n); \
	F(); \
	F();
Jussi Kivilinna64b94ce2011-09-02 01:45:22 +030094
/*
 * XOR round-key words n (into the low half of RX0) and n - 1 (into the
 * high half).  The 8 bytes at p + 4*(n - 1) are loaded with word n - 1 in
 * the low half, so they are rotated by 32 bits before being applied.
 *
 * The argument is parenthesised, as in preload_roundkey_dec(), so that an
 * expression passed as n cannot be torn apart by operator precedence.
 *
 * Clobbers: RT0, flags.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/*
 * Decryption double round: add key words n and n - 1, then run two F()
 * steps.  The last line deliberately has no trailing backslash, so the
 * macro ends here regardless of what follows it in the file.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
Jussi Kivilinna64b94ce2011-09-02 01:45:22 +0300105
/*
 * Load one 8-byte block from (RIO) into RX0.  The rotate-by-32 followed by
 * the 64-bit byte swap reverses the bytes of each 32-bit word on its own:
 * the first word in memory ends up, byte-swapped, in the low half of RX0
 * and the second word in the high half.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/*
 * Byte-swap RX0 and store it to (RIO).  Unlike read_block() there is no
 * rotate, so the high half of RX0 becomes the first word in memory.
 */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/*
 * Same as write_block(), but XOR the result into the 8 bytes already at
 * (RIO) instead of overwriting them.
 */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);
118
.align 8
.global __blowfish_enc_blk
.type   __blowfish_enc_blk,@function;

/*
 * __blowfish_enc_blk: encrypt one 8-byte block.
 *
 * Reads the block at src, runs eight round_enc() double rounds plus the
 * final key addition, then either stores the result to dst or, when the
 * low byte of %rcx is non-zero, XORs it into the 8 bytes already at dst.
 *
 * Uses no stack.  %rbp is preserved (parked in %r11).
 * Clobbers: %rax, %rsi, %r8, %r10, %r11, flags.
 */
__blowfish_enc_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	movq %rbp, %r11;		/* RT0 is %rbp: keep the caller's value */

	movq %rsi, %r10;		/* F() clobbers %rsi (RT1): save dst */
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq %r11, %rbp;

	movq %r10, RIO;			/* RIO = dst */
	testb %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	ret;

	/* .L prefix: assembler-local label, not emitted into the symbol table */
.L__enc_xor:
	xor_block();
	ret;

.size __blowfish_enc_blk, .-__blowfish_enc_blk
Jussi Kivilinna64b94ce2011-09-02 01:45:22 +0300158
.align 8
.global blowfish_dec_blk
.type   blowfish_dec_blk,@function;

/*
 * blowfish_dec_blk: decrypt one 8-byte block.
 *
 * Reads the block at src, runs eight round_dec() double rounds (key words
 * 17 down to 2) plus the final key addition (words 1 and 0), and stores
 * the result to dst.
 *
 * Uses no stack.  %rbp is preserved (parked in %r11).
 * Clobbers: %rax, %rsi, %r8, %r10, %r11, flags.
 */
blowfish_dec_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %rbp, %r11;		/* RT0 is %rbp: keep the caller's value */

	movq %rsi, %r10;		/* F() clobbers %rsi (RT1): save dst */
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;			/* RIO = dst */
	write_block();

	movq %r11, %rbp;

	ret;

.size blowfish_dec_blk, .-blowfish_dec_blk
192
/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/*
 * F4(x): one Feistel step on block register x (pass RX0..RX3 by macro
 * name; the bl/bh views are formed by token pasting).
 *
 * Produces the same result as the 1-way F(): with a, b, c, d the bytes of
 * the low 32 bits of x (a = most significant),
 *
 *	t = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]		(32-bit arithmetic)
 *
 * the halves of x are swapped (two rotates by 16) and t is XORed into the
 * new low half.  All four index bytes are extracted before any table is
 * read, which costs one more temporary (RT3) than F().
 *
 * Per the original author: slower when used alone/1-way, but faster when
 * used parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * Clobbers: RT0, RT1 (which is also RIO), RT2, RT3, flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;
212
/* XOR the round key currently held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* One F4() step on each of the four blocks, in register order. */
#define F4_all() \
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* RKEY = the 8 bytes at p + 4*n: key word n in the low half, n + 1 high. */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * Apply the key that is already in RKEY (the caller must have preloaded
 * the words for round n), then fetch the words for round n + 2.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/* Encryption double round on four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4_all(); \
	F4_all();

/*
 * RKEY = the 8 bytes at p + 4*(n - 1), rotated by 32 bits so that key
 * word n sits in the low half and word n - 1 in the high half.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/*
 * Apply the key that is already in RKEY (the caller must have preloaded
 * the words for round n), then fetch the words for round n - 2.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/* Decryption double round on four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4_all(); \
	F4_all();
Jussi Kivilinna64b94ce2011-09-02 01:45:22 +0300259
/*
 * Per-block helpers.  `mem` is a complete memory operand such as (RIO) or
 * 8(RIO); `x` is the block register.
 *
 * load_block_be:  load 8 bytes, rotate by 32 and byte-swap, which reverses
 *                 the bytes of each 32-bit word on its own.
 * store_block_be: byte-swap x and store it.
 * xor_block_be:   byte-swap x and XOR it into memory.
 */
#define load_block_be(mem, x) \
	movq mem,		x; \
	rorq $32,		x; \
	bswapq			x;

#define store_block_be(mem, x) \
	bswapq			x; \
	movq x,			mem;

#define xor_block_be(mem, x) \
	bswapq			x; \
	xorq x,			mem;

/* Load four consecutive 8-byte blocks from RIO into RX0..RX3. */
#define read_block4() \
	load_block_be((RIO), RX0) \
	\
	load_block_be(8(RIO), RX1) \
	\
	load_block_be(16(RIO), RX2) \
	\
	load_block_be(24(RIO), RX3)

/* Store RX0..RX3 as four consecutive 8-byte blocks at RIO. */
#define write_block4() \
	store_block_be((RIO), RX0) \
	\
	store_block_be(8(RIO), RX1) \
	\
	store_block_be(16(RIO), RX2) \
	\
	store_block_be(24(RIO), RX3)

/* XOR RX0..RX3 into the four consecutive 8-byte blocks at RIO. */
#define xor_block4() \
	xor_block_be((RIO), RX0) \
	\
	xor_block_be(8(RIO), RX1) \
	\
	xor_block_be(16(RIO), RX2) \
	\
	xor_block_be(24(RIO), RX3)
302
.align 8
.global __blowfish_enc_blk_4way
.type   __blowfish_enc_blk_4way,@function;

/*
 * __blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks.
 *
 * Reads 32 bytes at src, runs eight round_enc4() double rounds plus the
 * final key addition on all four blocks, then either stores the results to
 * dst or, when the low byte of %rcx is non-zero, XORs them into the 32
 * bytes already at dst.
 *
 * %rcx doubles as block register RX2, so the xor flag is kept on the stack
 * during the rounds and popped into %rbp afterwards.
 *
 * Stack: three 8-byte pushes, all popped before return.
 * %rbx and %rbp are preserved.
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
 */
__blowfish_enc_blk_4way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	pushq %rbp;
	pushq %rbx;
	pushq %rcx;			/* xor flag; %rcx becomes RX2 below */

	preload_roundkey_enc(0);

	movq %rsi, %r11;		/* F4() clobbers %rsi (RT1): save dst */
	movq %rdx, RIO;

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	popq %rbp;			/* %rbp = saved xor flag (was %rcx) */
	movq %r11, RIO;			/* RIO = dst */

	testb %bpl, %bpl;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %rbp;
	ret;

	/* .L prefix: assembler-local label, not emitted into the symbol table */
.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %rbp;
	ret;

.size __blowfish_enc_blk_4way, .-__blowfish_enc_blk_4way
Jussi Kivilinna64b94ce2011-09-02 01:45:22 +0300353
.align 8
.global blowfish_dec_blk_4way
.type   blowfish_dec_blk_4way,@function;

/*
 * blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks.
 *
 * Reads 32 bytes at src, runs eight round_dec4() double rounds (key words
 * 17 down to 2) plus the final key addition (words 1 and 0) on all four
 * blocks, and stores the results to dst.
 *
 * Stack: two 8-byte pushes, both popped before return.
 * %rbx and %rbp are preserved.
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
 */
blowfish_dec_blk_4way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %rbp;
	pushq %rbx;
	preload_roundkey_dec(17);

	movq %rsi, %r11;		/* F4() clobbers %rsi (RT1): save dst */
	movq %rdx, RIO;

	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;			/* RIO = dst */
	write_block4();

	popq %rbx;
	popq %rbp;

	ret;

.size blowfish_dec_blk_4way, .-blowfish_dec_blk_4way
390