blob: 5dc6bce90a77db2c3021cda19e9e3e0d710f4013 [file] [log] [blame]
Markus Stockhausen1c201e62015-02-22 09:59:49 +01001/*
2 * Fast AES implementation for SPE instruction set (PPC)
3 *
4 * This code makes use of the SPE SIMD instruction set as defined in
5 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
6 * Implementation is based on optimization guide notes from
7 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
8 *
9 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
10 *
11 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the Free
13 * Software Foundation; either version 2 of the License, or (at your option)
14 * any later version.
15 *
16 */
17
18#include <asm/ppc_asm.h>
19#include "aes-spe-regs.h"
20
/*
 * EAD: calculate the address of a 16 byte wide table entry in rT0.
 * One byte of "in" is rotated into place and inserted into bits 20-27 of
 * rT0 (mask 0x00000ff0), i.e. rT0 = (rT0 & ~0xff0) | (byte << 4). All other
 * bits of rT0 are kept. bpos selects the byte of the 32 bit word:
 * 0 = least significant byte ... 3 = most significant byte.
 * NOTE(review): this presumably requires the table base in rT0 to have
 * bits 20-27 clear (e.g. 4K alignment) - confirm against the caller.
 */
21#define EAD(in, bpos) \
22 rlwimi rT0,in,28-((bpos+3)%4)*8,20,27;
23
/*
 * DAD: same byte selection as EAD, but the byte is inserted unscaled into
 * bits 24-31 of rT1 (mask 0x000000ff), i.e. rT1 addresses single bytes.
 * NOTE(review): presumably the table behind rT1 is 256 byte aligned -
 * confirm against the caller.
 */
24#define DAD(in, bpos) \
25 rlwimi rT1,in,24-((bpos+3)%4)*8,24,31;
26
/*
 * LWH: load the word at off(rT0) with evlwwsplat, which places it into both
 * the upper and the lower word of the 64 bit register "out".
 */
27#define LWH(out, off) \
28 evlwwsplat out,off(rT0); /* load word high */
29
/*
 * LWL: load the word at off(rT0) with a plain lwz into the lower word of
 * "out". The block functions issue LWH and LWL on the same register to
 * collect two different table words in one 64 bit register.
 * NOTE(review): this relies on lwz leaving the upper word written by LWH
 * untouched - confirm against the SPE documentation.
 */
30#define LWL(out, off) \
31 lwz out,off(rT0); /* load word low */
32
/*
 * LBZ: load the single byte at off(tab) into "out" (lbz zero-extends).
 */
33#define LBZ(out, tab, off) \
34 lbz out,off(tab); /* load byte */
35
/*
 * Combined "calculate address, then load" helpers. bpos picks the byte of
 * "in" that is used as table index (0 = least significant byte), off is the
 * byte offset inside the addressed table entry.
 *
 * LAH: index rT0 with a byte of "in", then load a word into both halves
 *      of "out".
 */
36#define LAH(out, in, bpos, off) \
37 EAD(in, bpos) /* calc addr + load word high */ \
38 LWH(out, off)
39
/*
 * LAL: index rT0 with a byte of "in", then load a word into the lower word
 *      of "out".
 */
40#define LAL(out, in, bpos, off) \
41 EAD(in, bpos) /* calc addr + load word low */ \
42 LWL(out, off)
43
/*
 * LAE: index rT0 with a byte of "in", then load the single byte stored at
 *      offset 8 of that table entry (used by the last encryption round).
 */
44#define LAE(out, in, bpos) \
45 EAD(in, bpos) /* calc addr + load enc byte */ \
46 LBZ(out, rT0, 8)
47
/*
 * LBE: load the byte at offset 8 of the entry rT0 currently points to;
 *      the address must have been set up by a preceding EAD.
 */
48#define LBE(out) \
49 LBZ(out, rT0, 8) /* load enc byte */
50
/*
 * LAD: index rT1 with a byte of "in", then load the byte rT1 points to
 *      (used by the last decryption round).
 */
51#define LAD(out, in, bpos) \
52 DAD(in, bpos) /* calc addr + load dec byte */ \
53 LBZ(out, rT1, 0)
54
/*
 * LBD: load the byte rT1 currently points to; the address must have been
 *      set up by a preceding DAD.
 */
55#define LBD(out) \
56 LBZ(out, rT1, 0)
57
58/*
59 * ppc_encrypt_block: The central encryption function for a single 16 bytes
60 * block. It does no stack handling or register saving to support fast calls
61 * via bl/blr. It expects that caller has pre-xored input data with first
62 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
63 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
64 * and rW0-rW3 and caller must execute a final xor on the output registers.
65 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
66 *
67 */
68_GLOBAL(ppc_encrypt_block)
/*
 * Prologue: start the first round by fetching three table words into
 * rW4, rW6 and rW3 (LAH). Each round inside the loop ends by issuing the
 * same three loads for its successor, so the round code itself only adds
 * the lower words (LAL) for these registers.
 */
69 LAH(rW4, rD1, 2, 4)
70 LAH(rW6, rD0, 3, 0)
71 LAH(rW3, rD0, 1, 8)
/*
 * Main loop, repeated via CTR (bdnz). The body holds two rounds: the first
 * takes its key material from 16(rKP)/24(rKP), the second from
 * 32(rKP)/40(rKP); afterwards rKP is advanced by 32 bytes.
 */
72ppc_encrypt_block_loop:
/*
 * Round A lookups: bytes of rD0-rD3 index the table behind rT0. Every rWn
 * collects two table words, one via LAH and one via LAL.
 */
73 LAH(rW0, rD3, 0, 12)
74 LAL(rW0, rD0, 0, 12)
75 LAH(rW1, rD1, 0, 12)
76 LAH(rW2, rD2, 1, 8)
77 LAL(rW2, rD3, 1, 8)
78 LAL(rW3, rD1, 1, 8)
79 LAL(rW4, rD2, 2, 4)
80 LAL(rW6, rD1, 3, 0)
81 LAH(rW5, rD3, 2, 4)
82 LAL(rW5, rD0, 2, 4)
83 LAH(rW7, rD2, 3, 0)
/*
 * Round A combine (64 bit wide):
 *   rD1 = key at 16(rKP) ^ rW0 ^ rW2 ^ rW4 ^ rW6
 *   rD3 = key at 24(rKP) ^ rW1 ^ rW3 ^ rW5 ^ rW7
 * The two lookups still outstanding (lower words of rW7 and rW1) and the
 * first three lookups of the following round (rW4, rW6, rW3) are
 * interleaved with the xors. evmergehi copies the upper word of rD1/rD3
 * into the lower word of rD0/rD2 so it can serve as lookup input.
 */
84 evldw rD1,16(rKP)
85 EAD(rD3, 3)
86 evxor rW2,rW2,rW4
87 LWL(rW7, 0)
88 evxor rW2,rW2,rW6
89 EAD(rD2, 0)
90 evxor rD1,rD1,rW2
91 LWL(rW1, 12)
92 evxor rD1,rD1,rW0
93 evldw rD3,24(rKP)
94 evmergehi rD0,rD0,rD1
95 EAD(rD1, 2)
96 evxor rW3,rW3,rW5
97 LWH(rW4, 4)
98 evxor rW3,rW3,rW7
99 EAD(rD0, 3)
100 evxor rD3,rD3,rW3
101 LWH(rW6, 0)
102 evxor rD3,rD3,rW1
103 EAD(rD0, 1)
104 evmergehi rD2,rD2,rD3
105 LWH(rW3, 8)
/* Round B lookups: same pattern as round A. */
106 LAH(rW0, rD3, 0, 12)
107 LAL(rW0, rD0, 0, 12)
108 LAH(rW1, rD1, 0, 12)
109 LAH(rW2, rD2, 1, 8)
110 LAL(rW2, rD3, 1, 8)
111 LAL(rW3, rD1, 1, 8)
112 LAL(rW4, rD2, 2, 4)
113 LAL(rW6, rD1, 3, 0)
114 LAH(rW5, rD3, 2, 4)
115 LAL(rW5, rD0, 2, 4)
116 LAH(rW7, rD2, 3, 0)
/* Round B combine: as round A, but with the key at 32(rKP)/40(rKP). */
117 evldw rD1,32(rKP)
118 EAD(rD3, 3)
119 evxor rW2,rW2,rW4
120 LWL(rW7, 0)
121 evxor rW2,rW2,rW6
122 EAD(rD2, 0)
123 evxor rD1,rD1,rW2
124 LWL(rW1, 12)
125 evxor rD1,rD1,rW0
126 evldw rD3,40(rKP)
127 evmergehi rD0,rD0,rD1
128 EAD(rD1, 2)
129 evxor rW3,rW3,rW5
130 LWH(rW4, 4)
131 evxor rW3,rW3,rW7
132 EAD(rD0, 3)
133 evxor rD3,rD3,rW3
134 LWH(rW6, 0)
135 evxor rD3,rD3,rW1
136 EAD(rD0, 1)
137 evmergehi rD2,rD2,rD3
138 LWH(rW3, 8)
/* Step to the next pair of round keys and loop while CTR is not zero. */
139 addi rKP,rKP,32
140 bdnz ppc_encrypt_block_loop
/*
 * One more round of word lookups after the loop; it uses the key at
 * 16(rKP)/24(rKP) relative to the already advanced rKP.
 */
141 LAH(rW0, rD3, 0, 12)
142 LAL(rW0, rD0, 0, 12)
143 LAH(rW1, rD1, 0, 12)
144 LAH(rW2, rD2, 1, 8)
145 LAL(rW2, rD3, 1, 8)
146 LAL(rW3, rD1, 1, 8)
147 LAL(rW4, rD2, 2, 4)
148 LAH(rW5, rD3, 2, 4)
149 LAL(rW6, rD1, 3, 0)
150 LAL(rW5, rD0, 2, 4)
151 LAH(rW7, rD2, 3, 0)
/*
 * Combine as before. Instead of prefetching words for another table round,
 * the tail already starts the byte lookups of the final round (LBE reads
 * the byte at offset 8 of the table entry selected by the preceding EAD).
 */
152 evldw rD1,16(rKP)
153 EAD(rD3, 3)
154 evxor rW2,rW2,rW4
155 LWL(rW7, 0)
156 evxor rW2,rW2,rW6
157 EAD(rD2, 0)
158 evxor rD1,rD1,rW2
159 LWL(rW1, 12)
160 evxor rD1,rD1,rW0
161 evldw rD3,24(rKP)
162 evmergehi rD0,rD0,rD1
163 EAD(rD1, 0)
164 evxor rW3,rW3,rW5
165 LBE(rW2)
166 evxor rW3,rW3,rW7
167 EAD(rD0, 1)
168 evxor rD3,rD3,rW3
169 LBE(rW6)
170 evxor rD3,rD3,rW1
171 EAD(rD0, 0)
172 evmergehi rD2,rD2,rD3
/*
 * NOTE(review): rW1 is loaded here and again by LAE(rW1, rD0, 0) two lines
 * below from the same byte of rD0; one of the two loads looks redundant.
 */
173 LBE(rW1)
/*
 * Final round: 16 single byte lookups. Each output word rW0-rW3 starts
 * with a byte in bits 24-31 (lbz) and gets three more bytes inserted by
 * rlwimi into bits 16-23, 8-15 and 0-7; rW4-rW7 are used as temporaries.
 */
174 LAE(rW0, rD3, 0)
175 LAE(rW1, rD0, 0)
176 LAE(rW4, rD2, 1)
177 LAE(rW5, rD3, 1)
178 LAE(rW3, rD2, 0)
179 LAE(rW7, rD1, 1)
180 rlwimi rW0,rW4,8,16,23
181 rlwimi rW1,rW5,8,16,23
182 LAE(rW4, rD1, 2)
183 LAE(rW5, rD2, 2)
184 rlwimi rW2,rW6,8,16,23
185 rlwimi rW3,rW7,8,16,23
186 LAE(rW6, rD3, 2)
187 LAE(rW7, rD0, 2)
188 rlwimi rW0,rW4,16,8,15
189 rlwimi rW1,rW5,16,8,15
190 LAE(rW4, rD0, 3)
191 LAE(rW5, rD1, 3)
192 rlwimi rW2,rW6,16,8,15
/*
 * Each rDn is reloaded with a key word from 32(rKP)..44(rKP) only after
 * its last use as lookup input; the caller xors rD0-rD3 with rW0-rW3.
 */
193 lwz rD0,32(rKP)
194 rlwimi rW3,rW7,16,8,15
195 lwz rD1,36(rKP)
196 LAE(rW6, rD2, 3)
197 LAE(rW7, rD3, 3)
198 rlwimi rW0,rW4,24,0,7
199 lwz rD2,40(rKP)
200 rlwimi rW1,rW5,24,0,7
201 lwz rD3,44(rKP)
202 rlwimi rW2,rW6,24,0,7
203 rlwimi rW3,rW7,24,0,7
204 blr
205
206/*
207 * ppc_decrypt_block: The central decryption function for a single 16 bytes
208 * block. It does no stack handling or register saving to support fast calls
209 * via bl/blr. It expects that caller has pre-xored input data with first
210 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
211 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
212 * and rW0-rW3 and caller must execute a final xor on the output registers.
213 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
214 *
215 */
216_GLOBAL(ppc_decrypt_block)
/*
 * Prologue: start the first round by fetching three table words into
 * rW0, rW6 and rW3 (LAH). Each round inside the loop ends by issuing the
 * same three loads for its successor, so the round code itself only adds
 * the lower words (LAL) for these registers.
 */
217 LAH(rW0, rD1, 0, 12)
218 LAH(rW6, rD0, 3, 0)
219 LAH(rW3, rD0, 1, 8)
/*
 * Main loop, repeated via CTR (bdnz). The body holds two rounds: the first
 * takes its key material from 16(rKP)/24(rKP), the second from
 * 32(rKP)/40(rKP); afterwards rKP is advanced by 32 bytes.
 */
220ppc_decrypt_block_loop:
/*
 * Round A lookups: bytes of rD0-rD3 index the table behind rT0. Every rWn
 * collects two table words, one via LAH and one via LAL.
 */
221 LAH(rW1, rD3, 0, 12)
222 LAL(rW0, rD2, 0, 12)
223 LAH(rW2, rD2, 1, 8)
224 LAL(rW2, rD3, 1, 8)
225 LAH(rW4, rD3, 2, 4)
226 LAL(rW4, rD0, 2, 4)
227 LAL(rW6, rD1, 3, 0)
228 LAH(rW5, rD1, 2, 4)
229 LAH(rW7, rD2, 3, 0)
230 LAL(rW7, rD3, 3, 0)
231 LAL(rW3, rD1, 1, 8)
/*
 * Round A combine (64 bit wide):
 *   rD1 = key at 16(rKP) ^ rW0 ^ rW2 ^ rW4 ^ rW6
 *   rD3 = key at 24(rKP) ^ rW1 ^ rW3 ^ rW5 ^ rW7
 * The two lookups still outstanding (lower words of rW1 and rW5) and the
 * first three lookups of the following round (rW0, rW6, rW3) are
 * interleaved with the xors. evmergehi copies the upper word of rD1/rD3
 * into the lower word of rD0/rD2 so it can serve as lookup input.
 */
232 evldw rD1,16(rKP)
233 EAD(rD0, 0)
234 evxor rW4,rW4,rW6
235 LWL(rW1, 12)
236 evxor rW0,rW0,rW4
237 EAD(rD2, 2)
238 evxor rW0,rW0,rW2
239 LWL(rW5, 4)
240 evxor rD1,rD1,rW0
241 evldw rD3,24(rKP)
242 evmergehi rD0,rD0,rD1
243 EAD(rD1, 0)
244 evxor rW3,rW3,rW7
245 LWH(rW0, 12)
246 evxor rW3,rW3,rW1
247 EAD(rD0, 3)
248 evxor rD3,rD3,rW3
249 LWH(rW6, 0)
250 evxor rD3,rD3,rW5
251 EAD(rD0, 1)
252 evmergehi rD2,rD2,rD3
253 LWH(rW3, 8)
/* Round B lookups: same pattern as round A. */
254 LAH(rW1, rD3, 0, 12)
255 LAL(rW0, rD2, 0, 12)
256 LAH(rW2, rD2, 1, 8)
257 LAL(rW2, rD3, 1, 8)
258 LAH(rW4, rD3, 2, 4)
259 LAL(rW4, rD0, 2, 4)
260 LAL(rW6, rD1, 3, 0)
261 LAH(rW5, rD1, 2, 4)
262 LAH(rW7, rD2, 3, 0)
263 LAL(rW7, rD3, 3, 0)
264 LAL(rW3, rD1, 1, 8)
/* Round B combine: as round A, but with the key at 32(rKP)/40(rKP). */
265 evldw rD1,32(rKP)
266 EAD(rD0, 0)
267 evxor rW4,rW4,rW6
268 LWL(rW1, 12)
269 evxor rW0,rW0,rW4
270 EAD(rD2, 2)
271 evxor rW0,rW0,rW2
272 LWL(rW5, 4)
273 evxor rD1,rD1,rW0
274 evldw rD3,40(rKP)
275 evmergehi rD0,rD0,rD1
276 EAD(rD1, 0)
277 evxor rW3,rW3,rW7
278 LWH(rW0, 12)
279 evxor rW3,rW3,rW1
280 EAD(rD0, 3)
281 evxor rD3,rD3,rW3
282 LWH(rW6, 0)
283 evxor rD3,rD3,rW5
284 EAD(rD0, 1)
285 evmergehi rD2,rD2,rD3
286 LWH(rW3, 8)
/* Step to the next pair of round keys and loop while CTR is not zero. */
287 addi rKP,rKP,32
288 bdnz ppc_decrypt_block_loop
/*
 * One more round of word lookups after the loop; it uses the key at
 * 16(rKP)/24(rKP) relative to the already advanced rKP.
 */
289 LAH(rW1, rD3, 0, 12)
290 LAL(rW0, rD2, 0, 12)
291 LAH(rW2, rD2, 1, 8)
292 LAL(rW2, rD3, 1, 8)
293 LAH(rW4, rD3, 2, 4)
294 LAL(rW4, rD0, 2, 4)
295 LAL(rW6, rD1, 3, 0)
296 LAH(rW5, rD1, 2, 4)
297 LAH(rW7, rD2, 3, 0)
298 LAL(rW7, rD3, 3, 0)
299 LAL(rW3, rD1, 1, 8)
/*
 * Combine as before. Instead of prefetching words for another table round,
 * the tail already starts the byte lookups of the final round. These go
 * through rT1 (DAD/LBD), which is not initialised in this function and
 * therefore has to be set up by the caller as well.
 */
300 evldw rD1,16(rKP)
301 EAD(rD0, 0)
302 evxor rW4,rW4,rW6
303 LWL(rW1, 12)
304 evxor rW0,rW0,rW4
305 EAD(rD2, 2)
306 evxor rW0,rW0,rW2
307 LWL(rW5, 4)
308 evxor rD1,rD1,rW0
309 evldw rD3,24(rKP)
310 evmergehi rD0,rD0,rD1
311 DAD(rD1, 0)
312 evxor rW3,rW3,rW7
313 LBD(rW0)
314 evxor rW3,rW3,rW1
315 DAD(rD0, 1)
316 evxor rD3,rD3,rW3
317 LBD(rW6)
318 evxor rD3,rD3,rW5
319 DAD(rD0, 0)
320 evmergehi rD2,rD2,rD3
321 LBD(rW3)
/*
 * Final round: single byte lookups via rT1. Each output word rW0-rW3
 * starts with a byte in bits 24-31 (lbz) and gets three more bytes
 * inserted by rlwimi into bits 16-23, 8-15 and 0-7; rW4-rW7 are used as
 * temporaries.
 */
322 LAD(rW2, rD3, 0)
323 LAD(rW1, rD2, 0)
324 LAD(rW4, rD2, 1)
325 LAD(rW5, rD3, 1)
326 LAD(rW7, rD1, 1)
327 rlwimi rW0,rW4,8,16,23
328 rlwimi rW1,rW5,8,16,23
329 LAD(rW4, rD3, 2)
330 LAD(rW5, rD0, 2)
331 rlwimi rW2,rW6,8,16,23
332 rlwimi rW3,rW7,8,16,23
333 LAD(rW6, rD1, 2)
334 LAD(rW7, rD2, 2)
335 rlwimi rW0,rW4,16,8,15
336 rlwimi rW1,rW5,16,8,15
337 LAD(rW4, rD0, 3)
338 LAD(rW5, rD1, 3)
339 rlwimi rW2,rW6,16,8,15
/*
 * Each rDn is reloaded with a key word from 32(rKP)..44(rKP) only after
 * its last use as lookup input; the caller xors rD0-rD3 with rW0-rW3.
 */
340 lwz rD0,32(rKP)
341 rlwimi rW3,rW7,16,8,15
342 lwz rD1,36(rKP)
343 LAD(rW6, rD2, 3)
344 LAD(rW7, rD3, 3)
345 rlwimi rW0,rW4,24,0,7
346 lwz rD2,40(rKP)
347 rlwimi rW1,rW5,24,0,7
348 lwz rD3,44(rKP)
349 rlwimi rW2,rW6,24,0,7
350 rlwimi rW3,rW7,24,0,7
351 blr
351 blr