blob: 0d9ce156eaaa4f8e213f135916fe964a8cc1afea [file] [log] [blame]
Robert Sloan89678152019-03-12 14:24:00 -07001#!/usr/bin/env perl
2# Copyright (c) 2019, Google Inc.
3#
4# Permission to use, copy, modify, and/or distribute this software for any
5# purpose with or without fee is hereby granted, provided that the above
6# copyright notice and this permission notice appear in all copies.
7#
8# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
11# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
13# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
14# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
16# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit
17# table-based GHASH implementation. It requires SSSE3 instructions.
18#
19# For background, the table-based strategy is a 4-bit windowed multiplication.
20# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
21# over 4-bit windows of the input and indexes them up into the table. Visually,
22# it multiplies as in the schoolbook multiplication diagram below, but with
23# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
24# it incorporates the terms labeled '1' by indexing the most significant term
25# of X into the table. Then it shifts and repeats for '2' and so on.
26#
27# hhhhhh
28# * xxxxxx
29# ============
30# 666666
31# 555555
32# 444444
33# 333333
34# 222222
35# 111111
36#
37# This implementation changes the order. We treat the table as a 16×16 matrix
38# and transpose it. The first row is then the first byte of each multiple of H,
39# and so on. We then reorder terms as below. Observe that the terms labeled '1'
40# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
41# pshufb instruction, using alternating terms of X in parallel as indices. This
42# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
43# repeat for each row.
44#
45# hhhhhh
46# * xxxxxx
47# ============
48# 224466
49# 113355
50# 224466
51# 113355
52# 224466
53# 113355
54#
55# Next we account for GCM's confusing bit order. The "first" bit is the least
56# significant coefficient, but GCM treats the most sigificant bit within a byte
57# as first. Bytes are little-endian, and bits are big-endian. We reverse the
58# bytes in XMM registers for a consistent bit and byte ordering, but this means
59# the least significant bit is the most significant coefficient and vice versa.
60#
61# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
62# bit ordering within the XMM register, rather than the reversed coefficient
63# ordering. Low bits are less significant bits and more significant
64# coefficients. Right-shifts move from MSB to the LSB and correspond to
65# increasing the power of each coefficient.
66#
67# Note this bit reversal enters into the table's column indices. H*1 is stored
68# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
69# table rows contain more significant coefficients, so we iterate forwards.
70
71$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
72push(@INC,"${dir}","${dir}../../../perlasm");
73require "x86asm.pl";
74
75$output = pop;
76open STDOUT, ">$output";
77
78&asm_init($ARGV[0]);
79
80my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx");
81&static_label("reverse_bytes");
82&static_label("low4_mask");
83
84my $call_counter = 0;
85# process_rows emits assembly code to process $rows rows of the table. On
86# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the
87# low and high halves of the input. The result so far is passed in xmm2. xmm3
88# must be zero. On output, $Htable is advanced to the next row and xmm2 is
89# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6.
90sub process_rows {
91 my ($rows) = @_;
92 $call_counter++;
93
94 # Shifting whole XMM registers by bits is complex. psrldq shifts by
95 # bytes, and psrlq shifts the two 64-bit halves separately. Each row
96 # produces 8 bits of carry, and the reduction needs an additional 7-bit
97 # shift. This must fit in 64 bits so reduction can use psrlq. This
98 # allows up to 7 rows at a time.
99 die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
100
101 &mov("eax", $rows);
102&set_label("loop_row_$call_counter");
103 &movdqa("xmm4", &QWP(0, $Htable));
104 &lea($Htable, &DWP(16, $Htable));
105
106 # Right-shift xmm2 and xmm3 by 8 bytes.
107 &movdqa("xmm6", "xmm2");
108 &palignr("xmm6", "xmm3", 1);
109 &movdqa("xmm3", "xmm6");
110 &psrldq("xmm2", 1);
111
112 # Load the next table row and index the low and high bits of the input.
113 # Note the low (respectively, high) half corresponds to more
114 # (respectively, less) significant coefficients.
115 &movdqa("xmm5", "xmm4");
116 &pshufb("xmm4", "xmm0");
117 &pshufb("xmm5", "xmm1");
118
119 # Add the high half (xmm5) without shifting.
120 &pxor("xmm2", "xmm5");
121
122 # Add the low half (xmm4). This must be right-shifted by 4 bits. First,
123 # add into the carry register (xmm3).
124 &movdqa("xmm5", "xmm4");
125 &psllq("xmm5", 60);
126 &movdqa("xmm6", "xmm5");
127 &pslldq("xmm6", 8);
128 &pxor("xmm3", "xmm6");
129
130 # Next, add into xmm2.
131 &psrldq("xmm5", 8);
132 &pxor("xmm2", "xmm5");
133 &psrlq("xmm4", 4);
134 &pxor("xmm2", "xmm4");
135
136 &sub("eax", 1);
137 &jnz(&label("loop_row_$call_counter"));
138
139 # Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
140 # x^7, so we shift and XOR four times.
141 &pxor("xmm2", "xmm3"); # x^0 = 0
142 &psrlq("xmm3", 1);
143 &pxor("xmm2", "xmm3"); # x^1 = x
144 &psrlq("xmm3", 1);
145 &pxor("xmm2", "xmm3"); # x^(1+1) = x^2
146 &psrlq("xmm3", 5);
147 &pxor("xmm2", "xmm3"); # x^(1+1+5) = x^7
148 &pxor("xmm3", "xmm3");
149____
150}
151
152# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
153# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
154# formatted as described above.
155# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
156&function_begin("gcm_gmult_ssse3");
157 &mov($Xi, &wparam(0));
158 &mov($Htable, &wparam(1));
159
160 &movdqu("xmm0", &QWP(0, $Xi));
161 &call(&label("pic_point"));
162&set_label("pic_point");
163 &blindpop("eax");
164 &movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax"));
165 &movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax"));
166
167 # Reverse input bytes to deserialize.
168 &pshufb("xmm0", "xmm7");
169
170 # Split each byte into low (xmm0) and high (xmm1) halves.
171 &movdqa("xmm1", "xmm2");
172 &pandn("xmm1", "xmm0");
173 &psrld("xmm1", 4);
174 &pand("xmm0", "xmm2");
175
176 # Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
177 # that, due to bit reversal, xmm3 contains bits that fall off when
178 # right-shifting, not left-shifting.
179 &pxor("xmm2", "xmm2");
180 &pxor("xmm3", "xmm3");
181
182 # We must reduce at least once every 7 rows, so divide into three
183 # chunks.
184 &process_rows(5);
185 &process_rows(5);
186 &process_rows(6);
187
188 # Store the result. Reverse bytes to serialize.
189 &pshufb("xmm2", "xmm7");
190 &movdqu(&QWP(0, $Xi), "xmm2");
191
192 # Zero any registers which contain secrets.
193 &pxor("xmm0", "xmm0");
194 &pxor("xmm1", "xmm1");
195 &pxor("xmm2", "xmm2");
196 &pxor("xmm3", "xmm3");
197 &pxor("xmm4", "xmm4");
198 &pxor("xmm5", "xmm5");
199 &pxor("xmm6", "xmm6");
200&function_end("gcm_gmult_ssse3");
201
202# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
203# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
204# serialized byte representation. |Htable| is formatted as described above.
205# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
206# size_t len);
207&function_begin("gcm_ghash_ssse3");
208 &mov($Xi, &wparam(0));
209 &mov($Htable, &wparam(1));
210 &mov($in, &wparam(2));
211 &mov($len, &wparam(3));
212
213 &movdqu("xmm0", &QWP(0, $Xi));
214 &call(&label("pic_point"));
215&set_label("pic_point");
216 &blindpop("ebx");
217 &movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx"));
218
219 # This function only processes whole blocks.
220 &and($len, -16);
221
222 # Reverse input bytes to deserialize. We maintain the running
223 # total in xmm0.
224 &pshufb("xmm0", "xmm7");
225
226 # Iterate over each block. On entry to each iteration, xmm3 is zero.
227 &pxor("xmm3", "xmm3");
228&set_label("loop_ghash");
229 &movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx"));
230
231 # Incorporate the next block of input.
232 &movdqu("xmm1", &QWP(0, $in));
233 &pshufb("xmm1", "xmm7"); # Reverse bytes.
234 &pxor("xmm0", "xmm1");
235
236 # Split each byte into low (xmm0) and high (xmm1) halves.
237 &movdqa("xmm1", "xmm2");
238 &pandn("xmm1", "xmm0");
239 &psrld("xmm1", 4);
240 &pand("xmm0", "xmm2");
241
242 # Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
243 # that, due to bit reversal, xmm3 contains bits that fall off when
244 # right-shifting, not left-shifting.
245 &pxor("xmm2", "xmm2");
246 # xmm3 is already zero at this point.
247
248 # We must reduce at least once every 7 rows, so divide into three
249 # chunks.
250 &process_rows(5);
251 &process_rows(5);
252 &process_rows(6);
253
254 &movdqa("xmm0", "xmm2");
255
256 # Rewind $Htable for the next iteration.
257 &lea($Htable, &DWP(-256, $Htable));
258
259 # Advance input and continue.
260 &lea($in, &DWP(16, $in));
261 &sub($len, 16);
262 &jnz(&label("loop_ghash"));
263
264 # Reverse bytes and store the result.
265 &pshufb("xmm0", "xmm7");
266 &movdqu(&QWP(0, $Xi), "xmm0");
267
268 # Zero any registers which contain secrets.
269 &pxor("xmm0", "xmm0");
270 &pxor("xmm1", "xmm1");
271 &pxor("xmm2", "xmm2");
272 &pxor("xmm3", "xmm3");
273 &pxor("xmm4", "xmm4");
274 &pxor("xmm5", "xmm5");
275 &pxor("xmm6", "xmm6");
276&function_end("gcm_ghash_ssse3");
277
278# reverse_bytes is a permutation which, if applied with pshufb, reverses the
279# bytes in an XMM register.
280&set_label("reverse_bytes", 16);
281&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
282# low4_mask is an XMM mask which selects the low four bits of each byte.
283&set_label("low4_mask", 16);
284&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
285
286&asm_finish();
287
Srinivas Paladugudd42a612019-08-09 19:30:39 +0000288close STDOUT;