blob: 3b6c34eefd515ac3ad528c0ef32aa670f00cd3f1 [file] [log] [blame]
Christian Heimes985ecdc2013-11-20 11:46:18 +01001/* Set of hash utility functions to help maintaining the invariant that
2 if a==b then hash(a)==hash(b)
3
4 All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5*/
6#include "Python.h"
7
8#ifdef __APPLE__
9# include <libkern/OSByteOrder.h>
10#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
11# include <endian.h>
12#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
13# include <sys/endian.h>
14#endif
15
16#ifdef __cplusplus
17extern "C" {
18#endif
19
Serhiy Storchakabfe4fd52018-02-09 17:31:26 +020020_Py_HashSecret_t _Py_HashSecret = {{0}};
Christian Heimes985ecdc2013-11-20 11:46:18 +010021
22#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
23extern PyHash_FuncDef PyHash_Func;
24#else
25static PyHash_FuncDef PyHash_Func;
26#endif
27
28/* Count _Py_HashBytes() calls */
29#ifdef Py_HASH_STATS
30#define Py_HASH_STATS_MAX 32
31static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
32#endif
33
34/* For numeric types, the hash of a number x is based on the reduction
35 of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
36 hash(x) == hash(y) whenever x and y are numerically equal, even if
37 x and y have different types.
38
39 A quick summary of the hashing strategy:
40
41 (1) First define the 'reduction of x modulo P' for any rational
42 number x; this is a standard extension of the usual notion of
43 reduction modulo P for integers. If x == p/q (written in lowest
44 terms), the reduction is interpreted as the reduction of p times
45 the inverse of the reduction of q, all modulo P; if q is exactly
46 divisible by P then define the reduction to be infinity. So we've
47 got a well-defined map
48
49 reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
50
51 (2) Now for a rational number x, define hash(x) by:
52
53 reduce(x) if x >= 0
54 -reduce(-x) if x < 0
55
56 If the result of the reduction is infinity (this is impossible for
57 integers, floats and Decimals) then use the predefined hash value
58 _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
59 _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
60 hashes of float and Decimal infinities and nans.
61
62 A selling point for the above strategy is that it makes it possible
63 to compute hashes of decimal and binary floating-point numbers
64 efficiently, even if the exponent of the binary or decimal number
65 is large. The key point is that
66
67 reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
68
69 provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
70 binary or decimal float is never infinity, since the denominator is a power
71 of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
72 for nonnegative x,
73
74 reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
75
76 reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
77
78 and reduce(10**e) can be computed efficiently by the usual modular
79 exponentiation algorithm. For reduce(2**e) it's even better: since
80 P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
81 by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
82
83 */
84
85Py_hash_t
86_Py_HashDouble(double v)
87{
88 int e, sign;
89 double m;
90 Py_uhash_t x, y;
91
92 if (!Py_IS_FINITE(v)) {
93 if (Py_IS_INFINITY(v))
94 return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
95 else
96 return _PyHASH_NAN;
97 }
98
99 m = frexp(v, &e);
100
101 sign = 1;
102 if (m < 0) {
103 sign = -1;
104 m = -m;
105 }
106
107 /* process 28 bits at a time; this should work well both for binary
108 and hexadecimal floating point. */
109 x = 0;
110 while (m) {
111 x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
112 m *= 268435456.0; /* 2**28 */
113 e -= 28;
114 y = (Py_uhash_t)m; /* pull out integer part */
115 m -= y;
116 x += y;
117 if (x >= _PyHASH_MODULUS)
118 x -= _PyHASH_MODULUS;
119 }
120
121 /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
122 e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
123 x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
124
125 x = x * sign;
126 if (x == (Py_uhash_t)-1)
127 x = (Py_uhash_t)-2;
128 return (Py_hash_t)x;
129}
130
131Py_hash_t
Victor Stinnerf4532212020-05-12 18:46:20 +0200132_Py_HashPointerRaw(const void *p)
Christian Heimes985ecdc2013-11-20 11:46:18 +0100133{
Christian Heimes985ecdc2013-11-20 11:46:18 +0100134 size_t y = (size_t)p;
135 /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
136 excessive hash collisions for dicts and sets */
137 y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
Victor Stinnerf4532212020-05-12 18:46:20 +0200138 return (Py_hash_t)y;
139}
140
141Py_hash_t
142_Py_HashPointer(const void *p)
143{
144 Py_hash_t x = _Py_HashPointerRaw(p);
145 if (x == -1) {
Christian Heimes985ecdc2013-11-20 11:46:18 +0100146 x = -2;
Victor Stinnerf4532212020-05-12 18:46:20 +0200147 }
Christian Heimes985ecdc2013-11-20 11:46:18 +0100148 return x;
149}
150
151Py_hash_t
152_Py_HashBytes(const void *src, Py_ssize_t len)
153{
154 Py_hash_t x;
155 /*
156 We make the hash of the empty string be 0, rather than using
157 (prefix ^ suffix), since this slightly obfuscates the hash secret
158 */
159 if (len == 0) {
160 return 0;
161 }
162
163#ifdef Py_HASH_STATS
164 hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
165#endif
166
167#if Py_HASH_CUTOFF > 0
168 if (len < Py_HASH_CUTOFF) {
169 /* Optimize hashing of very small strings with inline DJBX33A. */
170 Py_uhash_t hash;
171 const unsigned char *p = src;
172 hash = 5381; /* DJBX33A starts with 5381 */
173
174 switch(len) {
175 /* ((hash << 5) + hash) + *p == hash * 33 + *p */
176 case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
177 case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
178 case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
179 case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
180 case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
181 case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
182 case 1: hash = ((hash << 5) + hash) + *p++; break;
183 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700184 Py_UNREACHABLE();
Christian Heimes985ecdc2013-11-20 11:46:18 +0100185 }
186 hash ^= len;
187 hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
188 x = (Py_hash_t)hash;
189 }
190 else
191#endif /* Py_HASH_CUTOFF */
192 x = PyHash_Func.hash(src, len);
193
194 if (x == -1)
195 return -2;
196 return x;
197}
198
199void
200_PyHash_Fini(void)
201{
202#ifdef Py_HASH_STATS
Christian Heimes985ecdc2013-11-20 11:46:18 +0100203 fprintf(stderr, "len calls total\n");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200204 Py_ssize_t total = 0;
205 for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
Christian Heimes985ecdc2013-11-20 11:46:18 +0100206 total += hashstats[i];
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200207 fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
Christian Heimes985ecdc2013-11-20 11:46:18 +0100208 }
209 total += hashstats[0];
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200210 fprintf(stderr, "> %8zd %8zd\n", hashstats[0], total);
Christian Heimes985ecdc2013-11-20 11:46:18 +0100211#endif
212}
213
214PyHash_FuncDef *
215PyHash_GetFuncDef(void)
216{
217 return &PyHash_Func;
218}
219
220/* Optimized memcpy() for Windows */
221#ifdef _MSC_VER
222# if SIZEOF_PY_UHASH_T == 4
223# define PY_UHASH_CPY(dst, src) do { \
224 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
225 } while(0)
226# elif SIZEOF_PY_UHASH_T == 8
227# define PY_UHASH_CPY(dst, src) do { \
228 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
229 dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
230 } while(0)
231# else
232# error SIZEOF_PY_UHASH_T must be 4 or 8
233# endif /* SIZEOF_PY_UHASH_T */
234#else /* not Windows */
235# define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
236#endif /* _MSC_VER */
237
238
239#if Py_HASH_ALGORITHM == Py_HASH_FNV
240/* **************************************************************************
241 * Modified Fowler-Noll-Vo (FNV) hash function
242 */
243static Py_hash_t
244fnv(const void *src, Py_ssize_t len)
245{
246 const unsigned char *p = src;
247 Py_uhash_t x;
248 Py_ssize_t remainder, blocks;
249 union {
250 Py_uhash_t value;
251 unsigned char bytes[SIZEOF_PY_UHASH_T];
252 } block;
253
254#ifdef Py_DEBUG
255 assert(_Py_HashSecret_Initialized);
256#endif
257 remainder = len % SIZEOF_PY_UHASH_T;
258 if (remainder == 0) {
259 /* Process at least one block byte by byte to reduce hash collisions
260 * for strings with common prefixes. */
261 remainder = SIZEOF_PY_UHASH_T;
262 }
263 blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
264
265 x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
266 x ^= (Py_uhash_t) *p << 7;
267 while (blocks--) {
268 PY_UHASH_CPY(block.bytes, p);
269 x = (_PyHASH_MULTIPLIER * x) ^ block.value;
270 p += SIZEOF_PY_UHASH_T;
271 }
272 /* add remainder */
273 for (; remainder > 0; remainder--)
274 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
275 x ^= (Py_uhash_t) len;
276 x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
A. Jesse Jiryu Davisa8eb5852018-06-04 06:57:08 -0400277 if (x == (Py_uhash_t) -1) {
278 x = (Py_uhash_t) -2;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100279 }
280 return x;
281}
282
283static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
284 16 * SIZEOF_PY_HASH_T};
285
286#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
287
288
Christian Heimes985ecdc2013-11-20 11:46:18 +0100289/* **************************************************************************
290 <MIT License>
291 Copyright (c) 2013 Marek Majkowski <marek@popcount.org>
292
293 Permission is hereby granted, free of charge, to any person obtaining a copy
294 of this software and associated documentation files (the "Software"), to deal
295 in the Software without restriction, including without limitation the rights
296 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
297 copies of the Software, and to permit persons to whom the Software is
298 furnished to do so, subject to the following conditions:
299
300 The above copyright notice and this permission notice shall be included in
301 all copies or substantial portions of the Software.
302
303 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
304 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
305 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
306 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
307 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
308 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
309 THE SOFTWARE.
310 </MIT License>
311
312 Original location:
313 https://github.com/majek/csiphash/
314
315 Solution inspired by code from:
316 Samuel Neves (supercop/crypto_auth/siphash24/little)
317 djb (supercop/crypto_auth/siphash24/little2)
318 Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
319
320 Modified for Python by Christian Heimes:
321 - C89 / MSVC compatibility
Christian Heimes985ecdc2013-11-20 11:46:18 +0100322 - _rotl64() on Windows
323 - letoh64() fallback
324*/
325
Christian Heimes985ecdc2013-11-20 11:46:18 +0100326/* byte swap little endian to host endian
327 * Endian conversion not only ensures that the hash function returns the same
328 * value on all platforms. It is also required to for a good dispersion of
329 * the hash values' least significant bits.
330 */
331#if PY_LITTLE_ENDIAN
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700332# define _le64toh(x) ((uint64_t)(x))
Christian Heimes985ecdc2013-11-20 11:46:18 +0100333#elif defined(__APPLE__)
334# define _le64toh(x) OSSwapLittleToHostInt64(x)
335#elif defined(HAVE_LETOH64)
336# define _le64toh(x) le64toh(x)
337#else
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700338# define _le64toh(x) (((uint64_t)(x) << 56) | \
339 (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
340 (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
341 (((uint64_t)(x) << 8) & 0xff00000000ULL) | \
342 (((uint64_t)(x) >> 8) & 0xff000000ULL) | \
343 (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
344 (((uint64_t)(x) >> 40) & 0xff00ULL) | \
345 ((uint64_t)(x) >> 56))
Christian Heimes985ecdc2013-11-20 11:46:18 +0100346#endif
347
348
349#ifdef _MSC_VER
350# define ROTATE(x, b) _rotl64(x, b)
351#else
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700352# define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
Christian Heimes985ecdc2013-11-20 11:46:18 +0100353#endif
354
355#define HALF_ROUND(a,b,c,d,s,t) \
356 a += b; c += d; \
357 b = ROTATE(b, s) ^ a; \
358 d = ROTATE(d, t) ^ c; \
359 a = ROTATE(a, 32);
360
361#define DOUBLE_ROUND(v0,v1,v2,v3) \
362 HALF_ROUND(v0,v1,v2,v3,13,16); \
363 HALF_ROUND(v2,v1,v0,v3,17,21); \
364 HALF_ROUND(v0,v1,v2,v3,13,16); \
365 HALF_ROUND(v2,v1,v0,v3,17,21);
366
367
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800368static uint64_t
Benjamin Peterson4e3e1562017-12-09 11:24:18 -0800369siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700370 uint64_t b = (uint64_t)src_sz << 56;
Andy Lestere6be9b52020-02-11 20:28:35 -0600371 const uint8_t *in = (const uint8_t*)src;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100372
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700373 uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
374 uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
375 uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
376 uint64_t v3 = k1 ^ 0x7465646279746573ULL;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100377
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700378 uint64_t t;
379 uint8_t *pt;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100380
381 while (src_sz >= 8) {
Rolf Eike Beer1e2ec8a2018-05-13 12:57:31 +0200382 uint64_t mi;
383 memcpy(&mi, in, sizeof(mi));
384 mi = _le64toh(mi);
385 in += sizeof(mi);
386 src_sz -= sizeof(mi);
Christian Heimes985ecdc2013-11-20 11:46:18 +0100387 v3 ^= mi;
388 DOUBLE_ROUND(v0,v1,v2,v3);
389 v0 ^= mi;
390 }
391
392 t = 0;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700393 pt = (uint8_t *)&t;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100394 switch (src_sz) {
Rolf Eike Beer1e2ec8a2018-05-13 12:57:31 +0200395 case 7: pt[6] = in[6]; /* fall through */
396 case 6: pt[5] = in[5]; /* fall through */
397 case 5: pt[4] = in[4]; /* fall through */
398 case 4: memcpy(pt, in, sizeof(uint32_t)); break;
399 case 3: pt[2] = in[2]; /* fall through */
400 case 2: pt[1] = in[1]; /* fall through */
401 case 1: pt[0] = in[0]; /* fall through */
Christian Heimes985ecdc2013-11-20 11:46:18 +0100402 }
403 b |= _le64toh(t);
404
405 v3 ^= b;
406 DOUBLE_ROUND(v0,v1,v2,v3);
407 v0 ^= b;
408 v2 ^= 0xff;
409 DOUBLE_ROUND(v0,v1,v2,v3);
410 DOUBLE_ROUND(v0,v1,v2,v3);
411
412 /* modified */
413 t = (v0 ^ v1) ^ (v2 ^ v3);
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800414 return t;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100415}
416
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800417uint64_t
418_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
419{
420 return siphash24(key, 0, src, src_sz);
421}
422
423
424#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
Batuhan Taşkaya1b215732020-04-05 00:25:12 +0300425static Py_hash_t
426pysiphash(const void *src, Py_ssize_t src_sz) {
427 return (Py_hash_t)siphash24(
428 _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
429 src, src_sz);
430}
431
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800432static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
433#endif
Christian Heimes985ecdc2013-11-20 11:46:18 +0100434
435#ifdef __cplusplus
436}
437#endif