blob: f0c82356f1e26cf11d848cdcc4fd0f7d8a2adb13 [file] [log] [blame]
Christian Heimes985ecdc2013-11-20 11:46:18 +01001/* Set of hash utility functions to help maintaining the invariant that
2 if a==b then hash(a)==hash(b)
3
4 All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5*/
6#include "Python.h"
7
8#ifdef __APPLE__
9# include <libkern/OSByteOrder.h>
10#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
11# include <endian.h>
12#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
13# include <sys/endian.h>
14#endif
15
16#ifdef __cplusplus
17extern "C" {
18#endif
19
Serhiy Storchakabfe4fd52018-02-09 17:31:26 +020020_Py_HashSecret_t _Py_HashSecret = {{0}};
Christian Heimes985ecdc2013-11-20 11:46:18 +010021
22#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
23extern PyHash_FuncDef PyHash_Func;
24#else
25static PyHash_FuncDef PyHash_Func;
26#endif
27
28/* Count _Py_HashBytes() calls */
29#ifdef Py_HASH_STATS
30#define Py_HASH_STATS_MAX 32
31static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
32#endif
33
34/* For numeric types, the hash of a number x is based on the reduction
35 of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
36 hash(x) == hash(y) whenever x and y are numerically equal, even if
37 x and y have different types.
38
39 A quick summary of the hashing strategy:
40
41 (1) First define the 'reduction of x modulo P' for any rational
42 number x; this is a standard extension of the usual notion of
43 reduction modulo P for integers. If x == p/q (written in lowest
44 terms), the reduction is interpreted as the reduction of p times
45 the inverse of the reduction of q, all modulo P; if q is exactly
46 divisible by P then define the reduction to be infinity. So we've
47 got a well-defined map
48
49 reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
50
51 (2) Now for a rational number x, define hash(x) by:
52
53 reduce(x) if x >= 0
54 -reduce(-x) if x < 0
55
56 If the result of the reduction is infinity (this is impossible for
57 integers, floats and Decimals) then use the predefined hash value
58 _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
Raymond Hettingera07da092021-04-22 08:34:57 -070059 _PyHASH_INF and -_PyHASH_INF are also used for the
60 hashes of float and Decimal infinities.
61
62 NaNs hash with a pointer hash. Having distinct hash values prevents
63 catastrophic pileups from distinct NaN instances which used to always
64 have the same hash value but would compare unequal.
Christian Heimes985ecdc2013-11-20 11:46:18 +010065
66 A selling point for the above strategy is that it makes it possible
67 to compute hashes of decimal and binary floating-point numbers
68 efficiently, even if the exponent of the binary or decimal number
69 is large. The key point is that
70
71 reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
72
73 provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
74 binary or decimal float is never infinity, since the denominator is a power
75 of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
76 for nonnegative x,
77
78 reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
79
80 reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
81
82 and reduce(10**e) can be computed efficiently by the usual modular
83 exponentiation algorithm. For reduce(2**e) it's even better: since
84 P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
85 by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
86
87 */
88
Raymond Hettingera07da092021-04-22 08:34:57 -070089Py_hash_t _Py_HashPointer(const void *);
90
Christian Heimes985ecdc2013-11-20 11:46:18 +010091Py_hash_t
Raymond Hettingera07da092021-04-22 08:34:57 -070092_Py_HashDouble(PyObject *inst, double v)
Christian Heimes985ecdc2013-11-20 11:46:18 +010093{
94 int e, sign;
95 double m;
96 Py_uhash_t x, y;
97
98 if (!Py_IS_FINITE(v)) {
99 if (Py_IS_INFINITY(v))
100 return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
101 else
Raymond Hettingera07da092021-04-22 08:34:57 -0700102 return _Py_HashPointer(inst);
Christian Heimes985ecdc2013-11-20 11:46:18 +0100103 }
104
105 m = frexp(v, &e);
106
107 sign = 1;
108 if (m < 0) {
109 sign = -1;
110 m = -m;
111 }
112
113 /* process 28 bits at a time; this should work well both for binary
114 and hexadecimal floating point. */
115 x = 0;
116 while (m) {
117 x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
118 m *= 268435456.0; /* 2**28 */
119 e -= 28;
120 y = (Py_uhash_t)m; /* pull out integer part */
121 m -= y;
122 x += y;
123 if (x >= _PyHASH_MODULUS)
124 x -= _PyHASH_MODULUS;
125 }
126
127 /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
128 e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
129 x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
130
131 x = x * sign;
132 if (x == (Py_uhash_t)-1)
133 x = (Py_uhash_t)-2;
134 return (Py_hash_t)x;
135}
136
137Py_hash_t
Victor Stinnerf4532212020-05-12 18:46:20 +0200138_Py_HashPointerRaw(const void *p)
Christian Heimes985ecdc2013-11-20 11:46:18 +0100139{
Christian Heimes985ecdc2013-11-20 11:46:18 +0100140 size_t y = (size_t)p;
141 /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
142 excessive hash collisions for dicts and sets */
143 y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
Victor Stinnerf4532212020-05-12 18:46:20 +0200144 return (Py_hash_t)y;
145}
146
147Py_hash_t
148_Py_HashPointer(const void *p)
149{
150 Py_hash_t x = _Py_HashPointerRaw(p);
151 if (x == -1) {
Christian Heimes985ecdc2013-11-20 11:46:18 +0100152 x = -2;
Victor Stinnerf4532212020-05-12 18:46:20 +0200153 }
Christian Heimes985ecdc2013-11-20 11:46:18 +0100154 return x;
155}
156
157Py_hash_t
158_Py_HashBytes(const void *src, Py_ssize_t len)
159{
160 Py_hash_t x;
161 /*
162 We make the hash of the empty string be 0, rather than using
163 (prefix ^ suffix), since this slightly obfuscates the hash secret
164 */
165 if (len == 0) {
166 return 0;
167 }
168
169#ifdef Py_HASH_STATS
170 hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
171#endif
172
173#if Py_HASH_CUTOFF > 0
174 if (len < Py_HASH_CUTOFF) {
175 /* Optimize hashing of very small strings with inline DJBX33A. */
176 Py_uhash_t hash;
177 const unsigned char *p = src;
178 hash = 5381; /* DJBX33A starts with 5381 */
179
180 switch(len) {
181 /* ((hash << 5) + hash) + *p == hash * 33 + *p */
182 case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
183 case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
184 case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
185 case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
186 case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
187 case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
188 case 1: hash = ((hash << 5) + hash) + *p++; break;
189 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700190 Py_UNREACHABLE();
Christian Heimes985ecdc2013-11-20 11:46:18 +0100191 }
192 hash ^= len;
193 hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
194 x = (Py_hash_t)hash;
195 }
196 else
197#endif /* Py_HASH_CUTOFF */
198 x = PyHash_Func.hash(src, len);
199
200 if (x == -1)
201 return -2;
202 return x;
203}
204
205void
206_PyHash_Fini(void)
207{
208#ifdef Py_HASH_STATS
Christian Heimes985ecdc2013-11-20 11:46:18 +0100209 fprintf(stderr, "len calls total\n");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200210 Py_ssize_t total = 0;
211 for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
Christian Heimes985ecdc2013-11-20 11:46:18 +0100212 total += hashstats[i];
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200213 fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
Christian Heimes985ecdc2013-11-20 11:46:18 +0100214 }
215 total += hashstats[0];
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200216 fprintf(stderr, "> %8zd %8zd\n", hashstats[0], total);
Christian Heimes985ecdc2013-11-20 11:46:18 +0100217#endif
218}
219
220PyHash_FuncDef *
221PyHash_GetFuncDef(void)
222{
223 return &PyHash_Func;
224}
225
226/* Optimized memcpy() for Windows */
227#ifdef _MSC_VER
228# if SIZEOF_PY_UHASH_T == 4
229# define PY_UHASH_CPY(dst, src) do { \
230 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
231 } while(0)
232# elif SIZEOF_PY_UHASH_T == 8
233# define PY_UHASH_CPY(dst, src) do { \
234 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
235 dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
236 } while(0)
237# else
238# error SIZEOF_PY_UHASH_T must be 4 or 8
239# endif /* SIZEOF_PY_UHASH_T */
240#else /* not Windows */
241# define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
242#endif /* _MSC_VER */
243
244
245#if Py_HASH_ALGORITHM == Py_HASH_FNV
246/* **************************************************************************
247 * Modified Fowler-Noll-Vo (FNV) hash function
248 */
249static Py_hash_t
250fnv(const void *src, Py_ssize_t len)
251{
252 const unsigned char *p = src;
253 Py_uhash_t x;
254 Py_ssize_t remainder, blocks;
255 union {
256 Py_uhash_t value;
257 unsigned char bytes[SIZEOF_PY_UHASH_T];
258 } block;
259
260#ifdef Py_DEBUG
261 assert(_Py_HashSecret_Initialized);
262#endif
263 remainder = len % SIZEOF_PY_UHASH_T;
264 if (remainder == 0) {
265 /* Process at least one block byte by byte to reduce hash collisions
266 * for strings with common prefixes. */
267 remainder = SIZEOF_PY_UHASH_T;
268 }
269 blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
270
271 x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
272 x ^= (Py_uhash_t) *p << 7;
273 while (blocks--) {
274 PY_UHASH_CPY(block.bytes, p);
275 x = (_PyHASH_MULTIPLIER * x) ^ block.value;
276 p += SIZEOF_PY_UHASH_T;
277 }
278 /* add remainder */
279 for (; remainder > 0; remainder--)
280 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
281 x ^= (Py_uhash_t) len;
282 x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
A. Jesse Jiryu Davisa8eb5852018-06-04 06:57:08 -0400283 if (x == (Py_uhash_t) -1) {
284 x = (Py_uhash_t) -2;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100285 }
286 return x;
287}
288
289static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
290 16 * SIZEOF_PY_HASH_T};
291
292#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
293
294
Christian Heimes985ecdc2013-11-20 11:46:18 +0100295/* **************************************************************************
296 <MIT License>
297 Copyright (c) 2013 Marek Majkowski <marek@popcount.org>
298
299 Permission is hereby granted, free of charge, to any person obtaining a copy
300 of this software and associated documentation files (the "Software"), to deal
301 in the Software without restriction, including without limitation the rights
302 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
303 copies of the Software, and to permit persons to whom the Software is
304 furnished to do so, subject to the following conditions:
305
306 The above copyright notice and this permission notice shall be included in
307 all copies or substantial portions of the Software.
308
309 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
310 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
311 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
312 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
313 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
314 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
315 THE SOFTWARE.
316 </MIT License>
317
318 Original location:
319 https://github.com/majek/csiphash/
320
321 Solution inspired by code from:
322 Samuel Neves (supercop/crypto_auth/siphash24/little)
323 djb (supercop/crypto_auth/siphash24/little2)
324 Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
325
326 Modified for Python by Christian Heimes:
327 - C89 / MSVC compatibility
Christian Heimes985ecdc2013-11-20 11:46:18 +0100328 - _rotl64() on Windows
329 - letoh64() fallback
330*/
331
Christian Heimes985ecdc2013-11-20 11:46:18 +0100332/* byte swap little endian to host endian
333 * Endian conversion not only ensures that the hash function returns the same
334 * value on all platforms. It is also required to for a good dispersion of
335 * the hash values' least significant bits.
336 */
337#if PY_LITTLE_ENDIAN
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700338# define _le64toh(x) ((uint64_t)(x))
Christian Heimes985ecdc2013-11-20 11:46:18 +0100339#elif defined(__APPLE__)
340# define _le64toh(x) OSSwapLittleToHostInt64(x)
341#elif defined(HAVE_LETOH64)
342# define _le64toh(x) le64toh(x)
343#else
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700344# define _le64toh(x) (((uint64_t)(x) << 56) | \
345 (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
346 (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
347 (((uint64_t)(x) << 8) & 0xff00000000ULL) | \
348 (((uint64_t)(x) >> 8) & 0xff000000ULL) | \
349 (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
350 (((uint64_t)(x) >> 40) & 0xff00ULL) | \
351 ((uint64_t)(x) >> 56))
Christian Heimes985ecdc2013-11-20 11:46:18 +0100352#endif
353
354
355#ifdef _MSC_VER
356# define ROTATE(x, b) _rotl64(x, b)
357#else
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700358# define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
Christian Heimes985ecdc2013-11-20 11:46:18 +0100359#endif
360
361#define HALF_ROUND(a,b,c,d,s,t) \
362 a += b; c += d; \
363 b = ROTATE(b, s) ^ a; \
364 d = ROTATE(d, t) ^ c; \
365 a = ROTATE(a, 32);
366
367#define DOUBLE_ROUND(v0,v1,v2,v3) \
368 HALF_ROUND(v0,v1,v2,v3,13,16); \
369 HALF_ROUND(v2,v1,v0,v3,17,21); \
370 HALF_ROUND(v0,v1,v2,v3,13,16); \
371 HALF_ROUND(v2,v1,v0,v3,17,21);
372
373
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800374static uint64_t
Benjamin Peterson4e3e1562017-12-09 11:24:18 -0800375siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700376 uint64_t b = (uint64_t)src_sz << 56;
Andy Lestere6be9b52020-02-11 20:28:35 -0600377 const uint8_t *in = (const uint8_t*)src;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100378
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700379 uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
380 uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
381 uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
382 uint64_t v3 = k1 ^ 0x7465646279746573ULL;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100383
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700384 uint64_t t;
385 uint8_t *pt;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100386
387 while (src_sz >= 8) {
Rolf Eike Beer1e2ec8a2018-05-13 12:57:31 +0200388 uint64_t mi;
389 memcpy(&mi, in, sizeof(mi));
390 mi = _le64toh(mi);
391 in += sizeof(mi);
392 src_sz -= sizeof(mi);
Christian Heimes985ecdc2013-11-20 11:46:18 +0100393 v3 ^= mi;
394 DOUBLE_ROUND(v0,v1,v2,v3);
395 v0 ^= mi;
396 }
397
398 t = 0;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700399 pt = (uint8_t *)&t;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100400 switch (src_sz) {
Rolf Eike Beer1e2ec8a2018-05-13 12:57:31 +0200401 case 7: pt[6] = in[6]; /* fall through */
402 case 6: pt[5] = in[5]; /* fall through */
403 case 5: pt[4] = in[4]; /* fall through */
404 case 4: memcpy(pt, in, sizeof(uint32_t)); break;
405 case 3: pt[2] = in[2]; /* fall through */
406 case 2: pt[1] = in[1]; /* fall through */
407 case 1: pt[0] = in[0]; /* fall through */
Christian Heimes985ecdc2013-11-20 11:46:18 +0100408 }
409 b |= _le64toh(t);
410
411 v3 ^= b;
412 DOUBLE_ROUND(v0,v1,v2,v3);
413 v0 ^= b;
414 v2 ^= 0xff;
415 DOUBLE_ROUND(v0,v1,v2,v3);
416 DOUBLE_ROUND(v0,v1,v2,v3);
417
418 /* modified */
419 t = (v0 ^ v1) ^ (v2 ^ v3);
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800420 return t;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100421}
422
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800423uint64_t
424_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
425{
426 return siphash24(key, 0, src, src_sz);
427}
428
429
430#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
Batuhan Taşkaya1b215732020-04-05 00:25:12 +0300431static Py_hash_t
432pysiphash(const void *src, Py_ssize_t src_sz) {
433 return (Py_hash_t)siphash24(
434 _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
435 src, src_sz);
436}
437
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800438static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
439#endif
Christian Heimes985ecdc2013-11-20 11:46:18 +0100440
441#ifdef __cplusplus
442}
443#endif