blob: 4c0b929586fc16e59f6917dfe6c9c1c858299275 [file] [log] [blame]
Christian Heimes985ecdc2013-11-20 11:46:18 +01001/* Set of hash utility functions to help maintaining the invariant that
2 if a==b then hash(a)==hash(b)
3
4 All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5*/
6#include "Python.h"
7
8#ifdef __APPLE__
9# include <libkern/OSByteOrder.h>
10#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
11# include <endian.h>
12#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
13# include <sys/endian.h>
14#endif
15
16#ifdef __cplusplus
17extern "C" {
18#endif
19
Serhiy Storchakabfe4fd52018-02-09 17:31:26 +020020_Py_HashSecret_t _Py_HashSecret = {{0}};
Christian Heimes985ecdc2013-11-20 11:46:18 +010021
22#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
23extern PyHash_FuncDef PyHash_Func;
24#else
25static PyHash_FuncDef PyHash_Func;
26#endif
27
28/* Count _Py_HashBytes() calls */
29#ifdef Py_HASH_STATS
30#define Py_HASH_STATS_MAX 32
31static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
32#endif
33
34/* For numeric types, the hash of a number x is based on the reduction
35 of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
36 hash(x) == hash(y) whenever x and y are numerically equal, even if
37 x and y have different types.
38
39 A quick summary of the hashing strategy:
40
41 (1) First define the 'reduction of x modulo P' for any rational
42 number x; this is a standard extension of the usual notion of
43 reduction modulo P for integers. If x == p/q (written in lowest
44 terms), the reduction is interpreted as the reduction of p times
45 the inverse of the reduction of q, all modulo P; if q is exactly
46 divisible by P then define the reduction to be infinity. So we've
47 got a well-defined map
48
49 reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
50
51 (2) Now for a rational number x, define hash(x) by:
52
53 reduce(x) if x >= 0
54 -reduce(-x) if x < 0
55
56 If the result of the reduction is infinity (this is impossible for
57 integers, floats and Decimals) then use the predefined hash value
58 _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
59 _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
60 hashes of float and Decimal infinities and nans.
61
62 A selling point for the above strategy is that it makes it possible
63 to compute hashes of decimal and binary floating-point numbers
64 efficiently, even if the exponent of the binary or decimal number
65 is large. The key point is that
66
67 reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
68
69 provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
70 binary or decimal float is never infinity, since the denominator is a power
71 of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
72 for nonnegative x,
73
74 reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
75
76 reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
77
78 and reduce(10**e) can be computed efficiently by the usual modular
79 exponentiation algorithm. For reduce(2**e) it's even better: since
80 P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
81 by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
82
83 */
84
85Py_hash_t
86_Py_HashDouble(double v)
87{
88 int e, sign;
89 double m;
90 Py_uhash_t x, y;
91
92 if (!Py_IS_FINITE(v)) {
93 if (Py_IS_INFINITY(v))
94 return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
95 else
96 return _PyHASH_NAN;
97 }
98
99 m = frexp(v, &e);
100
101 sign = 1;
102 if (m < 0) {
103 sign = -1;
104 m = -m;
105 }
106
107 /* process 28 bits at a time; this should work well both for binary
108 and hexadecimal floating point. */
109 x = 0;
110 while (m) {
111 x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
112 m *= 268435456.0; /* 2**28 */
113 e -= 28;
114 y = (Py_uhash_t)m; /* pull out integer part */
115 m -= y;
116 x += y;
117 if (x >= _PyHASH_MODULUS)
118 x -= _PyHASH_MODULUS;
119 }
120
121 /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
122 e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
123 x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
124
125 x = x * sign;
126 if (x == (Py_uhash_t)-1)
127 x = (Py_uhash_t)-2;
128 return (Py_hash_t)x;
129}
130
131Py_hash_t
132_Py_HashPointer(void *p)
133{
134 Py_hash_t x;
135 size_t y = (size_t)p;
136 /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
137 excessive hash collisions for dicts and sets */
138 y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
139 x = (Py_hash_t)y;
140 if (x == -1)
141 x = -2;
142 return x;
143}
144
145Py_hash_t
146_Py_HashBytes(const void *src, Py_ssize_t len)
147{
148 Py_hash_t x;
149 /*
150 We make the hash of the empty string be 0, rather than using
151 (prefix ^ suffix), since this slightly obfuscates the hash secret
152 */
153 if (len == 0) {
154 return 0;
155 }
156
157#ifdef Py_HASH_STATS
158 hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
159#endif
160
161#if Py_HASH_CUTOFF > 0
162 if (len < Py_HASH_CUTOFF) {
163 /* Optimize hashing of very small strings with inline DJBX33A. */
164 Py_uhash_t hash;
165 const unsigned char *p = src;
166 hash = 5381; /* DJBX33A starts with 5381 */
167
168 switch(len) {
169 /* ((hash << 5) + hash) + *p == hash * 33 + *p */
170 case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
171 case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
172 case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
173 case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
174 case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
175 case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
176 case 1: hash = ((hash << 5) + hash) + *p++; break;
177 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700178 Py_UNREACHABLE();
Christian Heimes985ecdc2013-11-20 11:46:18 +0100179 }
180 hash ^= len;
181 hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
182 x = (Py_hash_t)hash;
183 }
184 else
185#endif /* Py_HASH_CUTOFF */
186 x = PyHash_Func.hash(src, len);
187
188 if (x == -1)
189 return -2;
190 return x;
191}
192
193void
194_PyHash_Fini(void)
195{
196#ifdef Py_HASH_STATS
197 int i;
198 Py_ssize_t total = 0;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +0200199 const char *fmt = "%2i %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n";
Christian Heimes985ecdc2013-11-20 11:46:18 +0100200
201 fprintf(stderr, "len calls total\n");
202 for (i = 1; i <= Py_HASH_STATS_MAX; i++) {
203 total += hashstats[i];
204 fprintf(stderr, fmt, i, hashstats[i], total);
205 }
206 total += hashstats[0];
207 fprintf(stderr, "> %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n",
208 hashstats[0], total);
209#endif
210}
211
212PyHash_FuncDef *
213PyHash_GetFuncDef(void)
214{
215 return &PyHash_Func;
216}
217
218/* Optimized memcpy() for Windows */
219#ifdef _MSC_VER
220# if SIZEOF_PY_UHASH_T == 4
221# define PY_UHASH_CPY(dst, src) do { \
222 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
223 } while(0)
224# elif SIZEOF_PY_UHASH_T == 8
225# define PY_UHASH_CPY(dst, src) do { \
226 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
227 dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
228 } while(0)
229# else
230# error SIZEOF_PY_UHASH_T must be 4 or 8
231# endif /* SIZEOF_PY_UHASH_T */
232#else /* not Windows */
233# define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
234#endif /* _MSC_VER */
235
236
237#if Py_HASH_ALGORITHM == Py_HASH_FNV
238/* **************************************************************************
239 * Modified Fowler-Noll-Vo (FNV) hash function
240 */
241static Py_hash_t
242fnv(const void *src, Py_ssize_t len)
243{
244 const unsigned char *p = src;
245 Py_uhash_t x;
246 Py_ssize_t remainder, blocks;
247 union {
248 Py_uhash_t value;
249 unsigned char bytes[SIZEOF_PY_UHASH_T];
250 } block;
251
252#ifdef Py_DEBUG
253 assert(_Py_HashSecret_Initialized);
254#endif
255 remainder = len % SIZEOF_PY_UHASH_T;
256 if (remainder == 0) {
257 /* Process at least one block byte by byte to reduce hash collisions
258 * for strings with common prefixes. */
259 remainder = SIZEOF_PY_UHASH_T;
260 }
261 blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
262
263 x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
264 x ^= (Py_uhash_t) *p << 7;
265 while (blocks--) {
266 PY_UHASH_CPY(block.bytes, p);
267 x = (_PyHASH_MULTIPLIER * x) ^ block.value;
268 p += SIZEOF_PY_UHASH_T;
269 }
270 /* add remainder */
271 for (; remainder > 0; remainder--)
272 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
273 x ^= (Py_uhash_t) len;
274 x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
A. Jesse Jiryu Davisa8eb5852018-06-04 06:57:08 -0400275 if (x == (Py_uhash_t) -1) {
276 x = (Py_uhash_t) -2;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100277 }
278 return x;
279}
280
281static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
282 16 * SIZEOF_PY_HASH_T};
283
284#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
285
286
Christian Heimes985ecdc2013-11-20 11:46:18 +0100287/* **************************************************************************
288 <MIT License>
289 Copyright (c) 2013 Marek Majkowski <marek@popcount.org>
290
291 Permission is hereby granted, free of charge, to any person obtaining a copy
292 of this software and associated documentation files (the "Software"), to deal
293 in the Software without restriction, including without limitation the rights
294 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
295 copies of the Software, and to permit persons to whom the Software is
296 furnished to do so, subject to the following conditions:
297
298 The above copyright notice and this permission notice shall be included in
299 all copies or substantial portions of the Software.
300
301 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
302 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
303 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
304 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
305 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
306 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
307 THE SOFTWARE.
308 </MIT License>
309
310 Original location:
311 https://github.com/majek/csiphash/
312
313 Solution inspired by code from:
314 Samuel Neves (supercop/crypto_auth/siphash24/little)
315 djb (supercop/crypto_auth/siphash24/little2)
316 Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
317
318 Modified for Python by Christian Heimes:
319 - C89 / MSVC compatibility
Christian Heimes985ecdc2013-11-20 11:46:18 +0100320 - _rotl64() on Windows
321 - letoh64() fallback
322*/
323
Christian Heimes985ecdc2013-11-20 11:46:18 +0100324/* byte swap little endian to host endian
325 * Endian conversion not only ensures that the hash function returns the same
326 * value on all platforms. It is also required to for a good dispersion of
327 * the hash values' least significant bits.
328 */
329#if PY_LITTLE_ENDIAN
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700330# define _le64toh(x) ((uint64_t)(x))
Christian Heimes985ecdc2013-11-20 11:46:18 +0100331#elif defined(__APPLE__)
332# define _le64toh(x) OSSwapLittleToHostInt64(x)
333#elif defined(HAVE_LETOH64)
334# define _le64toh(x) le64toh(x)
335#else
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700336# define _le64toh(x) (((uint64_t)(x) << 56) | \
337 (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
338 (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
339 (((uint64_t)(x) << 8) & 0xff00000000ULL) | \
340 (((uint64_t)(x) >> 8) & 0xff000000ULL) | \
341 (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
342 (((uint64_t)(x) >> 40) & 0xff00ULL) | \
343 ((uint64_t)(x) >> 56))
Christian Heimes985ecdc2013-11-20 11:46:18 +0100344#endif
345
346
347#ifdef _MSC_VER
348# define ROTATE(x, b) _rotl64(x, b)
349#else
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700350# define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
Christian Heimes985ecdc2013-11-20 11:46:18 +0100351#endif
352
353#define HALF_ROUND(a,b,c,d,s,t) \
354 a += b; c += d; \
355 b = ROTATE(b, s) ^ a; \
356 d = ROTATE(d, t) ^ c; \
357 a = ROTATE(a, 32);
358
359#define DOUBLE_ROUND(v0,v1,v2,v3) \
360 HALF_ROUND(v0,v1,v2,v3,13,16); \
361 HALF_ROUND(v2,v1,v0,v3,17,21); \
362 HALF_ROUND(v0,v1,v2,v3,13,16); \
363 HALF_ROUND(v2,v1,v0,v3,17,21);
364
365
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800366static uint64_t
Benjamin Peterson4e3e1562017-12-09 11:24:18 -0800367siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700368 uint64_t b = (uint64_t)src_sz << 56;
Rolf Eike Beer1e2ec8a2018-05-13 12:57:31 +0200369 const uint8_t *in = (uint8_t*)src;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100370
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700371 uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
372 uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
373 uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
374 uint64_t v3 = k1 ^ 0x7465646279746573ULL;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100375
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700376 uint64_t t;
377 uint8_t *pt;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100378
379 while (src_sz >= 8) {
Rolf Eike Beer1e2ec8a2018-05-13 12:57:31 +0200380 uint64_t mi;
381 memcpy(&mi, in, sizeof(mi));
382 mi = _le64toh(mi);
383 in += sizeof(mi);
384 src_sz -= sizeof(mi);
Christian Heimes985ecdc2013-11-20 11:46:18 +0100385 v3 ^= mi;
386 DOUBLE_ROUND(v0,v1,v2,v3);
387 v0 ^= mi;
388 }
389
390 t = 0;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -0700391 pt = (uint8_t *)&t;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100392 switch (src_sz) {
Rolf Eike Beer1e2ec8a2018-05-13 12:57:31 +0200393 case 7: pt[6] = in[6]; /* fall through */
394 case 6: pt[5] = in[5]; /* fall through */
395 case 5: pt[4] = in[4]; /* fall through */
396 case 4: memcpy(pt, in, sizeof(uint32_t)); break;
397 case 3: pt[2] = in[2]; /* fall through */
398 case 2: pt[1] = in[1]; /* fall through */
399 case 1: pt[0] = in[0]; /* fall through */
Christian Heimes985ecdc2013-11-20 11:46:18 +0100400 }
401 b |= _le64toh(t);
402
403 v3 ^= b;
404 DOUBLE_ROUND(v0,v1,v2,v3);
405 v0 ^= b;
406 v2 ^= 0xff;
407 DOUBLE_ROUND(v0,v1,v2,v3);
408 DOUBLE_ROUND(v0,v1,v2,v3);
409
410 /* modified */
411 t = (v0 ^ v1) ^ (v2 ^ v3);
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800412 return t;
Christian Heimes985ecdc2013-11-20 11:46:18 +0100413}
414
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800415static Py_hash_t
416pysiphash(const void *src, Py_ssize_t src_sz) {
417 return (Py_hash_t)siphash24(
Benjamin Peterson60ed1302017-12-09 13:11:39 -0800418 _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800419 src, src_sz);
420}
Christian Heimes985ecdc2013-11-20 11:46:18 +0100421
Benjamin Peterson42aa93b2017-12-09 10:26:52 -0800422uint64_t
423_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
424{
425 return siphash24(key, 0, src, src_sz);
426}
427
428
429#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
430static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
431#endif
Christian Heimes985ecdc2013-11-20 11:46:18 +0100432
433#ifdef __cplusplus
434}
435#endif