Issue #19183: Implement PEP 456 'secure and interchangeable hash algorithm'. Python now uses SipHash24 on all major platforms.

commit: 985ecdcfc29adfc36ce2339acf03f819ad414869 [log] [tgz]
author: Christian Heimes <christian@cheimes.de> Wed Nov 20 11:46:18 2013 +0100
committer: Christian Heimes <christian@cheimes.de> Wed Nov 20 11:46:18 2013 +0100
tree: 06a11f82271e768dbe49469c8736b65b083f671c
parent: fe32aec25a8b36498d840bd69485e9bc94195b9c [diff]
diff --git a/Doc/library/sys.rst b/Doc/library/sys.rst
index 341764a..b885de8 100644
--- a/Doc/library/sys.rst
+++ b/Doc/library/sys.rst

@@ -594,9 +594,20 @@
    | :const:`imag`       | multiplier used for the imaginary part of a      |
    |                     | complex number                                   |
    +---------------------+--------------------------------------------------+
+   | :const:`algorithm`  | name of the algorithm for hashing of str, bytes, |
+   |                     | and memoryview                                   |
+   +---------------------+--------------------------------------------------+
+   | :const:`hash_bits`  | internal output size of the hash algorithm       |
+   +---------------------+--------------------------------------------------+
+   | :const:`seed_bits`  | size of the seed key of the hash algorithm       |
+   +---------------------+--------------------------------------------------+
+
 
    .. versionadded:: 3.2
 
+   .. versionchanged:: 3.4
+      Added *algorithm*, *hash_bits* and *seed_bits*
+
 
 .. data:: hexversion
 

diff --git a/Doc/license.rst b/Doc/license.rst
index 598ba86..5e6ed26 100644
--- a/Doc/license.rst
+++ b/Doc/license.rst

@@ -609,6 +609,35 @@
   http://creativecommons.org/publicdomain/zero/1.0/
 
 
+SipHash24
+---------
+
+The file :file:`Python/pyhash.c` contains Marek Majkowski's implementation of
+Dan Bernstein's SipHash24 algorithm. The file contains the following note::
+
+  <MIT License>
+  Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+  </MIT License>
+
+  Original location:
+     https://github.com/majek/csiphash/
+
+  Solution inspired by code from:
+     Samuel Neves (supercop/crypto_auth/siphash24/little)
+     djb (supercop/crypto_auth/siphash24/little2)
+     Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
+
+
 strtod and dtoa
 ---------------
 

diff --git a/Doc/whatsnew/3.4.rst b/Doc/whatsnew/3.4.rst
index f72f544..19b20bf 100644
--- a/Doc/whatsnew/3.4.rst
+++ b/Doc/whatsnew/3.4.rst

@@ -116,6 +116,7 @@
 
 * :ref:`PEP 442: Safe object finalization <pep-442>`
 * :ref:`PEP 445: Configurable memory allocators <pep-445>`
+* :pep:`456` Secure and interchangeable hash algorithm
 * Improve finalization of Python modules to avoid setting their globals
   to None, in most cases (:issue:`18214`).
 * A more efficient :mod:`marshal` format (:issue:`16475`).

diff --git a/Include/Python.h b/Include/Python.h
index a78a721..2dd8290 100644
--- a/Include/Python.h
+++ b/Include/Python.h

@@ -68,6 +68,7 @@
 #include "object.h"
 #include "objimpl.h"
 #include "typeslots.h"
+#include "pyhash.h"
 
 #include "pydebug.h"
 

diff --git a/Include/object.h b/Include/object.h
index a6130fe..8de2208 100644
--- a/Include/object.h
+++ b/Include/object.h

@@ -562,23 +562,6 @@
 PyAPI_FUNC(int) Py_ReprEnter(PyObject *);
 PyAPI_FUNC(void) Py_ReprLeave(PyObject *);
 
-/* Helpers for hash functions */
-#ifndef Py_LIMITED_API
-PyAPI_FUNC(Py_hash_t) _Py_HashDouble(double);
-PyAPI_FUNC(Py_hash_t) _Py_HashPointer(void*);
-PyAPI_FUNC(Py_hash_t) _Py_HashBytes(unsigned char*, Py_ssize_t);
-#endif
-
-typedef struct {
-    Py_hash_t prefix;
-    Py_hash_t suffix;
-} _Py_HashSecret_t;
-PyAPI_DATA(_Py_HashSecret_t) _Py_HashSecret;
-
-#ifdef Py_DEBUG
-PyAPI_DATA(int) _Py_HashSecret_Initialized;
-#endif
-
 /* Helper for passing objects to printf and the like */
 #define PyObject_REPR(obj) _PyUnicode_AsString(PyObject_Repr(obj))
 

diff --git a/Include/pyhash.h b/Include/pyhash.h
new file mode 100644
index 0000000..83c9281
--- /dev/null
+++ b/Include/pyhash.h

@@ -0,0 +1,147 @@
+#ifndef Py_HASH_H
+
+#define Py_HASH_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Helpers for hash functions */
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(Py_hash_t) _Py_HashDouble(double);
+PyAPI_FUNC(Py_hash_t) _Py_HashPointer(void*);
+PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t);
+#endif
+
+/* Prime multiplier used in string and various other hashes. */
+#define _PyHASH_MULTIPLIER 1000003UL  /* 0xf4243 */
+
+/* Parameters used for the numeric hash implementation.  See notes for
+   _Py_HashDouble in Python/pyhash.c.  Numeric hashes are based on
+   reduction modulo the prime 2**_PyHASH_BITS - 1. */
+
+#if SIZEOF_VOID_P >= 8
+#  define _PyHASH_BITS 61
+#else
+#  define _PyHASH_BITS 31
+#endif
+
+#define _PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1)
+#define _PyHASH_INF 314159
+#define _PyHASH_NAN 0
+#define _PyHASH_IMAG _PyHASH_MULTIPLIER
+
+
+/* hash secret
+ *
+ * memory layout on 64 bit systems
+ *   cccccccc cccccccc cccccccc  uc -- unsigned char[24]
+ *   pppppppp ssssssss ........  fnv -- two Py_hash_t
+ *   k0k0k0k0 k1k1k1k1 ........  siphash -- two PY_UINT64_T
+ *   ........ ........ ssssssss  djbx33a -- 16 bytes padding + one Py_hash_t
+ *   ........ ........ eeeeeeee  pyexpat XML hash salt
+ *
+ * memory layout on 32 bit systems
+ *   cccccccc cccccccc cccccccc  uc
+ *   ppppssss ........ ........  fnv -- two Py_hash_t
+ *   k0k0k0k0 k1k1k1k1 ........  siphash -- two PY_UINT64_T (*)
+ *   ........ ........ ssss....  djbx33a -- 16 bytes padding + one Py_hash_t
+ *   ........ ........ eeee....  pyexpat XML hash salt
+ *
+ * (*) The siphash member may not be available on 32 bit platforms without
+ *     an unsigned int64 data type.
+ */
+typedef union {
+    /* ensure 24 bytes */
+    unsigned char uc[24];
+    /* two Py_hash_t for FNV */
+    struct {
+        Py_hash_t prefix;
+        Py_hash_t suffix;
+    } fnv;
+#ifdef PY_UINT64_T
+    /* two uint64 for SipHash24 */
+    struct {
+        PY_UINT64_T k0;
+        PY_UINT64_T k1;
+    } siphash;
+#endif
+    /* a different (!) Py_hash_t for small string optimization */
+    struct {
+        unsigned char padding[16];
+        Py_hash_t suffix;
+    } djbx33a;
+    struct {
+        unsigned char padding[16];
+        Py_hash_t hashsalt;
+    } expat;
+} _Py_HashSecret_t;
+PyAPI_DATA(_Py_HashSecret_t) _Py_HashSecret;
+
+#ifdef Py_DEBUG
+PyAPI_DATA(int) _Py_HashSecret_Initialized;
+#endif
+
+
+/* hash function definition */
+#ifndef Py_LIMITED_API
+typedef struct {
+    Py_hash_t (*const hash)(const void *, Py_ssize_t);
+    const char *name;
+    const int hash_bits;
+    const int seed_bits;
+} PyHash_FuncDef;
+
+PyAPI_FUNC(PyHash_FuncDef*) PyHash_GetFuncDef(void);
+#endif
+
+
+/* cutoff for small string DJBX33A optimization in range [1, cutoff).
+ *
+ * About 50% of the strings in a typical Python application are smaller than
+ * 6 to 7 chars. However DJBX33A is vulnerable to hash collision attacks.
+ * NEVER use DJBX33A for long strings!
+ *
+ * A Py_HASH_CUTOFF of 0 disables small string optimization. 32 bit platforms
+ * should use a smaller cutoff because it is easier to create colliding
+ * strings. A cutoff of 7 on 64bit platforms and 5 on 32bit platforms should
+ * provide a decent safety margin.
+ */
+#ifndef Py_HASH_CUTOFF
+#  define Py_HASH_CUTOFF 0
+#elif (Py_HASH_CUTOFF > 7 || Py_HASH_CUTOFF < 0)
+#  error Py_HASH_CUTOFF must in range 0...7.
+#endif /* Py_HASH_CUTOFF */
+
+
+/* hash algorithm selection
+ *
+ * The values for Py_HASH_SIPHASH24 and Py_HASH_FNV are hard-coded in the
+ * configure script.
+ *
+ * - FNV is available on all platforms and architectures.
+ * - SIPHASH24 only works on platforms that provide PY_UINT64_T and don't
+ *   require aligned memory for integers.
+ * - With EXTERNAL, embedders can provide an alternative implementation with::
+ *
+ *     PyHash_FuncDef PyHash_Func = {...};
+ *
+ * XXX: Figure out __declspec() for extern PyHash_FuncDef.
+ */
+#define Py_HASH_EXTERNAL 0
+#define Py_HASH_SIPHASH24 1
+#define Py_HASH_FNV 2
+
+#ifndef Py_HASH_ALGORITHM
+#  if (defined(PY_UINT64_T) && defined(PY_UINT32_T) \
+       && !defined(HAVE_ALIGNED_REQUIRED))
+#    define Py_HASH_ALGORITHM Py_HASH_SIPHASH24
+#  else
+#    define Py_HASH_ALGORITHM Py_HASH_FNV
+#  endif /* uint64_t && uint32_t && aligned */
+#endif /* Py_HASH_ALGORITHM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !Py_HASH_H */

diff --git a/Include/pyport.h b/Include/pyport.h
index ca20b22..b6b426a 100644
--- a/Include/pyport.h
+++ b/Include/pyport.h

@@ -144,23 +144,6 @@
 #endif
 #endif
 
-/* Prime multiplier used in string and various other hashes. */
-#define _PyHASH_MULTIPLIER 1000003UL  /* 0xf4243 */
-
-/* Parameters used for the numeric hash implementation.  See notes for
-   _Py_HashDouble in Objects/object.c.  Numeric hashes are based on
-   reduction modulo the prime 2**_PyHASH_BITS - 1. */
-
-#if SIZEOF_VOID_P >= 8
-#define _PyHASH_BITS 61
-#else
-#define _PyHASH_BITS 31
-#endif
-#define _PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1)
-#define _PyHASH_INF 314159
-#define _PyHASH_NAN 0
-#define _PyHASH_IMAG _PyHASH_MULTIPLIER
-
 /* uintptr_t is the C9X name for an unsigned integral type such that a
  * legitimate void* can be cast to uintptr_t and then back to void* again
  * without loss of information.  Similarly for intptr_t, wrt a signed
@@ -199,8 +182,10 @@
 #endif
 
 /* Py_hash_t is the same size as a pointer. */
+#define SIZEOF_PY_HASH_T SIZEOF_SIZE_T
 typedef Py_ssize_t Py_hash_t;
 /* Py_uhash_t is the unsigned equivalent needed to calculate numeric hash. */
+#define SIZEOF_PY_UHASH_T SIZEOF_SIZE_T
 typedef size_t Py_uhash_t;
 
 /* Largest possible value of size_t.

diff --git a/Lib/test/regrtest.py b/Lib/test/regrtest.py
index a5d707e..c1c831f 100755
--- a/Lib/test/regrtest.py
+++ b/Lib/test/regrtest.py

@@ -601,6 +601,8 @@
         print("==", platform.python_implementation(), *sys.version.split())
         print("==  ", platform.platform(aliased=True),
                       "%s-endian" % sys.byteorder)
+        print("==  ", "hash algorithm:", sys.hash_info.algorithm,
+              "64bit" if sys.maxsize > 2**32 else "32bit")
         print("==  ", os.getcwd())
         print("Testing with flags:", sys.flags)
 

diff --git a/Lib/test/test_hash.py b/Lib/test/test_hash.py
index e3ab6e4..66e4155 100644
--- a/Lib/test/test_hash.py
+++ b/Lib/test/test_hash.py

@@ -12,6 +12,40 @@
 
 IS_64BIT = sys.maxsize > 2**32
 
+def lcg(x, length=16):
+    """Linear congruential generator"""
+    if x == 0:
+        return bytes(length)
+    out = bytearray(length)
+    for i in range(length):
+        x = (214013 * x + 2531011) & 0x7fffffff
+        out[i] = (x >> 16) & 0xff
+    return bytes(out)
+
+def pysiphash(uint64):
+    """Convert SipHash24 output to Py_hash_t
+    """
+    assert 0 <= uint64 < (1 << 64)
+    # simple unsigned to signed int64
+    if uint64 > (1 << 63) - 1:
+        int64 = uint64 - (1 << 64)
+    else:
+        int64 = uint64
+    # mangle uint64 to uint32
+    uint32 = (uint64 ^ uint64 >> 32) & 0xffffffff
+    # simple unsigned to signed int32
+    if uint32 > (1 << 31) - 1:
+        int32 = uint32 - (1 << 32)
+    else:
+        int32 = uint32
+    return int32, int64
+
+def skip_unless_internalhash(test):
+    """Skip decorator for tests that depend on SipHash24 or FNV"""
+    ok = sys.hash_info.algorithm in {"fnv", "siphash24"}
+    msg = "Requires SipHash24 or FNV"
+    return test if ok else unittest.skip(msg)(test)
+
 
 class HashEqualityTestCase(unittest.TestCase):
 
@@ -138,7 +172,7 @@
     # an object to be tested
 
     def get_hash_command(self, repr_):
-        return 'print(hash(%s))' % repr_
+        return 'print(hash(eval(%s.decode("utf-8"))))' % repr_.encode("utf-8")
 
     def get_hash(self, repr_, seed=None):
         env = os.environ.copy()
@@ -161,12 +195,67 @@
         self.assertNotEqual(run1, run2)
 
 class StringlikeHashRandomizationTests(HashRandomizationTests):
+    repr_ = None
+    repr_long = None
+
+    # 32bit little, 64bit little, 32bit big, 64bit big
+    known_hashes = {
+        'djba33x': [ # only used for small strings
+            # seed 0, 'abc'
+            [193485960, 193485960,  193485960, 193485960],
+            # seed 42, 'abc'
+            [-678966196, 573763426263223372, -820489388, -4282905804826039665],
+            ],
+        'siphash24': [
+            # seed 0, 'abc'
+            [2025351752, 4596069200710135518, 1433332804,
+             -3481057401533226760],
+            # seed 42, 'abc'
+            [-774632014, -4501618152524544106, 1054608210,
+             -1493500025205289231],
+            # seed 42, 'abcdefghijk'
+            [-1436007334, 4436719588892876975, -1436007334,
+             4436719588892876975],
+            # seed 0, 'äú∑ℇ', PyUCS2 layout depends on endianness
+            [1386693832, 5749986484189612790, 1776982909,
+             -5915111450199468540],
+            # seed 42, 'äú∑ℇ'
+            [1260387190, -2947981342227738144, 1430287772,
+             -4296699217652516017],
+        ],
+        'fnv': [
+            # seed 0, 'abc'
+            [-1600925533, 1453079729188098211, -1600925533,
+             1453079729188098211],
+            # seed 42, 'abc'
+            [-206076799, -4410911502303878509, -1024014457,
+             -3570150969479994130],
+            # seed 42, 'abcdefghijk'
+            [811136751, -5046230049376118746, -77208053 ,
+             -4779029615281019666],
+            # seed 0, 'äú∑ℇ'
+            [44402817, 8998297579845987431, -1956240331,
+             -782697888614047887],
+            # seed 42, 'äú∑ℇ'
+            [-283066365, -4576729883824601543, -271871407, None],
+        ]
+    }
+
+    def get_expected_hash(self, position, length):
+        if length < sys.hash_info.cutoff:
+            algorithm = "djba33x"
+        else:
+            algorithm = sys.hash_info.algorithm
+        if sys.byteorder == 'little':
+            platform = 1 if IS_64BIT else 0
+        else:
+            assert(sys.byteorder == 'big')
+            platform = 3 if IS_64BIT else 2
+        return self.known_hashes[algorithm][position][platform]
+
     def test_null_hash(self):
         # PYTHONHASHSEED=0 disables the randomized hash
-        if IS_64BIT:
-            known_hash_of_obj = 1453079729188098211
-        else:
-            known_hash_of_obj = -1600925533
+        known_hash_of_obj = self.get_expected_hash(0, 3)
 
         # Randomization is enabled by default:
         self.assertNotEqual(self.get_hash(self.repr_), known_hash_of_obj)
@@ -174,39 +263,53 @@
         # It can also be disabled by setting the seed to 0:
         self.assertEqual(self.get_hash(self.repr_, seed=0), known_hash_of_obj)
 
+    @skip_unless_internalhash
     def test_fixed_hash(self):
         # test a fixed seed for the randomized hash
         # Note that all types share the same values:
-        if IS_64BIT:
-            if sys.byteorder == 'little':
-                h = -4410911502303878509
-            else:
-                h = -3570150969479994130
-        else:
-            if sys.byteorder == 'little':
-                h = -206076799
-            else:
-                h = -1024014457
+        h = self.get_expected_hash(1, 3)
         self.assertEqual(self.get_hash(self.repr_, seed=42), h)
 
+    @skip_unless_internalhash
+    def test_long_fixed_hash(self):
+        if self.repr_long is None:
+            return
+        h = self.get_expected_hash(2, 11)
+        self.assertEqual(self.get_hash(self.repr_long, seed=42), h)
+
+
 class StrHashRandomizationTests(StringlikeHashRandomizationTests,
                                 unittest.TestCase):
     repr_ = repr('abc')
+    repr_long = repr('abcdefghijk')
+    repr_ucs2 = repr('äú∑ℇ')
 
+    @skip_unless_internalhash
     def test_empty_string(self):
         self.assertEqual(hash(""), 0)
 
+    @skip_unless_internalhash
+    def test_ucs2_string(self):
+        h = self.get_expected_hash(3, 6)
+        self.assertEqual(self.get_hash(self.repr_ucs2, seed=0), h)
+        h = self.get_expected_hash(4, 6)
+        self.assertEqual(self.get_hash(self.repr_ucs2, seed=42), h)
+
 class BytesHashRandomizationTests(StringlikeHashRandomizationTests,
                                   unittest.TestCase):
     repr_ = repr(b'abc')
+    repr_long = repr(b'abcdefghijk')
 
+    @skip_unless_internalhash
     def test_empty_string(self):
         self.assertEqual(hash(b""), 0)
 
 class MemoryviewHashRandomizationTests(StringlikeHashRandomizationTests,
                                        unittest.TestCase):
     repr_ = "memoryview(b'abc')"
+    repr_long = "memoryview(b'abcdefghijk')"
 
+    @skip_unless_internalhash
     def test_empty_string(self):
         self.assertEqual(hash(memoryview(b"")), 0)
 
@@ -224,5 +327,22 @@
     repr_ = repr(datetime.time(0))
 
 
+class HashDistributionTestCase(unittest.TestCase):
+
+    def test_hash_distribution(self):
+        # check for hash collision
+        base = "abcdefghabcdefg"
+        for i in range(1, len(base)):
+            prefix = base[:i]
+            s15 = set()
+            s255 = set()
+            for c in range(256):
+                h = hash(prefix + chr(c))
+                s15.add(h & 0xf)
+                s255.add(h & 0xff)
+            # SipHash24 distribution depends on key, usually > 60%
+            self.assertGreater(len(s15), 8, prefix)
+            self.assertGreater(len(s255), 128, prefix)
+
 if __name__ == "__main__":
     unittest.main()

diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index 0565f39..f0c7148 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py

@@ -8,6 +8,7 @@
 import codecs
 import gc
 import sysconfig
+import platform
 
 # count the number of test runs, used to create unique
 # strings to intern in test_intern()
@@ -431,7 +432,7 @@
         self.assertEqual(type(sys.int_info.sizeof_digit), int)
         self.assertIsInstance(sys.hexversion, int)
 
-        self.assertEqual(len(sys.hash_info), 5)
+        self.assertEqual(len(sys.hash_info), 9)
         self.assertLess(sys.hash_info.modulus, 2**sys.hash_info.width)
         # sys.hash_info.modulus should be a prime; we do a quick
         # probable primality test (doesn't exclude the possibility of
@@ -446,6 +447,26 @@
         self.assertIsInstance(sys.hash_info.inf, int)
         self.assertIsInstance(sys.hash_info.nan, int)
         self.assertIsInstance(sys.hash_info.imag, int)
+        algo = sysconfig.get_config_var("PY_HASH_ALGORITHM")
+        if sys.hash_info.algorithm in {"fnv", "siphash24"}:
+            self.assertIn(sys.hash_info.hash_bits, {32, 64})
+            self.assertIn(sys.hash_info.seed_bits, {32, 64, 128})
+
+            if algo == 1:
+                self.assertEqual(sys.hash_info.algorithm, "siphash24")
+            elif algo == 2:
+                self.assertEqual(sys.hash_info.algorithm, "fnv")
+            else:
+                processor = platform.processor().lower()
+                if processor in {"sparc", "mips"}:
+                    self.assertEqual(sys.hash_info.algorithm, "fnv")
+                else:
+                    self.assertEqual(sys.hash_info.algorithm, "siphash24")
+        else:
+            # PY_HASH_EXTERNAL
+            self.assertEqual(algo, 0)
+        self.assertGreaterEqual(sys.hash_info.cutoff, 0)
+        self.assertLess(sys.hash_info.cutoff, 8)
 
         self.assertIsInstance(sys.maxsize, int)
         self.assertIsInstance(sys.maxunicode, int)

diff --git a/Makefile.pre.in b/Makefile.pre.in
index eae8d75..33f6f86 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in

@@ -366,6 +366,7 @@
 		Python/pyarena.o \
 		Python/pyctype.o \
 		Python/pyfpe.o \
+		Python/pyhash.o \
 		Python/pymath.o \
 		Python/pystate.o \
 		Python/pythonrun.o \
@@ -868,6 +869,7 @@
 		$(srcdir)/Include/pydebug.h \
 		$(srcdir)/Include/pyerrors.h \
 		$(srcdir)/Include/pyfpe.h \
+		$(srcdir)/Include/pyhash.h \
 		$(srcdir)/Include/pymath.h \
 		$(srcdir)/Include/pygetopt.h \
 		$(srcdir)/Include/pymacro.h \

diff --git a/Misc/ACKS b/Misc/ACKS
index 436b9ac..8c89f16 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS

@@ -802,6 +802,7 @@
 Don MacMillen
 Tomasz Maćkowiak
 Steve Majewski
+Marek Majkowski
 Grzegorz Makarewicz
 David Malcolm
 Greg Malcolm

diff --git a/Misc/NEWS b/Misc/NEWS
index 619feb2..62f1c3b 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS

@@ -10,6 +10,9 @@
 Core and Builtins
 -----------------
 
+- Issue #19183: Implement PEP 456 'secure and interchangeable hash algorithm'.
+  Python now uses SipHash24 on all major platforms.
+
 - Issue #12892: The utf-16* and utf-32* encoders no longer allow surrogate code
   points (U+D800-U+DFFF) to be encoded.  The utf-32* decoders no longer decode
   byte sequences that correspond to surrogate code points.  The surrogatepass

diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
index 156dbf1..e11c153 100644
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c

@@ -1218,7 +1218,7 @@
      * has a backport of this feature where we also define XML_HAS_SET_HASH_SALT
      * to indicate that we can still use it. */
     XML_SetHashSalt(self->itself,
-                    (unsigned long)_Py_HashSecret.prefix);
+                    (unsigned long)_Py_HashSecret.expat.hashsalt);
 #endif
     XML_SetUserData(self->itself, (void *)self);
     XML_SetUnknownEncodingHandler(self->itself,

diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index efa0192..8217b1e 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c

@@ -897,7 +897,7 @@
 {
     if (a->ob_shash == -1) {
         /* Can't fail */
-        a->ob_shash = _Py_HashBytes((unsigned char *) a->ob_sval, Py_SIZE(a));
+        a->ob_shash = _Py_HashBytes(a->ob_sval, Py_SIZE(a));
     }
     return a->ob_shash;
 }

diff --git a/Objects/memoryobject.c b/Objects/memoryobject.c
index 1d52d9d..cb644b8 100644
--- a/Objects/memoryobject.c
+++ b/Objects/memoryobject.c

@@ -2742,7 +2742,7 @@
         }
 
         /* Can't fail */
-        self->hash = _Py_HashBytes((unsigned char *)mem, view->len);
+        self->hash = _Py_HashBytes(mem, view->len);
 
         if (mem != view->buf)
             PyMem_Free(mem);

diff --git a/Objects/object.c b/Objects/object.c
index acc34af..395e28d 100644
--- a/Objects/object.c
+++ b/Objects/object.c

@@ -731,150 +731,6 @@
     return ok;
 }
 
-/* Set of hash utility functions to help maintaining the invariant that
-    if a==b then hash(a)==hash(b)
-
-   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
-*/
-
-/* For numeric types, the hash of a number x is based on the reduction
-   of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
-   hash(x) == hash(y) whenever x and y are numerically equal, even if
-   x and y have different types.
-
-   A quick summary of the hashing strategy:
-
-   (1) First define the 'reduction of x modulo P' for any rational
-   number x; this is a standard extension of the usual notion of
-   reduction modulo P for integers.  If x == p/q (written in lowest
-   terms), the reduction is interpreted as the reduction of p times
-   the inverse of the reduction of q, all modulo P; if q is exactly
-   divisible by P then define the reduction to be infinity.  So we've
-   got a well-defined map
-
-      reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
-
-   (2) Now for a rational number x, define hash(x) by:
-
-      reduce(x)   if x >= 0
-      -reduce(-x) if x < 0
-
-   If the result of the reduction is infinity (this is impossible for
-   integers, floats and Decimals) then use the predefined hash value
-   _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
-   _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
-   hashes of float and Decimal infinities and nans.
-
-   A selling point for the above strategy is that it makes it possible
-   to compute hashes of decimal and binary floating-point numbers
-   efficiently, even if the exponent of the binary or decimal number
-   is large.  The key point is that
-
-      reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
-
-   provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
-   binary or decimal float is never infinity, since the denominator is a power
-   of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
-   for nonnegative x,
-
-      reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
-
-      reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
-
-   and reduce(10**e) can be computed efficiently by the usual modular
-   exponentiation algorithm.  For reduce(2**e) it's even better: since
-   P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
-   by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
-
-   */
-
-Py_hash_t
-_Py_HashDouble(double v)
-{
-    int e, sign;
-    double m;
-    Py_uhash_t x, y;
-
-    if (!Py_IS_FINITE(v)) {
-        if (Py_IS_INFINITY(v))
-            return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
-        else
-            return _PyHASH_NAN;
-    }
-
-    m = frexp(v, &e);
-
-    sign = 1;
-    if (m < 0) {
-        sign = -1;
-        m = -m;
-    }
-
-    /* process 28 bits at a time;  this should work well both for binary
-       and hexadecimal floating point. */
-    x = 0;
-    while (m) {
-        x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
-        m *= 268435456.0;  /* 2**28 */
-        e -= 28;
-        y = (Py_uhash_t)m;  /* pull out integer part */
-        m -= y;
-        x += y;
-        if (x >= _PyHASH_MODULUS)
-            x -= _PyHASH_MODULUS;
-    }
-
-    /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
-    e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
-    x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
-
-    x = x * sign;
-    if (x == (Py_uhash_t)-1)
-        x = (Py_uhash_t)-2;
-    return (Py_hash_t)x;
-}
-
-Py_hash_t
-_Py_HashPointer(void *p)
-{
-    Py_hash_t x;
-    size_t y = (size_t)p;
-    /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
-       excessive hash collisions for dicts and sets */
-    y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
-    x = (Py_hash_t)y;
-    if (x == -1)
-        x = -2;
-    return x;
-}
-
-Py_hash_t
-_Py_HashBytes(unsigned char *p, Py_ssize_t len)
-{
-    Py_uhash_t x;
-    Py_ssize_t i;
-
-    /*
-      We make the hash of the empty string be 0, rather than using
-      (prefix ^ suffix), since this slightly obfuscates the hash secret
-    */
-#ifdef Py_DEBUG
-    assert(_Py_HashSecret_Initialized);
-#endif
-    if (len == 0) {
-        return 0;
-    }
-    x = (Py_uhash_t) _Py_HashSecret.prefix;
-    x ^= (Py_uhash_t) *p << 7;
-    for (i = 0; i < len; i++)
-        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
-    x ^= (Py_uhash_t) len;
-    x ^= (Py_uhash_t) _Py_HashSecret.suffix;
-    if (x == -1)
-        x = -2;
-    return x;
-}
-
 Py_hash_t
 PyObject_HashNotImplemented(PyObject *v)
 {
@@ -883,8 +739,6 @@
     return -1;
 }
 
-_Py_HashSecret_t _Py_HashSecret;
-
 Py_hash_t
 PyObject_Hash(PyObject *v)
 {

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 880889e..3644db3 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c

@@ -11386,39 +11386,8 @@
         _PyUnicode_HASH(self) = 0;
         return 0;
     }
-
-    /* The hash function as a macro, gets expanded three times below. */
-#define HASH(P)                                            \
-    x ^= (Py_uhash_t) *P << 7;                             \
-    while (--len >= 0)                                     \
-        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;  \
-
-    x = (Py_uhash_t) _Py_HashSecret.prefix;
-    switch (PyUnicode_KIND(self)) {
-    case PyUnicode_1BYTE_KIND: {
-        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
-        HASH(c);
-        break;
-    }
-    case PyUnicode_2BYTE_KIND: {
-        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
-        HASH(s);
-        break;
-    }
-    default: {
-        Py_UCS4 *l;
-        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
-               "Impossible switch case in unicode_hash");
-        l = PyUnicode_4BYTE_DATA(self);
-        HASH(l);
-        break;
-    }
-    }
-    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
-    x ^= (Py_uhash_t) _Py_HashSecret.suffix;
-
-    if (x == -1)
-        x = -2;
+    x = _Py_HashBytes(PyUnicode_DATA(self),
+                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
     _PyUnicode_HASH(self) = x;
     return x;
 }

diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index 1b6e864..c2e1eb3 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj

@@ -412,6 +412,7 @@
     <ClInclude Include="..\Include\patchlevel.h" />
     <ClInclude Include="..\Include\pgen.h" />
     <ClInclude Include="..\Include\pgenheaders.h" />
+    <ClInclude Include="..\Include\pyhash.h" />
     <ClInclude Include="..\Include\py_curses.h" />
     <ClInclude Include="..\Include\pyarena.h" />
     <ClInclude Include="..\Include\pycapsule.h" />
@@ -616,6 +617,7 @@
     <ClCompile Include="..\PC\dl_nt.c" />
     <ClCompile Include="..\PC\getpathp.c" />
     <ClCompile Include="..\PC\msvcrtmodule.c" />
+    <ClCompile Include="..\Python\pyhash.c" />
     <ClCompile Include="..\Python\random.c" />
     <ClCompile Include="..\Python\_warnings.c" />
     <ClCompile Include="..\Python\asdl.c" />

diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index e1a9301..41fe1b2 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters

@@ -421,6 +421,9 @@
     <ClInclude Include="..\Python\ceval_gil.h">
       <Filter>Python</Filter>
     </ClInclude>
+    <ClInclude Include="..\Include\pyhash.h">
+      <Filter>Include</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\Modules\_bisectmodule.c">
@@ -931,6 +934,9 @@
     <ClCompile Include="..\Modules\_stat.c">
       <Filter>Modules</Filter>
     </ClCompile>
+    <ClCompile Include="..\Python\pyhash.c">
+      <Filter>Python</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ResourceCompile Include="..\PC\python_nt.rc">

diff --git a/Python/pyhash.c b/Python/pyhash.c
new file mode 100644
index 0000000..158c631
--- /dev/null
+++ b/Python/pyhash.c

@@ -0,0 +1,430 @@
+/* Set of hash utility functions to help maintaining the invariant that
+    if a==b then hash(a)==hash(b)
+
+   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
+*/
+#include "Python.h"
+
+#ifdef __APPLE__
+#  include <libkern/OSByteOrder.h>
+#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
+#  include <endian.h>
+#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
+#  include <sys/endian.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Process-wide hash secret.  The code below reads it through the
+   algorithm-specific members .djbx33a, .fnv and .siphash. */
+_Py_HashSecret_t _Py_HashSecret;
+
+/* Descriptor of the active str/bytes hash function.  With
+   Py_HASH_EXTERNAL it is only declared here (the definition is presumably
+   supplied by another translation unit); otherwise it is defined further
+   down in this file by the selected algorithm. */
+#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
+extern PyHash_FuncDef PyHash_Func;
+#else
+static PyHash_FuncDef PyHash_Func;
+#endif
+
+/* Count _Py_HashBytes() calls per input length.  Slots
+   1..Py_HASH_STATS_MAX count inputs of exactly that length; slot 0
+   collects all longer inputs. */
+#ifdef Py_HASH_STATS
+#define Py_HASH_STATS_MAX 32
+static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
+#endif
+
+/* For numeric types, the hash of a number x is based on the reduction
+   of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
+   hash(x) == hash(y) whenever x and y are numerically equal, even if
+   x and y have different types.
+
+   A quick summary of the hashing strategy:
+
+   (1) First define the 'reduction of x modulo P' for any rational
+   number x; this is a standard extension of the usual notion of
+   reduction modulo P for integers.  If x == p/q (written in lowest
+   terms), the reduction is interpreted as the reduction of p times
+   the inverse of the reduction of q, all modulo P; if q is exactly
+   divisible by P then define the reduction to be infinity.  So we've
+   got a well-defined map
+
+      reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
+
+   (2) Now for a rational number x, define hash(x) by:
+
+      reduce(x)   if x >= 0
+      -reduce(-x) if x < 0
+
+   If the result of the reduction is infinity (this is impossible for
+   integers, floats and Decimals) then use the predefined hash value
+   _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
+   _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
+   hashes of float and Decimal infinities and nans.
+
+   A selling point for the above strategy is that it makes it possible
+   to compute hashes of decimal and binary floating-point numbers
+   efficiently, even if the exponent of the binary or decimal number
+   is large.  The key point is that
+
+      reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
+
+   provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
+   binary or decimal float is never infinity, since the denominator is a power
+   of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
+   for nonnegative x,
+
+      reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
+
+      reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
+
+   and reduce(10**e) can be computed efficiently by the usual modular
+   exponentiation algorithm.  For reduce(2**e) it's even better: since
+   P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
+   by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
+
+   */
+
+/* Hash a C double following the numeric hashing strategy described in the
+   comment above.
+
+   Infinities map to +/-_PyHASH_INF and NaN maps to _PyHASH_NAN.  For finite
+   values the mantissa is folded into x 28 bits at a time modulo
+   _PyHASH_MODULUS, then the binary exponent is applied as a bit rotation
+   and the sign is restored.  A result of -1 is replaced by -2 because -1
+   is reserved to signal an error. */
+Py_hash_t
+_Py_HashDouble(double v)
+{
+    int e, sign;
+    double m;
+    Py_uhash_t x, y;
+
+    if (!Py_IS_FINITE(v)) {
+        if (Py_IS_INFINITY(v))
+            return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
+        else
+            return _PyHASH_NAN;
+    }
+
+    /* split v into mantissa m and binary exponent e, v == m * 2**e */
+    m = frexp(v, &e);
+
+    /* work on the magnitude; the sign is re-applied at the end */
+    sign = 1;
+    if (m < 0) {
+        sign = -1;
+        m = -m;
+    }
+
+    /* process 28 bits at a time;  this should work well both for binary
+       and hexadecimal floating point. */
+    x = 0;
+    while (m) {
+        /* rotate x left by 28 bits within the _PyHASH_BITS-bit field */
+        x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
+        m *= 268435456.0;  /* 2**28 */
+        e -= 28;
+        y = (Py_uhash_t)m;  /* pull out integer part */
+        m -= y;
+        x += y;
+        /* keep x reduced modulo _PyHASH_MODULUS */
+        if (x >= _PyHASH_MODULUS)
+            x -= _PyHASH_MODULUS;
+    }
+
+    /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
+    e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
+    /* rotate x left by e bits within the _PyHASH_BITS-bit field */
+    x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
+
+    /* unsigned multiply: sign == -1 yields the two's complement negation */
+    x = x * sign;
+    if (x == (Py_uhash_t)-1)
+        x = (Py_uhash_t)-2;
+    return (Py_hash_t)x;
+}
+
+/* Hash a pointer by its address value.
+
+   The address is converted to size_t and rotated right by 4 bits before
+   being returned as a Py_hash_t.  A result of -1 is replaced by -2 because
+   -1 is reserved to signal an error. */
+Py_hash_t
+_Py_HashPointer(void *p)
+{
+    Py_hash_t x;
+    size_t y = (size_t)p;
+    /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
+       excessive hash collisions for dicts and sets */
+    y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
+    x = (Py_hash_t)y;
+    if (x == -1)
+        x = -2;
+    return x;
+}
+
+/* Hash len bytes starting at src.
+
+   Empty input hashes to 0.  When Py_HASH_CUTOFF > 0, inputs shorter than
+   the cutoff use an inline DJBX33A variant that is mixed with the length
+   and _Py_HashSecret.djbx33a.suffix; everything else is delegated to
+   PyHash_Func.hash.  A result of -1 is replaced by -2 because -1 is
+   reserved to signal an error. */
+Py_hash_t
+_Py_HashBytes(const void *src, Py_ssize_t len)
+{
+    Py_hash_t x;
+    /*
+      We make the hash of the empty string be 0, rather than using
+      (prefix ^ suffix), since this slightly obfuscates the hash secret
+    */
+    if (len == 0) {
+        return 0;
+    }
+
+#ifdef Py_HASH_STATS
+    /* lengths above Py_HASH_STATS_MAX are all counted in slot 0 */
+    hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
+#endif
+
+#if Py_HASH_CUTOFF > 0
+    if (len < Py_HASH_CUTOFF) {
+        /* Optimize hashing of very small strings with inline DJBX33A. */
+        Py_uhash_t hash;
+        const unsigned char *p = src;
+        hash = 5381; /* DJBX33A starts with 5381 */
+
+        /* Enter at the case matching len; each case mixes in one byte and
+           falls through, so exactly len bytes are consumed.
+           NOTE(review): only lengths 1..7 are handled, so this presumably
+           relies on Py_HASH_CUTOFF <= 8 -- confirm against pyhash.h. */
+        switch(len) {
+            /* ((hash << 5) + hash) + *p == hash * 33 + *p */
+            case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
+            case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
+            case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
+            case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
+            case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
+            case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
+            case 1: hash = ((hash << 5) + hash) + *p++; break;
+            default:
+                assert(0);
+        }
+        hash ^= len;
+        hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
+        x = (Py_hash_t)hash;
+    }
+    else
+#endif /* Py_HASH_CUTOFF */
+        /* configured algorithm; this is the only path when the cutoff is
+           disabled, otherwise it is the body of the 'else' above */
+        x = PyHash_Func.hash(src, len);
+
+    if (x == -1)
+        return -2;
+    return x;
+}
+
+/* Finalization hook for the hash module.
+
+   When built with Py_HASH_STATS, prints to stderr one row per input length
+   1..Py_HASH_STATS_MAX with the number of _Py_HashBytes() calls and a
+   running total, followed by a ">" row for the slot-0 bucket (inputs longer
+   than Py_HASH_STATS_MAX).  Without Py_HASH_STATS it does nothing. */
+void
+_PyHash_Fini(void)
+{
+#ifdef Py_HASH_STATS
+    int i;
+    Py_ssize_t total = 0;
+    char *fmt = "%2i %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n";
+
+    fprintf(stderr, "len   calls    total\n");
+    for (i = 1; i <= Py_HASH_STATS_MAX; i++) {
+        total += hashstats[i];
+        fprintf(stderr, fmt, i, hashstats[i], total);
+    }
+    /* slot 0 holds the overflow bucket, reported last */
+    total += hashstats[0];
+    fprintf(stderr, ">  %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n",
+            hashstats[0], total);
+#endif
+}
+
+/* Return the address of PyHash_Func, the descriptor of the hash function
+   that _Py_HashBytes() dispatches to.  The returned pointer refers to a
+   file-scope (or external) object, not to freshly allocated memory. */
+PyHash_FuncDef *
+PyHash_GetFuncDef(void)
+{
+    return &PyHash_Func;
+}
+
+/* PY_UHASH_CPY(dst, src): copy SIZEOF_PY_UHASH_T bytes from src to dst.
+ *
+ * Optimized memcpy() for Windows: with MSVC the copy is spelled out byte by
+ * byte (dst and src are indexed, and therefore evaluated, several times);
+ * everywhere else it is a plain memcpy().
+ */
+#ifdef _MSC_VER
+#  if SIZEOF_PY_UHASH_T == 4
+#    define PY_UHASH_CPY(dst, src) do {                                    \
+       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
+       } while(0)
+#  elif SIZEOF_PY_UHASH_T == 8
+#    define PY_UHASH_CPY(dst, src) do {                                    \
+       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
+       dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
+       } while(0)
+#  else
+#    error SIZEOF_PY_UHASH_T must be 4 or 8
+#  endif /* SIZEOF_PY_UHASH_T */
+#else /* not Windows */
+#  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
+#endif /* _MSC_VER */
+
+
+#if Py_HASH_ALGORITHM == Py_HASH_FNV
+/* **************************************************************************
+ * Modified Fowler-Noll-Vo (FNV) hash function
+ */
+/* Hash len bytes at src, keyed with _Py_HashSecret.fnv.prefix/.suffix.
+ *
+ * The input is split into whole Py_uhash_t-sized blocks, mixed in one block
+ * at a time, followed by a tail of 1..SIZEOF_PY_UHASH_T bytes mixed in byte
+ * by byte.  The length and the suffix are XORed in at the end, and a result
+ * of -1 is replaced by -2.
+ *
+ * NOTE(review): len == 0 is not handled here (*p is read unconditionally
+ * and blocks would become negative).  _Py_HashBytes() in this file returns
+ * early for empty input before dispatching to the hash function.
+ */
+static Py_hash_t
+fnv(const void *src, Py_ssize_t len)
+{
+    const unsigned char *p = src;
+    Py_uhash_t x;
+    Py_ssize_t remainder, blocks;
+    /* lets a block be filled bytewise and then read as one Py_uhash_t */
+    union {
+        Py_uhash_t value;
+        unsigned char bytes[SIZEOF_PY_UHASH_T];
+    } block;
+
+#ifdef Py_DEBUG
+    assert(_Py_HashSecret_Initialized);
+#endif
+    remainder = len % SIZEOF_PY_UHASH_T;
+    if (remainder == 0) {
+        /* Process at least one block byte by byte to reduce hash collisions
+         * for strings with common prefixes. */
+        remainder = SIZEOF_PY_UHASH_T;
+    }
+    blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
+
+    x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
+    /* seed with the first input byte */
+    x ^= (Py_uhash_t) *p << 7;
+    while (blocks--) {
+        PY_UHASH_CPY(block.bytes, p);
+        x = (_PyHASH_MULTIPLIER * x) ^ block.value;
+        p += SIZEOF_PY_UHASH_T;
+    }
+    /* add remainder */
+    for (; remainder > 0; remainder--)
+        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
+    x ^= (Py_uhash_t) len;
+    x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
+    /* x is unsigned, so -1 and -2 here are converted to Py_uhash_t */
+    if (x == -1) {
+        x = -2;
+    }
+    return x;
+}
+
+static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
+                                     16 * SIZEOF_PY_HASH_T};
+
+#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
+
+
+#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
+/* **************************************************************************
+ <MIT License>
+ Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ </MIT License>
+
+ Original location:
+    https://github.com/majek/csiphash/
+
+ Solution inspired by code from:
+    Samuel Neves (supercop/crypto_auth/siphash24/little)
+    djb (supercop/crypto_auth/siphash24/little2)
+    Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
+
+ Modified for Python by Christian Heimes:
+    - C89 / MSVC compatibility
+    - PY_UINT64_T, PY_UINT32_T and PY_UINT8_T
+    - _rotl64() on Windows
+    - letoh64() fallback
+*/
+
+typedef unsigned char PY_UINT8_T;
+
+/* byte swap little endian to host endian
+ * Endian conversion not only ensures that the hash function returns the same
+ * value on all platforms. It is also required for a good dispersion of
+ * the hash values' least significant bits.
+ *
+ * The le64toh() branch is keyed on HAVE_LE64TOH, the same macro that guards
+ * the <endian.h> / <sys/endian.h> includes at the top of this file, so the
+ * function is only referenced when its header has actually been included.
+ * NOTE(review): the configure probe for le64toh() defines HAVE_HTOLE64; it
+ * has to define HAVE_LE64TOH for this branch (and those includes) to become
+ * active.  Until then big-endian non-Apple builds use the portable fallback.
+ */
+#if PY_LITTLE_ENDIAN
+#  define _le64toh(x) ((PY_UINT64_T)(x))
+#elif defined(__APPLE__)
+#  define _le64toh(x) OSSwapLittleToHostInt64(x)
+#elif defined(HAVE_LE64TOH)
+#  define _le64toh(x) le64toh(x)
+#else
+#  define _le64toh(x) (((PY_UINT64_T)(x) << 56) | \
+                      (((PY_UINT64_T)(x) << 40) & 0xff000000000000ULL) | \
+                      (((PY_UINT64_T)(x) << 24) & 0xff0000000000ULL) | \
+                      (((PY_UINT64_T)(x) << 8)  & 0xff00000000ULL) | \
+                      (((PY_UINT64_T)(x) >> 8)  & 0xff000000ULL) | \
+                      (((PY_UINT64_T)(x) >> 24) & 0xff0000ULL) | \
+                      (((PY_UINT64_T)(x) >> 40) & 0xff00ULL) | \
+                      ((PY_UINT64_T)(x)  >> 56))
+#endif
+
+
+/* ROTATE(x, b): rotate the 64-bit value x left by b bits. */
+#ifdef _MSC_VER
+#  define ROTATE(x, b)  _rotl64(x, b)
+#else
+#  define ROTATE(x, b) (PY_UINT64_T)( ((x) << (b)) | ( (x) >> (64 - (b))) )
+#endif
+
+/* HALF_ROUND(a,b,c,d,s,t): one half of a mixing round on the state words
+   a, b, c, d with rotation amounts s and t.  Expands to several statements
+   (not wrapped in do/while) and modifies a, b, c and d in place. */
+#define HALF_ROUND(a,b,c,d,s,t)         \
+    a += b; c += d;             \
+    b = ROTATE(b, s) ^ a;           \
+    d = ROTATE(d, t) ^ c;           \
+    a = ROTATE(a, 32);
+
+/* DOUBLE_ROUND(v0,v1,v2,v3): four HALF_ROUND steps, i.e. two full rounds,
+   applied to the state words v0..v3 in place. */
+#define DOUBLE_ROUND(v0,v1,v2,v3)       \
+    HALF_ROUND(v0,v1,v2,v3,13,16);      \
+    HALF_ROUND(v2,v1,v0,v3,17,21);      \
+    HALF_ROUND(v0,v1,v2,v3,13,16);      \
+    HALF_ROUND(v2,v1,v0,v3,17,21);
+
+
+/* SipHash-2-4 of the src_sz bytes at src, keyed with
+ * _Py_HashSecret.siphash.k0 / .k1.
+ *
+ * The input is consumed as little-endian 64-bit words.  The final word
+ * holds the 0..7 trailing bytes, with the low byte of the length placed in
+ * its most significant byte.
+ *
+ * src may point anywhere inside a larger buffer, so it is not guaranteed to
+ * be suitably aligned for PY_UINT64_T / PY_UINT32_T.  All multi-byte loads
+ * therefore go through memcpy() instead of dereferencing a cast pointer;
+ * this avoids misaligned accesses and strict-aliasing violations while
+ * producing exactly the same hash value.
+ *
+ * Returns the XOR of the four state words; on builds with 4-byte pointers
+ * the upper half is additionally folded into the lower half before the
+ * value is truncated to Py_hash_t.
+ */
+static Py_hash_t
+siphash24(const void *src, Py_ssize_t src_sz) {
+    PY_UINT64_T k0 = _le64toh(_Py_HashSecret.siphash.k0);
+    PY_UINT64_T k1 = _le64toh(_Py_HashSecret.siphash.k1);
+    PY_UINT64_T b = (PY_UINT64_T)src_sz << 56;
+    const PY_UINT8_T *in = (const PY_UINT8_T *)src;
+
+    PY_UINT64_T v0 = k0 ^ 0x736f6d6570736575ULL;
+    PY_UINT64_T v1 = k1 ^ 0x646f72616e646f6dULL;
+    PY_UINT64_T v2 = k0 ^ 0x6c7967656e657261ULL;
+    PY_UINT64_T v3 = k1 ^ 0x7465646279746573ULL;
+
+    PY_UINT64_T t;
+    PY_UINT8_T *pt;
+
+    /* compression: one 8-byte word per iteration */
+    while (src_sz >= 8) {
+        PY_UINT64_T mi;
+        memcpy(&mi, in, sizeof(mi));
+        mi = _le64toh(mi);
+        in += sizeof(mi);
+        src_sz -= 8;
+        v3 ^= mi;
+        DOUBLE_ROUND(v0,v1,v2,v3);
+        v0 ^= mi;
+    }
+
+    /* gather the remaining 0..7 bytes into the low bytes of t */
+    t = 0;
+    pt = (PY_UINT8_T *)&t;
+    switch (src_sz) {
+        case 7: pt[6] = in[6]; /* fall through */
+        case 6: pt[5] = in[5]; /* fall through */
+        case 5: pt[4] = in[4]; /* fall through */
+        case 4: memcpy(pt, in, sizeof(PY_UINT32_T)); break;
+        case 3: pt[2] = in[2]; /* fall through */
+        case 2: pt[1] = in[1]; /* fall through */
+        case 1: pt[0] = in[0]; break;
+        default: break;  /* no trailing bytes */
+    }
+    b |= _le64toh(t);
+
+    /* last block, then finalization */
+    v3 ^= b;
+    DOUBLE_ROUND(v0,v1,v2,v3);
+    v0 ^= b;
+    v2 ^= 0xff;
+    DOUBLE_ROUND(v0,v1,v2,v3);
+    DOUBLE_ROUND(v0,v1,v2,v3);
+
+    /* modified */
+    t = (v0 ^ v1) ^ (v2 ^ v3);
+#if SIZEOF_VOID_P == 4
+    t ^= (t >> 32);
+#endif
+    return (Py_hash_t)t;
+}
+
+static PyHash_FuncDef PyHash_Func = {siphash24, "siphash24", 64, 128};
+
+#endif /* Py_HASH_ALGORITHM == Py_HASH_SIPHASH24 */
+
+#ifdef __cplusplus
+}
+#endif

diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index e427be3..b5d57df 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c

@@ -104,6 +104,7 @@
 extern void PyLong_Fini(void);
 extern int _PyFaulthandler_Init(void);
 extern void _PyFaulthandler_Fini(void);
+extern void _PyHash_Fini(void);
 
 #ifdef WITH_THREAD
 extern void _PyGILState_Init(PyInterpreterState *, PyThreadState *);
@@ -650,6 +651,8 @@
 #ifdef COUNT_ALLOCS
     dump_counts(stdout);
 #endif
+    /* dump hash stats */
+    _PyHash_Fini();
 
     PRINT_TOTAL_REFS();
 

diff --git a/Python/random.c b/Python/random.c
index d9c7e77..de8e9e7 100644
--- a/Python/random.c
+++ b/Python/random.c

@@ -95,7 +95,7 @@
 /* Read size bytes from /dev/urandom into buffer.
    Call Py_FatalError() on error. */
 static void
-dev_urandom_noraise(char *buffer, Py_ssize_t size)
+dev_urandom_noraise(unsigned char *buffer, Py_ssize_t size)
 {
     int fd;
     Py_ssize_t n;
@@ -249,8 +249,9 @@
 _PyRandom_Init(void)
 {
     char *env;
-    void *secret = &_Py_HashSecret;
+    unsigned char *secret = (unsigned char *)&_Py_HashSecret.uc;
     Py_ssize_t secret_size = sizeof(_Py_HashSecret_t);
+    assert(secret_size == sizeof(_Py_HashSecret.uc));
 
     if (_Py_HashSecret_Initialized)
         return;
@@ -278,17 +279,17 @@
             memset(secret, 0, secret_size);
         }
         else {
-            lcg_urandom(seed, (unsigned char*)secret, secret_size);
+            lcg_urandom(seed, secret, secret_size);
         }
     }
     else {
 #ifdef MS_WINDOWS
-        (void)win32_urandom((unsigned char *)secret, secret_size, 0);
+        (void)win32_urandom(secret, secret_size, 0);
 #else /* #ifdef MS_WINDOWS */
 # ifdef __VMS
-        vms_urandom((unsigned char *)secret, secret_size, 0);
+        vms_urandom(secret, secret_size, 0);
 # else
-        dev_urandom_noraise((char*)secret, secret_size);
+        dev_urandom_noraise(secret, secret_size);
 # endif
 #endif
     }

diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 35a0671..4028a01 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c

@@ -658,7 +658,7 @@
 "hash_info\n\
 \n\
 A struct sequence providing parameters used for computing\n\
-numeric hashes.  The attributes are read only.");
+hashes. The attributes are read only.");
 
 static PyStructSequence_Field hash_info_fields[] = {
     {"width", "width of the type used for hashing, in bits"},
@@ -667,6 +667,11 @@
     {"inf", "value to be used for hash of a positive infinity"},
     {"nan", "value to be used for hash of a nan"},
     {"imag", "multiplier used for the imaginary part of a complex number"},
+    {"algorithm", "name of the algorithm for hashing of str, bytes and "
+                  "memoryviews"},
+    {"hash_bits", "internal output size of hash algorithm"},
+    {"seed_bits", "seed size of hash algorithm"},
+    {"cutoff", "small string optimization cutoff"},
     {NULL, NULL}
 };
 
@@ -674,7 +679,7 @@
     "sys.hash_info",
     hash_info_doc,
     hash_info_fields,
-    5,
+    9,
 };
 
 static PyObject *
@@ -682,9 +687,11 @@
 {
     PyObject *hash_info;
     int field = 0;
+    PyHash_FuncDef *hashfunc;
     hash_info = PyStructSequence_New(&Hash_InfoType);
     if (hash_info == NULL)
         return NULL;
+    hashfunc = PyHash_GetFuncDef();
     PyStructSequence_SET_ITEM(hash_info, field++,
                               PyLong_FromLong(8*sizeof(Py_hash_t)));
     PyStructSequence_SET_ITEM(hash_info, field++,
@@ -695,6 +702,14 @@
                               PyLong_FromLong(_PyHASH_NAN));
     PyStructSequence_SET_ITEM(hash_info, field++,
                               PyLong_FromLong(_PyHASH_IMAG));
+    PyStructSequence_SET_ITEM(hash_info, field++,
+                              PyUnicode_FromString(hashfunc->name));
+    PyStructSequence_SET_ITEM(hash_info, field++,
+                              PyLong_FromLong(hashfunc->hash_bits));
+    PyStructSequence_SET_ITEM(hash_info, field++,
+                              PyLong_FromLong(hashfunc->seed_bits));
+    PyStructSequence_SET_ITEM(hash_info, field++,
+                              PyLong_FromLong(Py_HASH_CUTOFF));
     if (PyErr_Occurred()) {
         Py_CLEAR(hash_info);
         return NULL;
@@ -1338,6 +1353,7 @@
 executable -- absolute path of the executable binary of the Python interpreter\n\
 float_info -- a struct sequence with information about the float implementation.\n\
 float_repr_style -- string indicating the style of repr() output for floats\n\
+hash_info -- a struct sequence with information about the hash algorithm.\n\
 hexversion -- version information encoded as a single integer\n\
 implementation -- Python implementation information.\n\
 int_info -- a struct sequence with information about the int implementation.\n\

diff --git a/configure b/configure
index 225e8a4..cb01b74 100755
--- a/configure
+++ b/configure

@@ -792,6 +792,7 @@
 enable_shared
 enable_profiling
 with_pydebug
+with_hash_algorithm
 with_libs
 with_system_expat
 with_system_ffi
@@ -1465,6 +1466,8 @@
                           compiler
   --with-suffix=.exe      set executable suffix
   --with-pydebug          build with Py_DEBUG defined
+  --with-hash-algorithm=[fnv|siphash24]
+                          select hash algorithm
   --with-libs='lib1 ...'  link against additional libs
   --with-system-expat     build pyexpat module using an installed expat
                           library
@@ -6956,7 +6959,8 @@
 sys/stat.h sys/syscall.h sys/sys_domain.h sys/termio.h sys/time.h \
 sys/times.h sys/types.h sys/uio.h sys/un.h sys/utsname.h sys/wait.h pty.h \
 libutil.h sys/resource.h netpacket/packet.h sysexits.h bluetooth.h \
-bluetooth/bluetooth.h linux/tipc.h spawn.h util.h alloca.h
+bluetooth/bluetooth.h linux/tipc.h spawn.h util.h alloca.h endian.h \
+sys/endian.h
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
@@ -7330,6 +7334,43 @@
 
 fi
 
+# byte swapping
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for le64toh" >&5
+$as_echo_n "checking for le64toh... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+#ifdef HAVE_ENDIAN_H
+#include <endian.h>
+#elif defined(HAVE_SYS_ENDIAN_H)
+#include <sys/endian.h>
+#endif
+
+int
+main ()
+{
+
+   le64toh(1)
+  ;
+  return 0;
+}
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_has_le64toh=yes
+else
+  ac_cv_has_le64toh=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_has_le64toh" >&5
+$as_echo "$ac_cv_has_le64toh" >&6; }
+if test "$ac_cv_has_le64toh" = "yes"; then
+
+$as_echo "#define HAVE_HTOLE64 1" >>confdefs.h
+
+fi
+
 # Enabling LFS on Solaris (2.6 to 9) with gcc 2.95 triggers a bug in
 # the system headers: If _XOPEN_SOURCE and _LARGEFILE_SOURCE are
 # defined, but the compiler does not support pragma redefine_extname,
@@ -8987,6 +9028,79 @@
 	*) ;;
 esac
 
+# check for systems that require aligned memory access
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking aligned memory access is required" >&5
+$as_echo_n "checking aligned memory access is required... " >&6; }
+if test "$cross_compiling" = yes; then :
+  aligned_required=yes
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int main()
+{
+    char s[16];
+    int i, *p1, *p2;
+    for (i=0; i < 16; i++)
+        s[i] = i;
+    p1 = (int*)(s+1);
+    p2 = (int*)(s+2);
+    if (*p1 == *p2)
+        return 1;
+    return 0;
+}
+
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+  aligned_required=no
+else
+  aligned_required=yes
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+
+if test "$aligned_required" = yes ; then
+
+$as_echo "#define HAVE_ALIGNED_REQUIRED 1" >>confdefs.h
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $aligned_required" >&5
+$as_echo "$aligned_required" >&6; }
+
+
+# str, bytes and memoryview hash algorithm
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-hash-algorithm" >&5
+$as_echo_n "checking for --with-hash-algorithm... " >&6; }
+
+# Check whether --with-hash_algorithm was given.
+if test "${with_hash_algorithm+set}" = set; then :
+  withval=$with_hash_algorithm;
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $withval" >&5
+$as_echo "$withval" >&6; }
+case "$withval" in
+    siphash24)
+        $as_echo "#define Py_HASH_ALGORITHM 1" >>confdefs.h
+
+        ;;
+    fnv)
+        $as_echo "#define Py_HASH_ALGORITHM 2" >>confdefs.h
+
+        ;;
+    *)
+        as_fn_error $? "unknown hash algorithm '$withval'" "$LINENO" 5
+        ;;
+esac
+
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: default" >&5
+$as_echo "default" >&6; }
+fi
+
+
 # Most SVR4 platforms (e.g. Solaris) need -lsocket and -lnsl.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for t_open in -lnsl" >&5
 $as_echo_n "checking for t_open in -lnsl... " >&6; }

diff --git a/configure.ac b/configure.ac
index 369c7e5..5ba7d03 100644
--- a/configure.ac
+++ b/configure.ac

@@ -1543,7 +1543,8 @@
 sys/stat.h sys/syscall.h sys/sys_domain.h sys/termio.h sys/time.h \
 sys/times.h sys/types.h sys/uio.h sys/un.h sys/utsname.h sys/wait.h pty.h \
 libutil.h sys/resource.h netpacket/packet.h sysexits.h bluetooth.h \
-bluetooth/bluetooth.h linux/tipc.h spawn.h util.h alloca.h)
+bluetooth/bluetooth.h linux/tipc.h spawn.h util.h alloca.h endian.h \
+sys/endian.h)
 CPPFLAGS=$ac_save_cppflags
 AC_HEADER_DIRENT
 AC_HEADER_MAJOR
@@ -1614,6 +1615,22 @@
     AC_DEFINE(HAVE_MAKEDEV, 1, [Define this if you have the makedev macro.])
 fi
 
+# byte swapping
+dnl Link-test for le64toh() using whichever of <endian.h> / <sys/endian.h>
+dnl was detected by the header checks above.
+dnl NOTE(review): this defines HAVE_HTOLE64, but Python/pyhash.c guards its
+dnl <endian.h> / <sys/endian.h> includes with HAVE_LE64TOH, so the result of
+dnl this probe does not reach that code.  The macro names need to agree.
+AC_MSG_CHECKING(for le64toh)
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+#ifdef HAVE_ENDIAN_H
+#include <endian.h>
+#elif defined(HAVE_SYS_ENDIAN_H)
+#include <sys/endian.h>
+#endif
+]], [[
+   le64toh(1) ]])
+],[ac_cv_has_le64toh=yes],[ac_cv_has_le64toh=no])
+AC_MSG_RESULT($ac_cv_has_le64toh)
+if test "$ac_cv_has_le64toh" = "yes"; then
+    AC_DEFINE(HAVE_HTOLE64, 1, [Define this if you have le64toh()])
+fi
+
 # Enabling LFS on Solaris (2.6 to 9) with gcc 2.95 triggers a bug in
 # the system headers: If _XOPEN_SOURCE and _LARGEFILE_SOURCE are
 # defined, but the compiler does not support pragma redefine_extname,
@@ -2229,6 +2246,59 @@
 	*) ;;
 esac
 
+# check for systems that require aligned memory access
+AC_MSG_CHECKING(aligned memory access is required)
+AC_TRY_RUN([
+int main()
+{
+    char s[16];
+    int i, *p1, *p2;
+    for (i=0; i < 16; i++)
+        s[i] = i;
+    p1 = (int*)(s+1);
+    p2 = (int*)(s+2);
+    if (*p1 == *p2)
+        return 1;
+    return 0;
+}
+              ],
+     [aligned_required=no],
+     [aligned_required=yes],
+     [aligned_required=yes])
+
+if test "$aligned_required" = yes ; then
+  AC_DEFINE([HAVE_ALIGNED_REQUIRED], [1],
+    [Define if aligned memory access is required])
+fi
+AC_MSG_RESULT($aligned_required)
+
+
+# str, bytes and memoryview hash algorithm
+AH_TEMPLATE(Py_HASH_ALGORITHM,
+  [Define hash algorithm for str, bytes and memoryview.
+   SipHash24: 1, FNV: 2, externally defined: 0])
+
+AC_MSG_CHECKING(for --with-hash-algorithm)
+dnl quadrigraphs "@<:@" and "@:>@" produce "[" and "]" in the output
+AC_ARG_WITH(hash_algorithm,
+            AS_HELP_STRING([--with-hash-algorithm=@<:@fnv|siphash24@:>@],
+                           [select hash algorithm]),
+[
+AC_MSG_RESULT($withval)
+case "$withval" in
+    siphash24)
+        AC_DEFINE(Py_HASH_ALGORITHM, 1)
+        ;;
+    fnv)
+        AC_DEFINE(Py_HASH_ALGORITHM, 2)
+        ;;
+    *)
+        AC_MSG_ERROR([unknown hash algorithm '$withval'])
+        ;;
+esac
+],
+[AC_MSG_RESULT(default)])
+
 # Most SVR4 platforms (e.g. Solaris) need -lsocket and -lnsl.
 AC_CHECK_LIB(nsl, t_open, [LIBS="-lnsl $LIBS"]) # SVR4
 AC_CHECK_LIB(socket, socket, [LIBS="-lsocket $LIBS"], [], $LIBS) # SVR4 sockets

diff --git a/pyconfig.h.in b/pyconfig.h.in
index 13979fc..29e1bfa 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in

@@ -49,6 +49,9 @@
 /* Define to 1 if you have the `alarm' function. */
 #undef HAVE_ALARM
 
+/* Define if aligned memory access is required */
+#undef HAVE_ALIGNED_REQUIRED
+
 /* Define to 1 if you have the <alloca.h> header file. */
 #undef HAVE_ALLOCA_H
 
@@ -199,6 +202,9 @@
 /* Defined when any dynamic module loading is enabled. */
 #undef HAVE_DYNAMIC_LOADING
 
+/* Define to 1 if you have the <endian.h> header file. */
+#undef HAVE_ENDIAN_H
+
 /* Define if you have the 'epoll' functions. */
 #undef HAVE_EPOLL
 
@@ -408,6 +414,9 @@
 /* Define if you have the 'hstrerror' function. */
 #undef HAVE_HSTRERROR
 
+/* Define this if you have le64toh() */
+#undef HAVE_HTOLE64
+
 /* Define to 1 if you have the `hypot' function. */
 #undef HAVE_HYPOT
 
@@ -927,6 +936,9 @@
    */
 #undef HAVE_SYS_DIR_H
 
+/* Define to 1 if you have the <sys/endian.h> header file. */
+#undef HAVE_SYS_ENDIAN_H
+
 /* Define to 1 if you have the <sys/epoll.h> header file. */
 #undef HAVE_SYS_EPOLL_H
 
@@ -1193,6 +1205,10 @@
 /* Defined if Python is built as a shared library. */
 #undef Py_ENABLE_SHARED
 
+/* Define hash algorithm for str, bytes and memoryview. SipHash24: 1, FNV: 2,
+   externally defined: 0 */
+#undef Py_HASH_ALGORITHM
+
 /* assume C89 semantics that RETSIGTYPE is always void */
 #undef RETSIGTYPE
commit	985ecdcfc29adfc36ce2339acf03f819ad414869	[log] [tgz]
author	Christian Heimes <christian@cheimes.de>	Wed Nov 20 11:46:18 2013 +0100
committer	Christian Heimes <christian@cheimes.de>	Wed Nov 20 11:46:18 2013 +0100
tree	06a11f82271e768dbe49469c8736b65b083f671c
parent	fe32aec25a8b36498d840bd69485e9bc94195b9c [diff]