bpo-29882: Add _Py_popcount32() function (GH-20518)
* Rename pycore_byteswap.h to pycore_bitutils.h.
* Move popcount_digit() to pycore_bitutils.h as _Py_popcount32().
* _Py_popcount32() uses GCC and clang builtin function if available.
* Add unit tests to _Py_popcount32().
diff --git a/Include/internal/pycore_bitutils.h b/Include/internal/pycore_bitutils.h
new file mode 100644
index 0000000..36ffe23
--- /dev/null
+++ b/Include/internal/pycore_bitutils.h
@@ -0,0 +1,138 @@
+/* Bit and bytes utilities.
+
+ Bytes swap functions, reverse order of bytes:
+
+ - _Py_bswap16(uint16_t)
+ - _Py_bswap32(uint32_t)
+ - _Py_bswap64(uint64_t)
+*/
+
+#ifndef Py_INTERNAL_BSWAP_H
+#define Py_INTERNAL_BSWAP_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#if defined(__clang__) || \
+ (defined(__GNUC__) && \
+ ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)))
+ /* __builtin_bswap16() is available since GCC 4.8,
+ __builtin_bswap32() is available since GCC 4.3,
+ __builtin_bswap64() is available since GCC 4.3. */
+# define _PY_HAVE_BUILTIN_BSWAP
+#endif
+
+#ifdef _MSC_VER
+ /* Get _byteswap_ushort(), _byteswap_ulong(), _byteswap_uint64() */
+# include <intrin.h>
+#endif
+
+static inline uint16_t
+_Py_bswap16(uint16_t word)
+{
+#ifdef _PY_HAVE_BUILTIN_BSWAP
+ return __builtin_bswap16(word);
+#elif defined(_MSC_VER)
+ Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned short));
+ return _byteswap_ushort(word);
+#else
+ // Portable implementation which doesn't rely on circular bit shift
+ return ( ((word & UINT16_C(0x00FF)) << 8)
+ | ((word & UINT16_C(0xFF00)) >> 8));
+#endif
+}
+
+static inline uint32_t
+_Py_bswap32(uint32_t word)
+{
+#ifdef _PY_HAVE_BUILTIN_BSWAP
+ return __builtin_bswap32(word);
+#elif defined(_MSC_VER)
+ Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned long));
+ return _byteswap_ulong(word);
+#else
+ // Portable implementation which doesn't rely on circular bit shift
+ return ( ((word & UINT32_C(0x000000FF)) << 24)
+ | ((word & UINT32_C(0x0000FF00)) << 8)
+ | ((word & UINT32_C(0x00FF0000)) >> 8)
+ | ((word & UINT32_C(0xFF000000)) >> 24));
+#endif
+}
+
+static inline uint64_t
+_Py_bswap64(uint64_t word)
+{
+#ifdef _PY_HAVE_BUILTIN_BSWAP
+ return __builtin_bswap64(word);
+#elif defined(_MSC_VER)
+ return _byteswap_uint64(word);
+#else
+ // Portable implementation which doesn't rely on circular bit shift
+ return ( ((word & UINT64_C(0x00000000000000FF)) << 56)
+ | ((word & UINT64_C(0x000000000000FF00)) << 40)
+ | ((word & UINT64_C(0x0000000000FF0000)) << 24)
+ | ((word & UINT64_C(0x00000000FF000000)) << 8)
+ | ((word & UINT64_C(0x000000FF00000000)) >> 8)
+ | ((word & UINT64_C(0x0000FF0000000000)) >> 24)
+ | ((word & UINT64_C(0x00FF000000000000)) >> 40)
+ | ((word & UINT64_C(0xFF00000000000000)) >> 56));
+#endif
+}
+
+
+// Population count: count the number of 1's in 'x'
+// (number of bits set to 1), also known as the hamming weight.
+//
+// Implementation note. CPUID is not used, to test if x86 POPCNT instruction
+// can be used, to keep the implementation simple. For example, Visual Studio
+// __popcnt() is not used this reason. The clang and GCC builtin function can
+// use the x86 POPCNT instruction if the target architecture has SSE4a or
+// newer.
+static inline int
+_Py_popcount32(uint32_t x)
+{
+#if (defined(__clang__) || defined(__GNUC__))
+
+#if SIZEOF_INT >= 4
+ Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));
+ return __builtin_popcount(x);
+#else
+ // The C standard guarantees that unsigned long will always be big enough
+ // to hold a uint32_t value without losing information.
+ Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));
+ return __builtin_popcountl(x);
+#endif
+
+#else
+ // 32-bit SWAR (SIMD Within A Register) popcount
+
+ // Binary: 0 1 0 1 ...
+ const uint32_t M1 = 0x55555555;
+ // Binary: 00 11 00 11. ..
+ const uint32_t M2 = 0x33333333;
+ // Binary: 0000 1111 0000 1111 ...
+ const uint32_t M4 = 0x0F0F0F0F;
+ // 256**4 + 256**3 + 256**2 + 256**1
+ const uint32_t SUM = 0x01010101;
+
+ // Put count of each 2 bits into those 2 bits
+ x = x - ((x >> 1) & M1);
+ // Put count of each 4 bits into those 4 bits
+ x = (x & M2) + ((x >> 2) & M2);
+ // Put count of each 8 bits into those 8 bits
+ x = (x + (x >> 4)) & M4;
+ // Sum of the 4 byte counts
+ return (uint32_t)((uint64_t)x * (uint64_t)SUM) >> 24;
+#endif
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_BSWAP_H */
+
diff --git a/Include/internal/pycore_byteswap.h b/Include/internal/pycore_byteswap.h
deleted file mode 100644
index 5e64704..0000000
--- a/Include/internal/pycore_byteswap.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Bytes swap functions, reverse order of bytes:
-
- - _Py_bswap16(uint16_t)
- - _Py_bswap32(uint32_t)
- - _Py_bswap64(uint64_t)
-*/
-
-#ifndef Py_INTERNAL_BSWAP_H
-#define Py_INTERNAL_BSWAP_H
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef Py_BUILD_CORE
-# error "this header requires Py_BUILD_CORE define"
-#endif
-
-#if defined(__clang__) || \
- (defined(__GNUC__) && \
- ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)))
- /* __builtin_bswap16() is available since GCC 4.8,
- __builtin_bswap32() is available since GCC 4.3,
- __builtin_bswap64() is available since GCC 4.3. */
-# define _PY_HAVE_BUILTIN_BSWAP
-#endif
-
-#ifdef _MSC_VER
- /* Get _byteswap_ushort(), _byteswap_ulong(), _byteswap_uint64() */
-# include <intrin.h>
-#endif
-
-static inline uint16_t
-_Py_bswap16(uint16_t word)
-{
-#ifdef _PY_HAVE_BUILTIN_BSWAP
- return __builtin_bswap16(word);
-#elif defined(_MSC_VER)
- Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned short));
- return _byteswap_ushort(word);
-#else
- // Portable implementation which doesn't rely on circular bit shift
- return ( ((word & UINT16_C(0x00FF)) << 8)
- | ((word & UINT16_C(0xFF00)) >> 8));
-#endif
-}
-
-static inline uint32_t
-_Py_bswap32(uint32_t word)
-{
-#ifdef _PY_HAVE_BUILTIN_BSWAP
- return __builtin_bswap32(word);
-#elif defined(_MSC_VER)
- Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned long));
- return _byteswap_ulong(word);
-#else
- // Portable implementation which doesn't rely on circular bit shift
- return ( ((word & UINT32_C(0x000000FF)) << 24)
- | ((word & UINT32_C(0x0000FF00)) << 8)
- | ((word & UINT32_C(0x00FF0000)) >> 8)
- | ((word & UINT32_C(0xFF000000)) >> 24));
-#endif
-}
-
-static inline uint64_t
-_Py_bswap64(uint64_t word)
-{
-#ifdef _PY_HAVE_BUILTIN_BSWAP
- return __builtin_bswap64(word);
-#elif defined(_MSC_VER)
- return _byteswap_uint64(word);
-#else
- // Portable implementation which doesn't rely on circular bit shift
- return ( ((word & UINT64_C(0x00000000000000FF)) << 56)
- | ((word & UINT64_C(0x000000000000FF00)) << 40)
- | ((word & UINT64_C(0x0000000000FF0000)) << 24)
- | ((word & UINT64_C(0x00000000FF000000)) << 8)
- | ((word & UINT64_C(0x000000FF00000000)) >> 8)
- | ((word & UINT64_C(0x0000FF0000000000)) >> 24)
- | ((word & UINT64_C(0x00FF000000000000)) >> 40)
- | ((word & UINT64_C(0xFF00000000000000)) >> 56));
-#endif
-}
-
-
-#ifdef __cplusplus
-}
-#endif
-#endif /* !Py_INTERNAL_BSWAP_H */
-
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 5a18704..b115e7f 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -1121,7 +1121,7 @@
$(srcdir)/Include/internal/pycore_abstract.h \
$(srcdir)/Include/internal/pycore_accu.h \
$(srcdir)/Include/internal/pycore_atomic.h \
- $(srcdir)/Include/internal/pycore_byteswap.h \
+ $(srcdir)/Include/internal/pycore_bitutils.h \
$(srcdir)/Include/internal/pycore_bytes_methods.h \
$(srcdir)/Include/internal/pycore_call.h \
$(srcdir)/Include/internal/pycore_ceval.h \
diff --git a/Modules/_ctypes/cfield.c b/Modules/_ctypes/cfield.c
index 32a2bee..3a9b711 100644
--- a/Modules/_ctypes/cfield.c
+++ b/Modules/_ctypes/cfield.c
@@ -1,5 +1,5 @@
#include "Python.h"
-#include "pycore_byteswap.h" // _Py_bswap32()
+#include "pycore_bitutils.h" // _Py_bswap32()
#include <ffi.h>
#ifdef MS_WIN32
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
index 5f217dc..6d5af59 100644
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -12,7 +12,7 @@
#define PY_SSIZE_T_CLEAN
#include "Python.h"
-#include "pycore_byteswap.h" // _Py_bswap32()
+#include "pycore_bitutils.h" // _Py_bswap32()
#include "pycore_initconfig.h" // _Py_GetConfigsAsDict()
#include "pycore_hashtable.h" // _Py_hashtable_new()
#include "pycore_gc.h" // PyGC_Head
@@ -63,6 +63,45 @@
}
+static int
+check_popcount(uint32_t x, int expected)
+{
+ // Use volatile to prevent the compiler to optimize out the whole test
+ volatile uint32_t u = x;
+ int bits = _Py_popcount32(u);
+ if (bits != expected) {
+ PyErr_Format(PyExc_AssertionError,
+ "_Py_popcount32(%lu) returns %i, expected %i",
+ (unsigned long)x, bits, expected);
+ return -1;
+ }
+ return 0;
+}
+
+
+static PyObject*
+test_popcount(PyObject *self, PyObject *Py_UNUSED(args))
+{
+#define CHECK(X, RESULT) \
+ do { \
+ if (check_popcount(X, RESULT) < 0) { \
+ return NULL; \
+ } \
+ } while (0)
+
+ CHECK(0, 0);
+ CHECK(1, 1);
+ CHECK(0x08080808, 4);
+ CHECK(0x10101010, 4);
+ CHECK(0x10204080, 4);
+ CHECK(0xDEADCAFE, 22);
+ CHECK(0xFFFFFFFF, 32);
+ Py_RETURN_NONE;
+
+#undef CHECK
+}
+
+
#define TO_PTR(ch) ((void*)(uintptr_t)ch)
#define FROM_PTR(ptr) ((uintptr_t)ptr)
#define VALUE(key) (1 + ((int)(key) - 'a'))
@@ -157,6 +196,7 @@
{"get_configs", get_configs, METH_NOARGS},
{"get_recursion_depth", get_recursion_depth, METH_NOARGS},
{"test_bswap", test_bswap, METH_NOARGS},
+ {"test_popcount", test_popcount, METH_NOARGS},
{"test_hashtable", test_hashtable, METH_NOARGS},
{NULL, NULL} /* sentinel */
};
diff --git a/Modules/sha256module.c b/Modules/sha256module.c
index 8edb1d5..261f9da 100644
--- a/Modules/sha256module.c
+++ b/Modules/sha256module.c
@@ -17,7 +17,7 @@
/* SHA objects */
#include "Python.h"
-#include "pycore_byteswap.h" // _Py_bswap32()
+#include "pycore_bitutils.h" // _Py_bswap32()
#include "structmember.h" // PyMemberDef
#include "hashlib.h"
#include "pystrhex.h"
diff --git a/Modules/sha512module.c b/Modules/sha512module.c
index 561ef8e..aa2aeed 100644
--- a/Modules/sha512module.c
+++ b/Modules/sha512module.c
@@ -17,7 +17,7 @@
/* SHA objects */
#include "Python.h"
-#include "pycore_byteswap.h" // _Py_bswap32()
+#include "pycore_bitutils.h" // _Py_bswap32()
#include "structmember.h" // PyMemberDef
#include "hashlib.h"
#include "pystrhex.h"
diff --git a/Objects/longobject.c b/Objects/longobject.c
index 0b209a4..ce10c4f 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -3,8 +3,9 @@
/* XXX The functional organization of this file is terrible */
#include "Python.h"
-#include "pycore_interp.h" // _PY_NSMALLPOSINTS
-#include "pycore_pystate.h" // _Py_IsMainInterpreter()
+#include "pycore_bitutils.h" // _Py_popcount32()
+#include "pycore_interp.h" // _PY_NSMALLPOSINTS
+#include "pycore_pystate.h" // _Py_IsMainInterpreter()
#include "longintrepr.h"
#include <float.h>
@@ -5307,12 +5308,10 @@
static int
popcount_digit(digit d)
{
- /* 32bit SWAR popcount. */
- uint32_t u = d;
- u -= (u >> 1) & 0x55555555U;
- u = (u & 0x33333333U) + ((u >> 2) & 0x33333333U);
- u = (u + (u >> 4)) & 0x0f0f0f0fU;
- return (uint32_t)(u * 0x01010101U) >> 24;
+ // digit can be larger than uint32_t, but only PyLong_SHIFT bits
+ // of it will be ever used.
+ Py_BUILD_ASSERT(PyLong_SHIFT <= 32);
+ return _Py_popcount32((uint32_t)d);
}
/*[clinic input]
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index 9b2a29b..197605b 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -4,7 +4,7 @@
# error "codecs.h is specific to Unicode"
#endif
-#include "pycore_byteswap.h" // _Py_bswap32()
+#include "pycore_bitutils.h" // _Py_bswap32()
/* Mask to quickly check whether a C 'long' contains a
non-ASCII, UTF8-encoded char. */
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index b6b0cf3..8d5f99f 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -170,7 +170,7 @@
<ClInclude Include="..\Include\internal\pycore_accu.h" />
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h" />
- <ClInclude Include="..\Include\internal\pycore_byteswap.h" />
+ <ClInclude Include="..\Include\internal\pycore_bitutils.h" />
<ClInclude Include="..\Include\internal\pycore_call.h" />
<ClInclude Include="..\Include\internal\pycore_ceval.h" />
<ClInclude Include="..\Include\internal\pycore_code.h" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 10dfffb..7bc9f8f 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -201,7 +201,7 @@
<ClInclude Include="..\Include\internal\pycore_atomic.h">
<Filter>Include</Filter>
</ClInclude>
- <ClInclude Include="..\Include\internal\pycore_byteswap.h">
+ <ClInclude Include="..\Include\internal\pycore_bitutils.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h">
diff --git a/Python/hamt.c b/Python/hamt.c
index 8801c5e..e272e88 100644
--- a/Python/hamt.c
+++ b/Python/hamt.c
@@ -1,5 +1,6 @@
#include "Python.h"
+#include "pycore_bitutils.h" // _Py_popcount32
#include "pycore_hamt.h"
#include "pycore_object.h" // _PyObject_GC_TRACK()
#include <stddef.h> // offsetof()
@@ -434,29 +435,9 @@
}
static inline uint32_t
-hamt_bitcount(uint32_t i)
-{
- /* We could use native popcount instruction but that would
- require to either add configure flags to enable SSE4.2
- support or to detect it dynamically. Otherwise, we have
- a risk of CPython not working properly on older hardware.
-
- In practice, there's no observable difference in
- performance between using a popcount instruction or the
- following fallback code.
-
- The algorithm is copied from:
- https://graphics.stanford.edu/~seander/bithacks.html
- */
- i = i - ((i >> 1) & 0x55555555);
- i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
- return (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
-}
-
-static inline uint32_t
hamt_bitindex(uint32_t bitmap, uint32_t bit)
{
- return hamt_bitcount(bitmap & (bit - 1));
+ return (uint32_t)_Py_popcount32(bitmap & (bit - 1));
}
@@ -820,7 +801,7 @@
else {
/* There was no key before with the same (shift,hash). */
- uint32_t n = hamt_bitcount(self->b_bitmap);
+ uint32_t n = (uint32_t)_Py_popcount32(self->b_bitmap);
if (n >= 16) {
/* When we have a situation where we want to store more