bpo-29882: Add _Py_popcount32() function (GH-20518)

* Rename pycore_byteswap.h to pycore_bitutils.h.
* Move popcount_digit() to pycore_bitutils.h as _Py_popcount32().
* _Py_popcount32() uses the GCC and clang builtin function if available.
* Add unit tests for _Py_popcount32().
diff --git a/Objects/longobject.c b/Objects/longobject.c
index 0b209a4..ce10c4f 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -3,8 +3,9 @@
 /* XXX The functional organization of this file is terrible */
 
 #include "Python.h"
-#include "pycore_interp.h"    // _PY_NSMALLPOSINTS
-#include "pycore_pystate.h"   // _Py_IsMainInterpreter()
+#include "pycore_bitutils.h"      // _Py_popcount32()
+#include "pycore_interp.h"        // _PY_NSMALLPOSINTS
+#include "pycore_pystate.h"       // _Py_IsMainInterpreter()
 #include "longintrepr.h"
 
 #include <float.h>
@@ -5307,12 +5308,10 @@
 static int
 popcount_digit(digit d)
 {
-    /* 32bit SWAR popcount. */
-    uint32_t u = d;
-    u -= (u >> 1) & 0x55555555U;
-    u = (u & 0x33333333U) + ((u >> 2) & 0x33333333U);
-    u = (u + (u >> 4)) & 0x0f0f0f0fU;
-    return (uint32_t)(u * 0x01010101U) >> 24;
+    // digit can be larger than uint32_t, but only PyLong_SHIFT bits
+    // of it will ever be used.
+    Py_BUILD_ASSERT(PyLong_SHIFT <= 32);
+    return _Py_popcount32((uint32_t)d);
 }
 
 /*[clinic input]
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index 9b2a29b..197605b 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -4,7 +4,7 @@
 # error "codecs.h is specific to Unicode"
 #endif
 
-#include "pycore_byteswap.h"      // _Py_bswap32()
+#include "pycore_bitutils.h"      // _Py_bswap32()
 
 /* Mask to quickly check whether a C 'long' contains a
    non-ASCII, UTF8-encoded char. */