bpo-35081: Add _PyThreadState_GET() internal macro (GH-10266)

If Py_BUILD_CORE is defined, the PyThreadState_GET() macro access
_PyRuntime which comes from the internal pycore_state.h header.
Public headers must not require internal headers.

Move PyThreadState_GET() and _PyInterpreterState_GET_UNSAFE() from
Include/pystate.h to Include/internal/pycore_state.h, and rename
PyThreadState_GET() to _PyThreadState_GET() there.

The PyThreadState_GET() macro of pystate.h is now redefined when
pycore_state.h is included, to use the fast _PyThreadState_GET().

Changes:

* Add _PyThreadState_GET() macro
* Replace "PyThreadState_GET()->interp" with
  _PyInterpreterState_GET_UNSAFE()
* Replace PyThreadState_GET() with _PyThreadState_GET() in internal C
  files (compiled with Py_BUILD_CORE defined), but keep
  PyThreadState_GET() in the public header files.
* _testcapimodule.c: replace PyThreadState_GET() with
  PyThreadState_Get(); the module is not compiled with Py_BUILD_CORE
  defined.
* pycore_state.h now requires Py_BUILD_CORE to be defined.
diff --git a/Include/pystate.h b/Include/pystate.h
index 7fc921e..b30c318 100644
--- a/Include/pystate.h
+++ b/Include/pystate.h
@@ -245,15 +245,17 @@
 PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_New(void);
 PyAPI_FUNC(void) PyInterpreterState_Clear(PyInterpreterState *);
 PyAPI_FUNC(void) PyInterpreterState_Delete(PyInterpreterState *);
+
 #if !defined(Py_LIMITED_API)
+/* Get the current interpreter state.
+
+   Issue a fatal error if there no current Python thread state or no current
+   interpreter. It cannot return NULL.
+
+   The caller must hold the GIL.*/
 PyAPI_FUNC(PyInterpreterState *) _PyInterpreterState_Get(void);
 #endif
-#ifdef Py_BUILD_CORE
-   /* Macro which should only be used for performance critical code.
-      Need "#include "pycore_state.h". See also _PyInterpreterState_Get()
-      and _PyGILState_GetInterpreterStateUnsafe(). */
-#  define _PyInterpreterState_GET_UNSAFE() (PyThreadState_GET()->interp)
-#endif
+
 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
 /* New in 3.7 */
 PyAPI_FUNC(int64_t) PyInterpreterState_GetID(PyInterpreterState *);
@@ -286,11 +288,27 @@
 PyAPI_FUNC(void) _PyGILState_Reinit(void);
 #endif /* !Py_LIMITED_API */
 
-/* Return the current thread state. The global interpreter lock must be held.
- * When the current thread state is NULL, this issues a fatal error (so that
- * the caller needn't check for NULL). */
+/* Get the current thread state.
+
+   When the current thread state is NULL, this issues a fatal error (so that
+   the caller needn't check for NULL).
+
+   The caller must hold the GIL.
+
+   See also PyThreadState_GET() and _PyThreadState_GET(). */
 PyAPI_FUNC(PyThreadState *) PyThreadState_Get(void);
 
+/* Get the current Python thread state.
+
+   Macro using PyThreadState_Get() or _PyThreadState_GET() depending if
+   pycore_state.h is included or not (this header redefines the macro).
+
+   If PyThreadState_Get() is used, issue a fatal error if the current thread
+   state is NULL.
+
+   See also PyThreadState_Get() and _PyThreadState_GET(). */
+#define PyThreadState_GET() PyThreadState_Get()
+
 #ifndef Py_LIMITED_API
 /* Similar to PyThreadState_Get(), but don't issue a fatal error
  * if it is NULL. */
@@ -301,18 +319,6 @@
 PyAPI_FUNC(PyObject *) PyThreadState_GetDict(void);
 PyAPI_FUNC(int) PyThreadState_SetAsyncExc(unsigned long, PyObject *);
 
-
-/* Variable and macro for in-line access to current thread state */
-
-/* Assuming the current thread holds the GIL, this is the
-   PyThreadState for the current thread. */
-#ifdef Py_BUILD_CORE
-#  define PyThreadState_GET() \
-             ((PyThreadState*)_Py_atomic_load_relaxed(&_PyRuntime.gilstate.tstate_current))
-#else
-#  define PyThreadState_GET() PyThreadState_Get()
-#endif
-
 typedef
     enum {PyGILState_LOCKED, PyGILState_UNLOCKED}
         PyGILState_STATE;
@@ -366,11 +372,11 @@
    The function returns 1 if _PyGILState_check_enabled is non-zero. */
 PyAPI_FUNC(int) PyGILState_Check(void);
 
-/* Unsafe function to get the single PyInterpreterState used by this process'
-   GILState implementation.
+/* Get the single PyInterpreterState used by this process' GILState
+   implementation.
 
-   Return NULL before _PyGILState_Init() is called and after _PyGILState_Fini()
-   is called.
+   This function doesn't check for error. Return NULL before _PyGILState_Init()
+   is called and after _PyGILState_Fini() is called.
 
    See also _PyInterpreterState_Get() and _PyInterpreterState_GET_UNSAFE(). */
 PyAPI_FUNC(PyInterpreterState *) _PyGILState_GetInterpreterStateUnsafe(void);