bpo-35081: Add _PyThreadState_GET() internal macro (GH-10266)

If Py_BUILD_CORE is defined, the PyThreadState_GET() macro accesses
_PyRuntime, which comes from the internal pycore_state.h header.
Public headers must not require internal headers.

Move PyThreadState_GET() and _PyInterpreterState_GET_UNSAFE() from
Include/pystate.h to Include/internal/pycore_state.h, and rename
PyThreadState_GET() to _PyThreadState_GET() there.

The PyThreadState_GET() macro from pystate.h is now redefined when
pycore_state.h is included, so that it uses the fast _PyThreadState_GET().

Changes:

* Add _PyThreadState_GET() macro
* Replace "PyThreadState_GET()->interp" with
  _PyInterpreterState_GET_UNSAFE()
* Replace PyThreadState_GET() with _PyThreadState_GET() in internal C
  files (compiled with Py_BUILD_CORE defined), but keep
  PyThreadState_GET() in the public header files.
* _testcapimodule.c: replace PyThreadState_GET() with
  PyThreadState_Get(); the module is not compiled with Py_BUILD_CORE
  defined.
* pycore_state.h now requires Py_BUILD_CORE to be defined.
diff --git a/Objects/call.c b/Objects/call.c
index 9061d0b..48e3aaf 100644
--- a/Objects/call.c
+++ b/Objects/call.c
@@ -258,7 +258,7 @@
                        PyObject *globals)
 {
     PyFrameObject *f;
-    PyThreadState *tstate = PyThreadState_GET();
+    PyThreadState *tstate = _PyThreadState_GET();
     PyObject **fastlocals;
     Py_ssize_t i;
     PyObject *result;
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index a9ae907..ea564a2 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -1314,9 +1314,9 @@
     /* We can arrive here with a NULL tstate during initialization: try
        running "python -Wi" for an example related to string interning.
        Let's just hope that no exception occurs then...  This must be
-       PyThreadState_GET() and not PyThreadState_Get() because the latter
+       _PyThreadState_GET() and not PyThreadState_Get() because the latter
        abort Python if tstate is NULL. */
-    tstate = PyThreadState_GET();
+    tstate = _PyThreadState_GET();
     if (tstate != NULL && tstate->curexc_type != NULL) {
         /* preserve the existing exception */
         PyObject *err_type, *err_value, *err_tb;
diff --git a/Objects/genobject.c b/Objects/genobject.c
index 885b3f2..7c2948b 100644
--- a/Objects/genobject.c
+++ b/Objects/genobject.c
@@ -151,7 +151,7 @@
 static PyObject *
 gen_send_ex(PyGenObject *gen, PyObject *arg, int exc, int closing)
 {
-    PyThreadState *tstate = PyThreadState_GET();
+    PyThreadState *tstate = _PyThreadState_GET();
     PyFrameObject *f = gen->gi_frame;
     PyObject *result;
 
@@ -1157,7 +1157,7 @@
         return NULL;
     }
 
-    PyThreadState *tstate = PyThreadState_GET();
+    PyThreadState *tstate = _PyThreadState_GET();
     int origin_depth = tstate->coroutine_origin_tracking_depth;
 
     if (origin_depth == 0) {
@@ -1267,7 +1267,7 @@
 
     o->ag_hooks_inited = 1;
 
-    tstate = PyThreadState_GET();
+    tstate = _PyThreadState_GET();
 
     finalizer = tstate->async_gen_finalizer;
     if (finalizer) {
diff --git a/Objects/object.c b/Objects/object.c
index f7395c7..d3a97f6 100644
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -2136,7 +2136,7 @@
 void
 _PyTrash_thread_deposit_object(PyObject *op)
 {
-    PyThreadState *tstate = PyThreadState_GET();
+    PyThreadState *tstate = _PyThreadState_GET();
     _PyObject_ASSERT(op, PyObject_IS_GC(op));
     _PyObject_ASSERT(op, !_PyObject_GC_IS_TRACKED(op));
     _PyObject_ASSERT(op, op->ob_refcnt == 0);
@@ -2174,7 +2174,7 @@
 void
 _PyTrash_thread_destroy_chain(void)
 {
-    PyThreadState *tstate = PyThreadState_GET();
+    PyThreadState *tstate = _PyThreadState_GET();
     /* We need to increase trash_delete_nesting here, otherwise,
        _PyTrash_thread_destroy_chain will be called recursively
        and then possibly crash.  An example that may crash without
diff --git a/Objects/odictobject.c b/Objects/odictobject.c
index 81c996b..52ac7e5 100644
--- a/Objects/odictobject.c
+++ b/Objects/odictobject.c
@@ -1355,7 +1355,7 @@
 static void
 odict_dealloc(PyODictObject *self)
 {
-    PyThreadState *tstate = PyThreadState_GET();
+    PyThreadState *tstate = _PyThreadState_GET();
 
     PyObject_GC_UnTrack(self);
     Py_TRASHCAN_SAFE_BEGIN(self)
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index 9a390b3..dedc4f7 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -1115,7 +1115,7 @@
 {
     PyTypeObject *type, *base;
     destructor basedealloc;
-    PyThreadState *tstate = PyThreadState_GET();
+    PyThreadState *tstate = _PyThreadState_GET();
     int has_finalizer;
 
     /* Extract the type; we expect it to be a heap type */
@@ -7678,7 +7678,7 @@
         PyFrameObject *f;
         PyCodeObject *co;
         Py_ssize_t i, n;
-        f = PyThreadState_GET()->frame;
+        f = _PyThreadState_GET()->frame;
         if (f == NULL) {
             PyErr_SetString(PyExc_RuntimeError,
                             "super(): no current frame");