bpo-43760: Speed up check for tracing in interpreter dispatch (#25276)

* Remove redundant tracing_possible field from interpreter state.

* Move 'use_tracing' from tstate onto C stack, for fastest possible checking in dispatch logic.

* Add comments stressing the importance stack discipline when dealing with CFrames.

* Add NEWS
diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h
index cfaee89..e3ccc54 100644
--- a/Include/cpython/pystate.h
+++ b/Include/cpython/pystate.h
@@ -29,6 +29,21 @@ typedef int (*Py_tracefunc)(PyObject *, PyFrameObject *, int, PyObject *);
 #define PyTrace_OPCODE 7
 
 
+typedef struct _cframe {
+    /* This struct will be threaded through the C stack
+     * allowing fast access to per-thread state that needs
+     * to be accessed quickly by the interpreter, but can
+     * be modified outside of the interpreter.
+     *
+     * WARNING: This makes data on the C stack accessible from
+     * heap objects. Care must be taken to maintain stack
+     * discipline and make sure that instances of this struct cannot
+     * accessed outside of their lifetime.
+     */
+    int use_tracing;
+    struct _cframe *previous;
+} CFrame;
+
 typedef struct _err_stackitem {
     /* This struct represents an entry on the exception stack, which is a
      * per-coroutine state. (Coroutine in the computer science sense,
@@ -61,7 +76,10 @@ struct _ts {
        This is to prevent the actual trace/profile code from being recorded in
        the trace/profile. */
     int tracing;
-    int use_tracing;
+
+    /* Pointer to current CFrame in the C stack frame of the currently,
+     * or most recently, executing _PyEval_EvalFrameDefault. */
+    CFrame *cframe;
 
     Py_tracefunc c_profilefunc;
     Py_tracefunc c_tracefunc;
@@ -129,6 +147,8 @@ struct _ts {
     /* Unique thread state id. */
     uint64_t id;
 
+    CFrame root_cframe;
+
     /* XXX signal handlers should also be here */
 
 };