bpo-33608: Simplify ceval's DISPATCH by hoisting eval_breaker ahead of time. (gh-12062)

It also fixes various _Py_atomic_* macros: the ATOMIC_VAL argument is now fully
parenthesized everywhere, and the MSVC store/load paths take the address of the
_value field via (ATOMIC_VAL)->_value instead of casting the stored value itself
to a pointer.
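
The substantive macro bug is in the MSVC branches: the old helpers expanded to
(volatile long long*)ATOMIC_VAL._value, casting the stored integer itself to a
pointer rather than taking the address of the storage, and the "." access meant
the macros only accepted a struct, never a pointer to one. A minimal before/after
sketch (toy names and a simplified struct for illustration, not CPython's actual
definitions):

    #include <stdint.h>

    /* Toy stand-in for CPython's _Py_atomic_address. */
    typedef struct { long long _value; } toy_atomic;

    /* Old shape: casts the stored *value* to a pointer. */
    #define TOY_STORAGE_OLD(ATOMIC_VAL) ((volatile long long*)(ATOMIC_VAL)._value)

    /* Fixed shape: takes a pointer and the address of the storage. */
    #define TOY_STORAGE_NEW(ATOMIC_VAL) ((volatile long long*)&((ATOMIC_VAL)->_value))

    int main(void)
    {
        toy_atomic a = { 42 };
        volatile long long *p = TOY_STORAGE_NEW(&a);   /* &a._value */
        *p = 7;                 /* writes the storage; the old form would
                                   have produced the bogus address 42 */
        return (int)a._value;   /* 7 */
    }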
diff --git a/Include/internal/pycore_atomic.h b/Include/internal/pycore_atomic.h
index 5669f71..7aa7eed 100644
--- a/Include/internal/pycore_atomic.h
+++ b/Include/internal/pycore_atomic.h
@@ -58,10 +58,10 @@
     atomic_thread_fence(ORDER)
 
 #define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
-    atomic_store_explicit(&(ATOMIC_VAL)->_value, NEW_VAL, ORDER)
+    atomic_store_explicit(&((ATOMIC_VAL)->_value), NEW_VAL, ORDER)
 
 #define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
-    atomic_load_explicit(&(ATOMIC_VAL)->_value, ORDER)
+    atomic_load_explicit(&((ATOMIC_VAL)->_value), ORDER)
 
 /* Use builtin atomic operations in GCC >= 4.7 */
 #elif defined(HAVE_BUILTIN_ATOMIC)
@@ -92,14 +92,14 @@
     (assert((ORDER) == __ATOMIC_RELAXED                       \
             || (ORDER) == __ATOMIC_SEQ_CST                    \
             || (ORDER) == __ATOMIC_RELEASE),                  \
-     __atomic_store_n(&(ATOMIC_VAL)->_value, NEW_VAL, ORDER))
+     __atomic_store_n(&((ATOMIC_VAL)->_value), NEW_VAL, ORDER))
 
 #define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER)           \
     (assert((ORDER) == __ATOMIC_RELAXED                       \
             || (ORDER) == __ATOMIC_SEQ_CST                    \
             || (ORDER) == __ATOMIC_ACQUIRE                    \
             || (ORDER) == __ATOMIC_CONSUME),                  \
-     __atomic_load_n(&(ATOMIC_VAL)->_value, ORDER))
+     __atomic_load_n(&((ATOMIC_VAL)->_value), ORDER))
 
 /* Only support GCC (for expression statements) and x86 (for simple
  * atomic semantics) and MSVC x86/x64/ARM */
@@ -324,7 +324,7 @@
 }
 
 #else
-#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) *ATOMIC_VAL
+#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) *(ATOMIC_VAL)
 #endif
 
 inline int _Py_atomic_load_32bit(volatile int* value, int order) {
@@ -359,15 +359,15 @@
 }
 
 #define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
-  if (sizeof(*ATOMIC_VAL._value) == 8) { \
-    _Py_atomic_store_64bit((volatile long long*)ATOMIC_VAL._value, NEW_VAL, ORDER) } else { \
-    _Py_atomic_store_32bit((volatile long*)ATOMIC_VAL._value, NEW_VAL, ORDER) }
+  if (sizeof((ATOMIC_VAL)->_value) == 8) { \
+    _Py_atomic_store_64bit((volatile long long*)&((ATOMIC_VAL)->_value), NEW_VAL, ORDER) } else { \
+    _Py_atomic_store_32bit((volatile long*)&((ATOMIC_VAL)->_value), NEW_VAL, ORDER) }
 
 #define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
   ( \
-    sizeof(*(ATOMIC_VAL._value)) == 8 ? \
-    _Py_atomic_load_64bit((volatile long long*)ATOMIC_VAL._value, ORDER) : \
-    _Py_atomic_load_32bit((volatile long*)ATOMIC_VAL._value, ORDER) \
+    sizeof((ATOMIC_VAL)->_value) == 8 ? \
+    _Py_atomic_load_64bit((volatile long long*)&((ATOMIC_VAL)->_value), ORDER) : \
+    _Py_atomic_load_32bit((volatile long*)&((ATOMIC_VAL)->_value), ORDER) \
   )
 #elif defined(_M_ARM) || defined(_M_ARM64)
 typedef enum _Py_memory_order {
@@ -391,13 +391,13 @@
 #define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) \
     switch (ORDER) { \
     case _Py_memory_order_acquire: \
-      _InterlockedExchange64_acq((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
+      _InterlockedExchange64_acq((__int64 volatile*)&((ATOMIC_VAL)->_value), (__int64)NEW_VAL); \
       break; \
     case _Py_memory_order_release: \
-      _InterlockedExchange64_rel((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
+      _InterlockedExchange64_rel((__int64 volatile*)&((ATOMIC_VAL)->_value), (__int64)NEW_VAL); \
       break; \
     default: \
-      _InterlockedExchange64((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
+      _InterlockedExchange64((__int64 volatile*)&((ATOMIC_VAL)->_value), (__int64)NEW_VAL); \
       break; \
   }
 #else
@@ -407,13 +407,13 @@
 #define _Py_atomic_store_32bit(ATOMIC_VAL, NEW_VAL, ORDER) \
   switch (ORDER) { \
   case _Py_memory_order_acquire: \
-    _InterlockedExchange_acq((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
+    _InterlockedExchange_acq((volatile long*)&((ATOMIC_VAL)->_value), (int)NEW_VAL); \
     break; \
   case _Py_memory_order_release: \
-    _InterlockedExchange_rel((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
+    _InterlockedExchange_rel((volatile long*)&((ATOMIC_VAL)->_value), (int)NEW_VAL); \
     break; \
   default: \
-    _InterlockedExchange((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
+    _InterlockedExchange((volatile long*)&((ATOMIC_VAL)->_value), (int)NEW_VAL); \
     break; \
   }
 
@@ -454,7 +454,7 @@
 }
 
 #else
-#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) *ATOMIC_VAL
+#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) *(ATOMIC_VAL)
 #endif
 
 inline int _Py_atomic_load_32bit(volatile int* value, int order) {
@@ -489,15 +489,15 @@
 }
 
 #define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
-  if (sizeof(*ATOMIC_VAL._value) == 8) { \
-    _Py_atomic_store_64bit(ATOMIC_VAL._value, NEW_VAL, ORDER) } else { \
-    _Py_atomic_store_32bit(ATOMIC_VAL._value, NEW_VAL, ORDER) }
+  if (sizeof((ATOMIC_VAL)->_value) == 8) { \
+    _Py_atomic_store_64bit(&((ATOMIC_VAL)->_value), NEW_VAL, ORDER) } else { \
+    _Py_atomic_store_32bit(&((ATOMIC_VAL)->_value), NEW_VAL, ORDER) }
 
 #define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
   ( \
-    sizeof(*(ATOMIC_VAL._value)) == 8 ? \
-    _Py_atomic_load_64bit(ATOMIC_VAL._value, ORDER) : \
-    _Py_atomic_load_32bit(ATOMIC_VAL._value, ORDER) \
+    sizeof((ATOMIC_VAL)->_value) == 8 ? \
+    _Py_atomic_load_64bit(&((ATOMIC_VAL)->_value), ORDER) : \
+    _Py_atomic_load_32bit(&((ATOMIC_VAL)->_value), ORDER) \
   )
 #endif
 #else  /* !gcc x86  !_msc_ver */
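
The switch from ATOMIC_VAL._value to (ATOMIC_VAL)->_value above is what lets the
eval loop below hand a hoisted pointer straight to _Py_atomic_load_relaxed(). A
hedged sketch of that consumer pattern, assuming a simplified _Py_atomic_int
built on C11 stdatomic rather than CPython's exact definitions:

    #include <stdatomic.h>

    /* Simplified stand-in for _Py_atomic_int. */
    typedef struct { atomic_int _value; } toy_atomic_int;

    /* Relaxed load through a pointer, mirroring the fixed macro shape. */
    #define toy_load_relaxed(ATOMIC_VAL) \
        atomic_load_explicit(&((ATOMIC_VAL)->_value), memory_order_relaxed)

    static int breaker_set(toy_atomic_int *eval_breaker)
    {
        /* With the -> form, a cached pointer is a valid macro argument. */
        return toy_load_relaxed(eval_breaker) != 0;
    }

    int main(void)
    {
        toy_atomic_int eval_breaker = { 0 };
        return breaker_set(&eval_breaker);   /* 0: nothing pending */
    }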
diff --git a/Python/ceval.c b/Python/ceval.c
index 68c1617..be75ade 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -637,6 +637,7 @@
     PyObject **fastlocals, **freevars;
     PyObject *retval = NULL;            /* Return value */
     PyThreadState *tstate = _PyThreadState_GET();
+    _Py_atomic_int *eval_breaker = &tstate->interp->ceval.eval_breaker;
     PyCodeObject *co;
 
     /* when tracing we set things up so that
@@ -722,7 +723,7 @@
 
 #define DISPATCH() \
     { \
-        if (!_Py_atomic_load_relaxed(&tstate->interp->ceval.eval_breaker)) { \
+        if (!_Py_atomic_load_relaxed(eval_breaker)) { \
                     FAST_DISPATCH(); \
         } \
         continue; \
@@ -1024,7 +1025,7 @@
            async I/O handler); see Py_AddPendingCall() and
            Py_MakePendingCalls() above. */
 
-        if (_Py_atomic_load_relaxed(&(tstate->interp->ceval.eval_breaker))) {
+        if (_Py_atomic_load_relaxed(eval_breaker)) {
             opcode = _Py_OPCODE(*next_instr);
             if (opcode == SETUP_FINALLY ||
                 opcode == SETUP_WITH ||
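
The effect of the hoist in miniature: tstate->interp->ceval.eval_breaker is
resolved once per entry into the eval function, and each DISPATCH becomes a
single relaxed load through the cached pointer instead of a per-instruction
pointer chase. A toy sketch of the pattern, with simplified names and types that
are not CPython's:

    #include <stdatomic.h>

    struct toy_ceval  { atomic_int eval_breaker; };
    struct toy_interp { struct toy_ceval ceval; };
    struct toy_tstate { struct toy_interp *interp; };

    static int toy_eval_loop(struct toy_tstate *tstate)
    {
        /* Hoisted once: the flag's address is invariant for this call. */
        atomic_int *eval_breaker = &tstate->interp->ceval.eval_breaker;

        for (int instr = 0; instr < 1000; instr++) {
            /* DISPATCH(): one relaxed load via the cached pointer. */
            if (!atomic_load_explicit(eval_breaker, memory_order_relaxed)) {
                continue;    /* FAST_DISPATCH() analogue */
            }
            return instr;    /* slow path: signals, pending calls */
        }
        return -1;
    }

    int main(void)
    {
        struct toy_interp interp = { { 0 } };
        struct toy_tstate tstate = { &interp };
        return toy_eval_loop(&tstate) == -1 ? 0 : 1;
    }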