bpo-26219: per opcode cache for LOAD_GLOBAL (GH-12884)

This patch implements per opcode cache mechanism, and use it in
only LOAD_GLOBAL opcode.

Based on Yury's opcache3.patch in bpo-26219.
diff --git a/Include/code.h b/Include/code.h
index 933de97..b79d977 100644
--- a/Include/code.h
+++ b/Include/code.h
@@ -17,6 +17,8 @@
 #  define _Py_OPARG(word) ((word) >> 8)
 #endif
 
+typedef struct _PyOpcache _PyOpcache;
+
 /* Bytecode object */
 typedef struct {
     PyObject_HEAD
@@ -49,6 +51,21 @@
        Type is a void* to keep the format private in codeobject.c to force
        people to go through the proper APIs. */
     void *co_extra;
+
+    /* Per opcodes just-in-time cache
+     *
+     * To reduce cache size, we use indirect mapping from opcode index to
+     * cache object:
+     *   cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1]
+     */
+
+    // co_opcache_map is indexed by (next_instr - first_instr).
+    //  * 0 means there is no cache for this opcode.
+    //  * n > 0 means there is cache in co_opcache[n-1].
+    unsigned char *co_opcache_map;
+    _PyOpcache *co_opcache;
+    int co_opcache_flag;  // used to determine when create a cache.
+    unsigned char co_opcache_size;  // length of co_opcache.
 } PyCodeObject;
 
 /* Masks for co_flags above */
diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index d44afdf..2c6df9a 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -31,6 +31,9 @@
 PyAPI_FUNC(void) _PyEval_ReInitThreads(
     _PyRuntimeState *runtime);
 
+/* Private function */
+void _PyEval_Fini(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h
new file mode 100644
index 0000000..88956f1
--- /dev/null
+++ b/Include/internal/pycore_code.h
@@ -0,0 +1,27 @@
+#ifndef Py_INTERNAL_CODE_H
+#define Py_INTERNAL_CODE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+ 
+typedef struct {
+    PyObject *ptr;  /* Cached pointer (borrowed reference) */
+    uint64_t globals_ver;  /* ma_version of global dict */
+    uint64_t builtins_ver; /* ma_version of builtin dict */
+} _PyOpcache_LoadGlobal;
+
+struct _PyOpcache {
+    union {
+        _PyOpcache_LoadGlobal lg;
+    } u;
+    char optimized;
+};
+
+/* Private API */
+int _PyCode_InitOpcache(PyCodeObject *co);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CODE_H */