bpo-26219: per opcode cache for LOAD_GLOBAL (GH-12884)

This patch implements per opcode cache mechanism, and use it in
only LOAD_GLOBAL opcode.

Based on Yury's opcache3.patch in bpo-26219.
diff --git a/Include/code.h b/Include/code.h
index 933de97..b79d977 100644
--- a/Include/code.h
+++ b/Include/code.h
@@ -17,6 +17,8 @@
 #  define _Py_OPARG(word) ((word) >> 8)
 #endif
 
+typedef struct _PyOpcache _PyOpcache;
+
 /* Bytecode object */
 typedef struct {
     PyObject_HEAD
@@ -49,6 +51,21 @@
        Type is a void* to keep the format private in codeobject.c to force
        people to go through the proper APIs. */
     void *co_extra;
+
+    /* Per opcodes just-in-time cache
+     *
+     * To reduce cache size, we use indirect mapping from opcode index to
+     * cache object:
+     *   cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1]
+     */
+
+    // co_opcache_map is indexed by (next_instr - first_instr).
+    //  * 0 means there is no cache for this opcode.
+    //  * n > 0 means there is cache in co_opcache[n-1].
+    unsigned char *co_opcache_map;
+    _PyOpcache *co_opcache;
+    int co_opcache_flag;  // used to determine when create a cache.
+    unsigned char co_opcache_size;  // length of co_opcache.
 } PyCodeObject;
 
 /* Masks for co_flags above */