bpo-26219: per opcode cache for LOAD_GLOBAL (GH-12884)

This patch implements per opcode cache mechanism, and use it in
only LOAD_GLOBAL opcode.

Based on Yury's opcache3.patch in bpo-26219.
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
index 2333075..0d9e5d1 100644
--- a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -2,7 +2,9 @@
 
 #include "Python.h"
 #include "code.h"
+#include "opcode.h"
 #include "structmember.h"
+#include "pycore_code.h"
 #include "pycore_pystate.h"
 #include "pycore_tupleobject.h"
 #include "clinic/codeobject.c.h"
@@ -233,9 +235,56 @@
     co->co_zombieframe = NULL;
     co->co_weakreflist = NULL;
     co->co_extra = NULL;
+
+    co->co_opcache_map = NULL;
+    co->co_opcache = NULL;
+    co->co_opcache_flag = 0;
+    co->co_opcache_size = 0;
     return co;
 }
 
+int
+_PyCode_InitOpcache(PyCodeObject *co)
+{
+    Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT);
+    co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1);
+    if (co->co_opcache_map == NULL) {
+        return -1;
+    }
+
+    _Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code);
+    Py_ssize_t opts = 0;
+
+    for (Py_ssize_t i = 0; i < co_size;) {
+        unsigned char opcode = _Py_OPCODE(opcodes[i]);
+        i++;  // 'i' is now aligned to (next_instr - first_instr)
+
+        // TODO: LOAD_METHOD, LOAD_ATTR
+        if (opcode == LOAD_GLOBAL) {
+            co->co_opcache_map[i] = ++opts;
+            if (opts > 254) {
+                break;
+            }
+        }
+    }
+
+    if (opts) {
+        co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache));
+        if (co->co_opcache == NULL) {
+            PyMem_FREE(co->co_opcache_map);
+            return -1;
+        }
+    }
+    else {
+        PyMem_FREE(co->co_opcache_map);
+        co->co_opcache_map = NULL;
+        co->co_opcache = NULL;
+    }
+
+    co->co_opcache_size = opts;
+    return 0;
+}
+
 PyCodeObject *
 PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno)
 {
@@ -458,6 +507,15 @@
 static void
 code_dealloc(PyCodeObject *co)
 {
+    if (co->co_opcache != NULL) {
+        PyMem_FREE(co->co_opcache);
+    }
+    if (co->co_opcache_map != NULL) {
+        PyMem_FREE(co->co_opcache_map);
+    }
+    co->co_opcache_flag = 0;
+    co->co_opcache_size = 0;
+
     if (co->co_extra != NULL) {
         PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
         _PyCodeObjectExtra *co_extra = co->co_extra;
@@ -504,6 +562,13 @@
         res += sizeof(_PyCodeObjectExtra) +
                (co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]);
     }
+    if (co->co_opcache != NULL) {
+        assert(co->co_opcache_map != NULL);
+        // co_opcache_map
+        res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT);
+        // co_opcache
+        res += co->co_opcache_size * sizeof(_PyOpcache);
+    }
     return PyLong_FromSsize_t(res);
 }
 
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index 2b04b0b..0cc1443 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -1080,20 +1080,21 @@
         return 0;
     }
 
-    if (_PyDict_HasSplitTable(mp)) {
-        mp->ma_values[ix] = value;
-        if (old_value == NULL) {
-            /* pending state */
-            assert(ix == mp->ma_used);
-            mp->ma_used++;
+    if (old_value != value) {
+        if (_PyDict_HasSplitTable(mp)) {
+            mp->ma_values[ix] = value;
+            if (old_value == NULL) {
+                /* pending state */
+                assert(ix == mp->ma_used);
+                mp->ma_used++;
+            }
         }
+        else {
+            assert(old_value != NULL);
+            DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
+        }
+        mp->ma_version_tag = DICT_NEXT_VERSION();
     }
-    else {
-        assert(old_value != NULL);
-        DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
-    }
-
-    mp->ma_version_tag = DICT_NEXT_VERSION();
     Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */
     ASSERT_CONSISTENT(mp);
     Py_DECREF(key);