Close #19787: PyThread_set_key_value() now always sets the value. In Python 3.3,
the function did nothing if the key already existed (if the current value was a
non-NULL pointer).

_testcapi.run_in_subinterp() now correctly sets the new Python thread state of
the current thread when a subinterpreter is created.
diff --git a/Doc/whatsnew/3.4.rst b/Doc/whatsnew/3.4.rst
index 5d360c4..8fe906b 100644
--- a/Doc/whatsnew/3.4.rst
+++ b/Doc/whatsnew/3.4.rst
@@ -1068,3 +1068,8 @@
   working directory will also now have an absolute path, including when using
   ``-m`` with the interpreter (this does not influence when the path to a file
   is specified on the command-line).
+
+* (C API) :c:func:`PyThread_set_key_value` now always sets the value. In Python
+  3.3, the function did nothing if the key already existed (if the current
+  value was a non-NULL pointer).
+
diff --git a/Misc/NEWS b/Misc/NEWS
index 8be054f..800002a 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@
 Core and Builtins
 -----------------
 
+- Issue #19787: PyThread_set_key_value() now always sets the value. In Python
+  3.3, the function did nothing if the key already existed (if the current value
+  was a non-NULL pointer).
+
 - Issue #14432: Remove the thread state field from the frame structure. Fix a
   crash when a generator is created in a C thread that is destroyed while the
   generator is still used. The issue was that a generator contains a frame, and
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index a0cffde..6f2a75c 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -2511,6 +2511,10 @@
     r = PyRun_SimpleString(code);
     Py_EndInterpreter(substate);
 
+    /* restore previous thread state. It was replaced by Py_NewInterpreter()
+       which creates a new thread state. */
+    _PyThreadState_Init(mainstate);
+
     PyThreadState_Swap(mainstate);
 
     return PyLong_FromLong(r);
diff --git a/Modules/_tracemalloc.c b/Modules/_tracemalloc.c
index b39e950..95b05d6 100644
--- a/Modules/_tracemalloc.c
+++ b/Modules/_tracemalloc.c
@@ -168,14 +168,11 @@
     assert(reentrant == 0 || reentrant == 1);
     if (reentrant) {
         assert(PyThread_get_key_value(tracemalloc_reentrant_key) == NULL);
-        PyThread_set_key_value(tracemalloc_reentrant_key,
-                               REENTRANT);
+        PyThread_set_key_value(tracemalloc_reentrant_key, REENTRANT);
     }
     else {
-        /* FIXME: PyThread_set_key_value() cannot be used to set the flag
-           to zero, because it does nothing if the variable has already
-           a value set. */
-        PyThread_delete_key_value(tracemalloc_reentrant_key);
+        assert(PyThread_get_key_value(tracemalloc_reentrant_key) == REENTRANT);
+        PyThread_set_key_value(tracemalloc_reentrant_key, NULL);
     }
 }
 
diff --git a/Python/thread.c b/Python/thread.c
index 8540942..5396ca3 100644
--- a/Python/thread.c
+++ b/Python/thread.c
@@ -205,7 +205,7 @@
  * segfaults.  Now we lock the whole routine.
  */
 static struct key *
-find_key(int key, void *value)
+find_key(int key, int update, void *value)
 {
     struct key *p, *prev_p;
     long id = PyThread_get_thread_ident();
@@ -215,8 +215,11 @@
     PyThread_acquire_lock(keymutex, 1);
     prev_p = NULL;
     for (p = keyhead; p != NULL; p = p->next) {
-        if (p->id == id && p->key == key)
+        if (p->id == id && p->key == key) {
+            if (update)
+                p->value = value;
             goto Done;
+        }
         /* Sanity check.  These states should never happen but if
          * they do we must abort.  Otherwise we'll end up spinning in
          * in a tight loop with the lock held.  A similar check is done
@@ -227,7 +230,7 @@
         if (p->next == keyhead)
             Py_FatalError("tls find_key: circular list(!)");
     }
-    if (value == NULL) {
+    if (!update && value == NULL) {
         assert(p == NULL);
         goto Done;
     }
@@ -279,19 +282,12 @@
     PyThread_release_lock(keymutex);
 }
 
-/* Confusing:  If the current thread has an association for key,
- * value is ignored, and 0 is returned.  Else an attempt is made to create
- * an association of key to value for the current thread.  0 is returned
- * if that succeeds, but -1 is returned if there's not enough memory
- * to create the association.  value must not be NULL.
- */
 int
 PyThread_set_key_value(int key, void *value)
 {
     struct key *p;
 
-    assert(value != NULL);
-    p = find_key(key, value);
+    p = find_key(key, 1, value);
     if (p == NULL)
         return -1;
     else
@@ -304,7 +300,7 @@
 void *
 PyThread_get_key_value(int key)
 {
-    struct key *p = find_key(key, NULL);
+    struct key *p = find_key(key, 0, NULL);
 
     if (p == NULL)
         return NULL;
diff --git a/Python/thread_nt.h b/Python/thread_nt.h
index ab5a081..ee2079f 100644
--- a/Python/thread_nt.h
+++ b/Python/thread_nt.h
@@ -389,20 +389,11 @@
     TlsFree(key);
 }
 
-/* We must be careful to emulate the strange semantics implemented in thread.c,
- * where the value is only set if it hasn't been set before.
- */
 int
 PyThread_set_key_value(int key, void *value)
 {
     BOOL ok;
-    void *oldvalue;
 
-    assert(value != NULL);
-    oldvalue = TlsGetValue(key);
-    if (oldvalue != NULL)
-        /* ignore value if already set */
-        return 0;
     ok = TlsSetValue(key, value);
     if (!ok)
         return -1;
diff --git a/Python/thread_pthread.h b/Python/thread_pthread.h
index 20f8535..d9f7c76 100644
--- a/Python/thread_pthread.h
+++ b/Python/thread_pthread.h
@@ -627,9 +627,6 @@
 PyThread_set_key_value(int key, void *value)
 {
     int fail;
-    void *oldValue = pthread_getspecific(key);
-    if (oldValue != NULL)
-        return 0;
     fail = pthread_setspecific(key, value);
     return fail ? -1 : 0;
 }