Fix GIL release and acquire when embedding the interpreter

Fixes a race condition when multiple threads try to acquire the GIL
before `detail::internals` have been initialized. `gil_scoped_release`
is now tasked with initializing `internals` (guaranteed single-threaded)
to ensure the safety of subsequent `acquire` calls from multiple threads.
diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h
index 2bfd81c..08268b8 100644
--- a/include/pybind11/pybind11.h
+++ b/include/pybind11/pybind11.h
@@ -1661,9 +1661,13 @@
 class gil_scoped_release {
 public:
     explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) {
+        // `get_internals()` must be called here unconditionally in order to initialize
+        // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an
+        // initialization race could occur as multiple threads try `gil_scoped_acquire`.
+        const auto &internals = detail::get_internals();
         tstate = PyEval_SaveThread();
         if (disassoc) {
-            auto key = detail::get_internals().tstate;
+            auto key = internals.tstate;
             #if PY_MAJOR_VERSION < 3
                 PyThread_delete_key_value(key);
             #else
diff --git a/tests/test_embed/CMakeLists.txt b/tests/test_embed/CMakeLists.txt
index a651031..0a43e0e 100644
--- a/tests/test_embed/CMakeLists.txt
+++ b/tests/test_embed/CMakeLists.txt
@@ -26,6 +26,9 @@
   target_link_libraries(test_embed PRIVATE ${PYTHON_LIBRARIES})
 endif()
 
+find_package(Threads REQUIRED)
+target_link_libraries(test_embed PUBLIC ${CMAKE_THREAD_LIBS_INIT})
+
 add_custom_target(cpptest COMMAND $<TARGET_FILE:test_embed>
                   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 add_dependencies(check cpptest)
diff --git a/tests/test_embed/test_interpreter.cpp b/tests/test_embed/test_interpreter.cpp
index 58f791a..2d5823c 100644
--- a/tests/test_embed/test_interpreter.cpp
+++ b/tests/test_embed/test_interpreter.cpp
@@ -1,6 +1,8 @@
 #include <pybind11/embed.h>
 #include <catch.hpp>
 
+#include <thread>
+
 namespace py = pybind11;
 using namespace py::literals;
 
@@ -185,3 +187,32 @@
     py::exec("var = dict(number=42)");
     REQUIRE(py::globals()["var"]["number"].cast<int>() == 42);
 }
+
+TEST_CASE("Threads") {
+    // Restart interpreter to ensure threads are not initialized
+    py::finalize_interpreter();
+    py::initialize_interpreter();
+    REQUIRE_FALSE(has_pybind11_internals_static());
+
+    constexpr auto num_threads = 10;
+    auto locals = py::dict("count"_a=0);
+
+    {
+        py::gil_scoped_release gil_release{};
+        REQUIRE(has_pybind11_internals_static());
+
+        auto threads = std::vector<std::thread>();
+        for (auto i = 0; i < num_threads; ++i) {
+            threads.emplace_back([&]() {
+                py::gil_scoped_acquire gil{};
+                py::exec("count += 1", py::globals(), locals);
+            });
+        }
+
+        for (auto &thread : threads) {
+            thread.join();
+        }
+    }
+
+    REQUIRE(locals["count"].cast<int>() == num_threads);
+}