Fix async Python functors invoking from multiple C++ threads (#1587) (#1595)

* Fix async Python functors invoking from multiple C++ threads (#1587)

Ensure GIL is held during functor destruction.

* Add async Python callbacks test that runs in separate Python thread
diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py
index 93c42c2..6439c8e 100644
--- a/tests/test_callbacks.py
+++ b/tests/test_callbacks.py
@@ -1,5 +1,6 @@
 import pytest
 from pybind11_tests import callbacks as m
+from threading import Thread
 
 
 def test_callbacks():
@@ -105,3 +106,31 @@
 
 def test_movable_object():
     assert m.callback_with_movable(lambda _: None) is True
+
+
+def test_async_callbacks():
+    # serves as state for async callback
+    class Item:
+        def __init__(self, value):
+            self.value = value
+
+    res = []
+
+    # generate stateful lambda that will store result in `res`
+    def gen_f():
+        s = Item(3)
+        return lambda j: res.append(s.value + j)
+
+    # do some work async
+    work = [1, 2, 3, 4]
+    m.test_async_callback(gen_f(), work)
+    # wait until work is done
+    from time import sleep
+    sleep(0.5)
+    assert sum(res) == sum([x + 3 for x in work])
+
+
+def test_async_async_callbacks():
+    t = Thread(target=test_async_callbacks)
+    t.start()
+    t.join()