Allow module-local classes to be loaded externally

The main point of `py::module_local` is to make the C++ -> Python cast
unique so that returning/casting a C++ instance is well-defined.
Unfortunately it also makes loading unique, but this isn't particularly
desirable: when an instance contains `Type` instance there's no reason
it shouldn't be possible to pass that instance to a bound function
taking a `Type` parameter, even if that function is in another module.

This commit solves the issue by allowing foreign module (and global)
type loaders have a chance to load the value if the local module loader
fails.  The implementation here does this by storing a module-local
loading function in a capsule in the python type, which we can then call
if the local (and possibly global, if the local type is masking a global
type) version doesn't work.
diff --git a/tests/local_bindings.h b/tests/local_bindings.h
index 06a56fc..8f48ed9 100644
--- a/tests/local_bindings.h
+++ b/tests/local_bindings.h
@@ -18,7 +18,7 @@
 using LocalExternal = LocalBase<3>;
 /// Mixed: registered local first, then global
 using MixedLocalGlobal = LocalBase<4>;
-/// Mixed: global first, then local (which fails)
+/// Mixed: global first, then local
 using MixedGlobalLocal = LocalBase<5>;
 
 using LocalVec = std::vector<LocalType>;
@@ -29,6 +29,15 @@
 using NonLocalMap = std::unordered_map<std::string, NonLocalType>;
 using NonLocalMap2 = std::unordered_map<std::string, uint8_t>;
 
+PYBIND11_MAKE_OPAQUE(LocalVec);
+PYBIND11_MAKE_OPAQUE(LocalVec2);
+PYBIND11_MAKE_OPAQUE(LocalMap);
+PYBIND11_MAKE_OPAQUE(NonLocalVec);
+//PYBIND11_MAKE_OPAQUE(NonLocalVec2); // same type as LocalVec2
+PYBIND11_MAKE_OPAQUE(NonLocalMap);
+PYBIND11_MAKE_OPAQUE(NonLocalMap2);
+
+
 // Simple bindings (used with the above):
 template <typename T, int Adjust, typename... Args>
 py::class_<T> bind_local(Args && ...args) {
@@ -36,3 +45,16 @@
         .def(py::init<int>())
         .def("get", [](T &i) { return i.i + Adjust; });
 };
+
+// Simulate a foreign library base class (to match the example in the docs):
+namespace pets {
+class Pet {
+public:
+    Pet(std::string name) : name_(name) {}
+    std::string name_;
+    const std::string &name() { return name_; }
+};
+}
+
+struct MixGL { int i; MixGL(int i) : i{i} {} };
+struct MixGL2 { int i; MixGL2(int i) : i{i} {} };
diff --git a/tests/pybind11_cross_module_tests.cpp b/tests/pybind11_cross_module_tests.cpp
index 252f893..2091624 100644
--- a/tests/pybind11_cross_module_tests.cpp
+++ b/tests/pybind11_cross_module_tests.cpp
@@ -87,4 +87,21 @@
     m.def("load_vector_via_binding", [](std::vector<int> &v) {
         return std::accumulate(v.begin(), v.end(), 0);
     });
+
+    // test_cross_module_calls
+    m.def("return_self", [](LocalVec *v) { return v; });
+    m.def("return_copy", [](const LocalVec &v) { return LocalVec(v); });
+
+    class Dog : public pets::Pet { public: Dog(std::string name) : Pet(name) {}; };
+    py::class_<pets::Pet>(m, "Pet", py::module_local())
+        .def("name", &pets::Pet::name);
+    // Binding for local extending class:
+    py::class_<Dog, pets::Pet>(m, "Dog")
+        .def(py::init<std::string>());
+    m.def("pet_name", [](pets::Pet &p) { return p.name(); });
+
+    py::class_<MixGL>(m, "MixGL", py::module_local()).def(py::init<int>());
+    m.def("get_gl_value", [](MixGL &o) { return o.i + 100; });
+
+    py::class_<MixGL2>(m, "MixGL2", py::module_local()).def(py::init<int>());
 }
diff --git a/tests/test_local_bindings.cpp b/tests/test_local_bindings.cpp
index fdb67a1..359d6c6 100644
--- a/tests/test_local_bindings.cpp
+++ b/tests/test_local_bindings.cpp
@@ -14,13 +14,6 @@
 #include <pybind11/stl_bind.h>
 #include <numeric>
 
-PYBIND11_MAKE_OPAQUE(LocalVec);
-PYBIND11_MAKE_OPAQUE(LocalVec2);
-PYBIND11_MAKE_OPAQUE(LocalMap);
-PYBIND11_MAKE_OPAQUE(NonLocalVec);
-PYBIND11_MAKE_OPAQUE(NonLocalMap);
-PYBIND11_MAKE_OPAQUE(NonLocalMap2);
-
 TEST_SUBMODULE(local_bindings, m) {
     // test_local_bindings
     // Register a class with py::module_local:
@@ -84,4 +77,21 @@
     m.def("load_vector_via_caster", [](std::vector<int> v) {
         return std::accumulate(v.begin(), v.end(), 0);
     });
+
+    // test_cross_module_calls
+    m.def("return_self", [](LocalVec *v) { return v; });
+    m.def("return_copy", [](const LocalVec &v) { return LocalVec(v); });
+
+    class Cat : public pets::Pet { public: Cat(std::string name) : Pet(name) {}; };
+    py::class_<pets::Pet>(m, "Pet", py::module_local())
+        .def("get_name", &pets::Pet::name);
+    // Binding for local extending class:
+    py::class_<Cat, pets::Pet>(m, "Cat")
+        .def(py::init<std::string>());
+    m.def("pet_name", [](pets::Pet &p) { return p.name(); });
+
+    py::class_<MixGL>(m, "MixGL").def(py::init<int>());
+    m.def("get_gl_value", [](MixGL &o) { return o.i + 10; });
+
+    py::class_<MixGL2>(m, "MixGL2").def(py::init<int>());
 }
diff --git a/tests/test_local_bindings.py b/tests/test_local_bindings.py
index 3a6ad8b..e9a63ca 100644
--- a/tests/test_local_bindings.py
+++ b/tests/test_local_bindings.py
@@ -20,16 +20,14 @@
     assert not hasattr(i1, 'get2')
     assert not hasattr(i2, 'get3')
 
+    # Loading within the local module
     assert m.local_value(i1) == 5
     assert cm.local_value(i2) == 10
 
-    with pytest.raises(TypeError) as excinfo:
-        m.local_value(i2)
-    assert "incompatible function arguments" in str(excinfo.value)
-
-    with pytest.raises(TypeError) as excinfo:
-        cm.local_value(i1)
-    assert "incompatible function arguments" in str(excinfo.value)
+    # Cross-module loading works as well (on failure, the type loader looks for
+    # external module-local converters):
+    assert m.local_value(i2) == 10
+    assert cm.local_value(i1) == 5
 
 
 def test_nonlocal_failure():
@@ -60,13 +58,12 @@
     v2.append(cm.LocalType(1))
     v2.append(cm.LocalType(2))
 
-    with pytest.raises(TypeError):
-        v1.append(cm.LocalType(3))
-    with pytest.raises(TypeError):
-        v2.append(m.LocalType(3))
+    # Cross module value loading:
+    v1.append(cm.LocalType(3))
+    v2.append(m.LocalType(3))
 
-    assert [i.get() for i in v1] == [0, 1]
-    assert [i.get() for i in v2] == [2, 3]
+    assert [i.get() for i in v1] == [0, 1, 2]
+    assert [i.get() for i in v2] == [2, 3, 4]
 
     v3, v4 = m.NonLocalVec(), cm.NonLocalVec2()
     v3.append(m.NonLocalType(1))
@@ -158,3 +155,56 @@
 
     Invoked with: [1, 2, 3]
     """  # noqa: E501 line too long
+
+
+def test_cross_module_calls():
+    import pybind11_cross_module_tests as cm
+
+    v1 = m.LocalVec()
+    v1.append(m.LocalType(1))
+    v2 = cm.LocalVec()
+    v2.append(cm.LocalType(2))
+
+    # Returning the self pointer should get picked up as returning an existing
+    # instance (even when that instance is of a foreign, non-local type).
+    assert m.return_self(v1) is v1
+    assert cm.return_self(v2) is v2
+    assert m.return_self(v2) is v2
+    assert cm.return_self(v1) is v1
+
+    assert m.LocalVec is not cm.LocalVec
+    # Returning a copy, on the other hand, always goes to the local type,
+    # regardless of where the source type came from.
+    assert type(m.return_copy(v1)) is m.LocalVec
+    assert type(m.return_copy(v2)) is m.LocalVec
+    assert type(cm.return_copy(v1)) is cm.LocalVec
+    assert type(cm.return_copy(v2)) is cm.LocalVec
+
+    # Test the example given in the documentation (which also tests inheritance casting):
+    mycat = m.Cat("Fluffy")
+    mydog = cm.Dog("Rover")
+    assert mycat.get_name() == "Fluffy"
+    assert mydog.name() == "Rover"
+    assert m.Cat.__base__.__name__ == "Pet"
+    assert cm.Dog.__base__.__name__ == "Pet"
+    assert m.Cat.__base__ is not cm.Dog.__base__
+    assert m.pet_name(mycat) == "Fluffy"
+    assert m.pet_name(mydog) == "Rover"
+    assert cm.pet_name(mycat) == "Fluffy"
+    assert cm.pet_name(mydog) == "Rover"
+
+    assert m.MixGL is not cm.MixGL
+    a = m.MixGL(1)
+    b = cm.MixGL(2)
+    assert m.get_gl_value(a) == 11
+    assert m.get_gl_value(b) == 12
+    assert cm.get_gl_value(a) == 101
+    assert cm.get_gl_value(b) == 102
+
+    c, d = m.MixGL2(3), cm.MixGL2(4)
+    with pytest.raises(TypeError) as excinfo:
+        m.get_gl_value(c)
+    assert "incompatible function arguments" in str(excinfo)
+    with pytest.raises(TypeError) as excinfo:
+        m.get_gl_value(d)
+    assert "incompatible function arguments" in str(excinfo)