fix: support nvcc and test (#2461)

* fix: support nvcc and test

* fixup! fix: support nvcc and test

* docs: mention what compilers fail

* fix: much simpler logic

* refactor: slightly faster / clearer
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1ebc9dc..3104f49 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -289,6 +289,28 @@
     - name: Interface test
       run: cmake --build build --target test_cmake_build
 
+  cuda:
+    runs-on: ubuntu-latest
+    name: "🐍 3.8 • CUDA 11 • Ubuntu 20.04"
+    container: nvidia/cuda:11.0-devel-ubuntu20.04
+
+    steps:
+    - uses: actions/checkout@v2
+
+    # tzdata prompts for a timezone during install, so set DEBIAN_FRONTEND to noninteractive
+    - name: Install 🐍 3
+      run: apt-get update && DEBIAN_FRONTEND="noninteractive" apt-get install -y cmake python3-dev python3-pytest
+
+    - name: Configure
+      run: cmake -S . -B build -DPYBIND11_CUDA_TESTS=ON -DPYBIND11_WERROR=ON -DDOWNLOAD_CATCH=ON
+
+    - name: Build
+      run: cmake --build build -j2 -v
+
+    - name: Python tests
+      run: cmake --build build --target pytest
+
+
   install-classic:
     name: "🐍 3.5 • Debian • x86 •  Install"
     runs-on: ubuntu-latest
diff --git a/README.md b/README.md
index bae6cf2..633231f 100644
--- a/README.md
+++ b/README.md
@@ -98,6 +98,7 @@
 4. Intel C++ compiler 17 or newer (16 with pybind11 v2.0 and 15 with pybind11
    v2.0 and a [workaround][intel-15-workaround])
 5. Cygwin/GCC (tested on 2.5.1)
+6. NVCC (CUDA 11 tested)
 
 ## About
 
diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h
index bae69c8..be62610 100644
--- a/include/pybind11/cast.h
+++ b/include/pybind11/cast.h
@@ -1006,6 +1006,7 @@
     std::is_same<CharT, wchar_t> /* std::wstring */
 >;
 
+
 template <typename T>
 struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_type<T>::value>> {
     using _py_type_0 = conditional_t<sizeof(T) <= sizeof(long), long, long long>;
@@ -1034,12 +1035,12 @@
                 : (py_type) PYBIND11_LONG_AS_LONGLONG(src.ptr());
         }
 
+        // Python API reported an error
         bool py_err = py_value == (py_type) -1 && PyErr_Occurred();
 
-        // Protect std::numeric_limits::min/max with parentheses
-        if (py_err || (std::is_integral<T>::value && sizeof(py_type) != sizeof(T) &&
-                       (py_value < (py_type) (std::numeric_limits<T>::min)() ||
-                        py_value > (py_type) (std::numeric_limits<T>::max)()))) {
+        // Reject integers that do not survive a round trip through T (they must match exactly).
+        // Signed/unsigned checks happen elsewhere
+        if (py_err || (std::is_integral<T>::value && sizeof(py_type) != sizeof(T) && py_value != (py_type) (T) py_value)) {
             bool type_error = py_err && PyErr_ExceptionMatches(
 #if PY_VERSION_HEX < 0x03000000 && !defined(PYPY_VERSION)
                 PyExc_SystemError
diff --git a/include/pybind11/numpy.h b/include/pybind11/numpy.h
index 674450a..0192a8b 100644
--- a/include/pybind11/numpy.h
+++ b/include/pybind11/numpy.h
@@ -1483,7 +1483,14 @@
 
 template <typename Func, typename Return, typename... Args>
 struct vectorize_helper {
+
+// NVCC fails to compile this struct if NVectorized is private, so make the members public there
+#ifdef __CUDACC__
+public:
+#else
 private:
+#endif
+
     static constexpr size_t N = sizeof...(Args);
     static constexpr size_t NVectorized = constexpr_sum(vectorize_arg<Args>::vectorize...);
     static_assert(NVectorized >= 1,
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 72de210..54f13fd 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -30,6 +30,7 @@
 
 option(PYBIND11_WERROR "Report all warnings as errors" OFF)
 option(DOWNLOAD_EIGEN "Download EIGEN (requires CMake 3.11+)" OFF)
+option(PYBIND11_CUDA_TESTS "Enable building CUDA tests (requires CMake 3.12+)" OFF)
 set(PYBIND11_TEST_OVERRIDE
     ""
     CACHE STRING "Tests from ;-separated list of *.cpp files will be built instead of all tests")
@@ -49,6 +50,14 @@
                                                "RelWithDebInfo")
 endif()
 
+if(PYBIND11_CUDA_TESTS)
+  enable_language(CUDA)
+  if(DEFINED CMAKE_CXX_STANDARD)
+    set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
+  endif()
+  set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+endif()
+
 # Full set of test files (you can override these; see below)
 set(PYBIND11_TEST_FILES
     test_async.cpp
@@ -104,6 +113,16 @@
   list(REMOVE_AT PYBIND11_TEST_FILES ${PYBIND11_TEST_FILES_ASYNC_I})
 endif()
 
+# Skip test_constants_and_functions when building the tests with CUDA; NVCC reports:
+# /pybind11/tests/test_constants_and_functions.cpp(125):
+#   error: incompatible exception specifications
+list(FIND PYBIND11_TEST_FILES test_constants_and_functions.cpp PYBIND11_TEST_FILES_CAF_I)
+if((PYBIND11_TEST_FILES_CAF_I GREATER -1) AND PYBIND11_CUDA_TESTS)
+  message(
+    STATUS "Skipping test_constants_and_functions due to incompatible exception specifications")
+  list(REMOVE_AT PYBIND11_TEST_FILES ${PYBIND11_TEST_FILES_CAF_I})
+endif()
+
 string(REPLACE ".cpp" ".py" PYBIND11_PYTEST_FILES "${PYBIND11_TEST_FILES}")
 
 # Contains the set of test files that require pybind11_cross_module_tests to be
@@ -195,7 +214,7 @@
 function(pybind11_enable_warnings target_name)
   if(MSVC)
     target_compile_options(${target_name} PRIVATE /W4)
-  elseif(CMAKE_CXX_COMPILER_ID MATCHES "(GNU|Intel|Clang)")
+  elseif(CMAKE_CXX_COMPILER_ID MATCHES "(GNU|Intel|Clang)" AND NOT PYBIND11_CUDA_TESTS)
     target_compile_options(${target_name} PRIVATE -Wall -Wextra -Wconversion -Wcast-qual
                                                   -Wdeprecated)
   endif()
@@ -203,6 +222,8 @@
   if(PYBIND11_WERROR)
     if(MSVC)
       target_compile_options(${target_name} PRIVATE /WX)
+    elseif(PYBIND11_CUDA_TESTS)
+      target_compile_options(${target_name} PRIVATE "SHELL:-Werror all-warnings")
     elseif(CMAKE_CXX_COMPILER_ID MATCHES "(GNU|Intel|Clang)")
       target_compile_options(${target_name} PRIVATE -Werror)
     endif()
@@ -239,12 +260,22 @@
   endif()
 endforeach()
 
+# Support CUDA testing by forcing the shared test files to compile with NVCC
+if(PYBIND11_CUDA_TESTS)
+  set_property(SOURCE ${PYBIND11_TEST_FILES} PROPERTY LANGUAGE CUDA)
+endif()
+
 foreach(target ${test_targets})
   set(test_files ${PYBIND11_TEST_FILES})
   if(NOT "${target}" STREQUAL "pybind11_tests")
     set(test_files "")
   endif()
 
+  # Support CUDA testing by forcing the target file to compile with NVCC
+  if(PYBIND11_CUDA_TESTS)
+    set_property(SOURCE ${target}.cpp PROPERTY LANGUAGE CUDA)
+  endif()
+
   # Create the binding library
   pybind11_add_module(${target} THIN_LTO ${target}.cpp ${test_files} ${PYBIND11_HEADERS})
   pybind11_enable_warnings(${target})
@@ -354,8 +385,10 @@
     $<TARGET_FILE:pybind11_tests>
     ${CMAKE_CURRENT_BINARY_DIR}/sosize-$<TARGET_FILE_NAME:pybind11_tests>.txt)
 
-# Test embedding the interpreter. Provides the `cpptest` target.
-add_subdirectory(test_embed)
+if(NOT PYBIND11_CUDA_TESTS)
+  # Test embedding the interpreter. Provides the `cpptest` target.
+  add_subdirectory(test_embed)
 
-# Test CMake build using functions and targets from subdirectory or installed location
-add_subdirectory(test_cmake_build)
+  # Test CMake build using functions and targets from subdirectory or installed location
+  add_subdirectory(test_cmake_build)
+endif()
diff --git a/tests/test_constants_and_functions.py b/tests/test_constants_and_functions.py
index 36b1aa6..b980ccf 100644
--- a/tests/test_constants_and_functions.py
+++ b/tests/test_constants_and_functions.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
-from pybind11_tests import constants_and_functions as m
+import pytest
+
+m = pytest.importorskip("pybind11_tests.constants_and_functions")
 
 
 def test_constants():
diff --git a/tests/test_copy_move.cpp b/tests/test_copy_move.cpp
index 0f698bd..34f1c61 100644
--- a/tests/test_copy_move.cpp
+++ b/tests/test_copy_move.cpp
@@ -175,14 +175,20 @@
     m.attr("has_optional") = false;
 #endif
 
-    // #70 compilation issue if operator new is not public
+    // #70 compilation issue if operator new is not public. A simple body is
+    // provided even though most compilers do not need one: MSVC and nvcc
+    // complain when a local struct declares a member function without
+    // defining it, since the definition cannot be added later.
     struct PrivateOpNew {
         int value = 1;
     private:
-#if defined(_MSC_VER)
-#  pragma warning(disable: 4822) // warning C4822: local class member function does not have a body
-#endif
-        void *operator new(size_t bytes);
+        void *operator new(size_t bytes) {
+            void *ptr = std::malloc(bytes);
+            if (ptr)
+                return ptr;
+            else
+                throw std::bad_alloc{};
+        }
     };
     py::class_<PrivateOpNew>(m, "PrivateOpNew").def_readonly("value", &PrivateOpNew::value);
     m.def("private_op_new_value", []() { return PrivateOpNew(); });
diff --git a/tests/test_virtual_functions.cpp b/tests/test_virtual_functions.cpp
index 583c1e6..6dcf294 100644
--- a/tests/test_virtual_functions.cpp
+++ b/tests/test_virtual_functions.cpp
@@ -139,7 +139,7 @@
     std::string print_movable(int a, int b) { return get_movable(a, b).get_value(); }
 };
 class NCVirtTrampoline : public NCVirt {
-#if !defined(__INTEL_COMPILER)
+#if !defined(__INTEL_COMPILER) && !defined(__CUDACC__)
     NonCopyable get_noncopyable(int a, int b) override {
         PYBIND11_OVERLOAD(NonCopyable, NCVirt, get_noncopyable, a, b);
     }
@@ -205,7 +205,7 @@
         .def(py::init<int, int>());
 
     // test_move_support
-#if !defined(__INTEL_COMPILER)
+#if !defined(__INTEL_COMPILER) && !defined(__CUDACC__)
     py::class_<NCVirt, NCVirtTrampoline>(m, "NCVirt")
         .def(py::init<>())
         .def("get_noncopyable", &NCVirt::get_noncopyable)