Use numpy rather than Eigen for copying

We're current copy by creating an Eigen::Map into the input numpy
array, then assigning that to the basic eigen type, effectively having
Eigen do the copy.  That doesn't work for negative strides, though:
Eigen doesn't allow them.

This commit makes numpy do the copying instead by allocating the eigen
type, then having numpy copy from the input array into a numpy reference
into the eigen object's data.  This also saves a copy when type
conversion is required: numpy can do the conversion on-the-fly as part
of the copy.

Finally this commit also makes non-reference parameters respect the
convert flag, declining the load when called in a noconvert pass with a
convertible, but non-array input or an array with the wrong dtype.
diff --git a/docs/advanced/pycpp/numpy.rst b/docs/advanced/pycpp/numpy.rst
index 42f376c..d063706 100644
--- a/docs/advanced/pycpp/numpy.rst
+++ b/docs/advanced/pycpp/numpy.rst
@@ -41,8 +41,8 @@
                 py::format_descriptor<float>::format(), /* Python struct-style format descriptor */
                 2,                                      /* Number of dimensions */
                 { m.rows(), m.cols() },                 /* Buffer dimensions */
-                { (ssize_t)( sizeof(float) * m.rows() ),/* Strides (in bytes) for each index */
-                  (ssize_t)( sizeof(float) ) }
+                { sizeof(float) * m.rows(),             /* Strides (in bytes) for each index */
+                  sizeof(float) }
             );
         });
 
@@ -118,11 +118,10 @@
             /* Number of dimensions */
             2,
             /* Buffer dimensions */
-            { (size_t) m.rows(),
-              (size_t) m.cols() },
+            { m.rows(), m.cols() },
             /* Strides (in bytes) for each index */
-            { (ssize_t)( sizeof(Scalar) * (rowMajor ? m.cols() : 1) ),
-              (ssize_t)( sizeof(Scalar) * (rowMajor ? 1 : m.rows()) ) }
+            { sizeof(Scalar) * (rowMajor ? m.cols() : 1),
+              sizeof(Scalar) * (rowMajor ? 1 : m.rows()) }
         );
      })
 
diff --git a/include/pybind11/buffer_info.h b/include/pybind11/buffer_info.h
index e26e063..3090a42 100644
--- a/include/pybind11/buffer_info.h
+++ b/include/pybind11/buffer_info.h
@@ -21,12 +21,12 @@
     std::string format;          // For homogeneous buffers, this should be set to format_descriptor<T>::format()
     size_t ndim = 0;             // Number of dimensions
     std::vector<size_t> shape;   // Shape of the tensor (1 entry per dimension)
-    std::vector<size_t> strides; // Number of entries between adjacent entries (for each per dimension)
+    std::vector<ssize_t> strides; // Number of entries between adjacent entries (for each per dimension)
 
     buffer_info() { }
 
     buffer_info(void *ptr, size_t itemsize, const std::string &format, size_t ndim,
-                detail::any_container<size_t> shape_in, detail::any_container<size_t> strides_in)
+                detail::any_container<size_t> shape_in, detail::any_container<ssize_t> strides_in)
     : ptr(ptr), itemsize(itemsize), size(1), format(format), ndim(ndim),
       shape(std::move(shape_in)), strides(std::move(strides_in)) {
         if (ndim != shape.size() || ndim != strides.size())
diff --git a/include/pybind11/class_support.h b/include/pybind11/class_support.h
index 0f16c4a..99d1477 100644
--- a/include/pybind11/class_support.h
+++ b/include/pybind11/class_support.h
@@ -433,7 +433,7 @@
 #endif
     type->tp_flags |= Py_TPFLAGS_HAVE_GC;
     type->tp_dictoffset = type->tp_basicsize; // place dict at the end
-    type->tp_basicsize += sizeof(PyObject *); // and allocate enough space for it
+    type->tp_basicsize += (Py_ssize_t)sizeof(PyObject *); // and allocate enough space for it
     type->tp_traverse = pybind11_traverse;
     type->tp_clear = pybind11_clear;
 
@@ -459,16 +459,18 @@
     view->ndim = 1;
     view->internal = info;
     view->buf = info->ptr;
-    view->itemsize = (ssize_t) info->itemsize;
+    view->itemsize = (Py_ssize_t) info->itemsize;
     view->len = view->itemsize;
     for (auto s : info->shape)
-        view->len *= s;
+        view->len *= (Py_ssize_t) s;
     if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT)
         view->format = const_cast<char *>(info->format.c_str());
     if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
         view->ndim = (int) info->ndim;
-        view->strides = (ssize_t *) &info->strides[0];
-        view->shape = (ssize_t *) &info->shape[0];
+        view->strides = &info->strides[0];
+        // Next is a pointer cast, let's make sure it's safe.
+        static_assert(sizeof(Py_ssize_t)==sizeof(info->shape[0]), "sizeof(Py_ssize_t) != sizeof(size_t)");
+        view->shape = (Py_ssize_t *) &info->shape[0];
     }
     Py_INCREF(view->obj);
     return 0;
diff --git a/include/pybind11/eigen.h b/include/pybind11/eigen.h
index e6a6c92..8022a2c 100644
--- a/include/pybind11/eigen.h
+++ b/include/pybind11/eigen.h
@@ -249,8 +249,14 @@
     using Scalar = typename Type::Scalar;
     using props = EigenProps<Type>;
 
-    bool load(handle src, bool) {
-        auto buf = array_t<Scalar>::ensure(src);
+    bool load(handle src, bool convert) {
+        // If we're in no-convert mode, only load if given an array of the correct type
+        if (!convert && !isinstance<array_t<Scalar>>(src))
+            return false;
+
+        // Coerce into an array, but don't do type conversion yet; the copy below handles it.
+        auto buf = array::ensure(src);
+
         if (!buf)
             return false;
 
@@ -259,25 +265,16 @@
             return false;
 
         auto fits = props::conformable(buf);
-        if (!fits)
-            return false; // Non-comformable vector/matrix types
+        // Allocate the new type, then build a numpy reference into it
+        value = Type(fits.rows, fits.cols);
+        auto ref = reinterpret_steal<array>(eigen_ref_array<props>(value));
+        if (dims == 1) ref = ref.squeeze();
 
-        if (fits.negativestrides) {
+        int result = detail::npy_api::get().PyArray_CopyInto_(ref.ptr(), buf.ptr());
 
-            // Eigen does not support negative strides, so we need to make a copy here with normal strides.
-            // TODO: when Eigen bug #747 is fixed, remove this if case, always execute the else part.
-            // http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747
-            auto buf2 = array_t<Scalar,array::forcecast || array::f_style>::ensure(src);
-            if (!buf2)
-                return false;
-            // not checking sizes, we already did that
-            fits = props::conformable(buf2);
-            value = Eigen::Map<const Type, 0, EigenDStride>(buf2.data(), fits.rows, fits.cols, fits.stride);
-
-        } else {
-
-           value = Eigen::Map<const Type, 0, EigenDStride>(buf.data(), fits.rows, fits.cols, fits.stride);
-
+        if (result < 0) { // Copy failed!
+            PyErr_Clear();
+            return false;
         }
 
         return true;
diff --git a/include/pybind11/numpy.h b/include/pybind11/numpy.h
index c1205fe..0107366 100644
--- a/include/pybind11/numpy.h
+++ b/include/pybind11/numpy.h
@@ -152,6 +152,7 @@
         (PyTypeObject *, PyObject *, int, Py_intptr_t *,
          Py_intptr_t *, void *, int, PyObject *);
     PyObject *(*PyArray_DescrNewFromType_)(int);
+    int (*PyArray_CopyInto_)(PyObject *, PyObject *);
     PyObject *(*PyArray_NewCopy_)(PyObject *, int);
     PyTypeObject *PyArray_Type_;
     PyTypeObject *PyVoidArrType_Type_;
@@ -175,6 +176,7 @@
         API_PyArray_DescrFromScalar = 57,
         API_PyArray_FromAny = 69,
         API_PyArray_Resize = 80,
+        API_PyArray_CopyInto = 82,
         API_PyArray_NewCopy = 85,
         API_PyArray_NewFromDescr = 94,
         API_PyArray_DescrNewFromType = 9,
@@ -205,6 +207,7 @@
         DECL_NPY_API(PyArray_DescrFromScalar);
         DECL_NPY_API(PyArray_FromAny);
         DECL_NPY_API(PyArray_Resize);
+        DECL_NPY_API(PyArray_CopyInto);
         DECL_NPY_API(PyArray_NewCopy);
         DECL_NPY_API(PyArray_NewFromDescr);
         DECL_NPY_API(PyArray_DescrNewFromType);
diff --git a/include/pybind11/stl_bind.h b/include/pybind11/stl_bind.h
index 4cae4e0..7bba090 100644
--- a/include/pybind11/stl_bind.h
+++ b/include/pybind11/stl_bind.h
@@ -358,7 +358,7 @@
         vec.reserve(info.shape[0]);
         T *p = static_cast<T*>(info.ptr);
         auto step = info.strides[0] / static_cast<ssize_t>(sizeof(T));
-        T *end = p + info.shape[0] * step;
+        T *end = p + static_cast<ssize_t>(info.shape[0]) * step;
         for (; p < end; p += step)
             vec.push_back(*p);
     });
diff --git a/tests/test_numpy_array.cpp b/tests/test_numpy_array.cpp
index e431e09..2e7004e 100644
--- a/tests/test_numpy_array.cpp
+++ b/tests/test_numpy_array.cpp
@@ -20,11 +20,11 @@
 static_assert(std::is_same<arr_t::value_type, uint16_t>::value, "");
 
 template<typename... Ix> arr data(const arr& a, Ix... index) {
-    return arr(a.nbytes() - a.offset_at(index...), (const uint8_t *) a.data(index...));
+    return arr(a.nbytes() - size_t(a.offset_at(index...)), (const uint8_t *) a.data(index...));
 }
 
 template<typename... Ix> arr data_t(const arr_t& a, Ix... index) {
-    return arr(a.size() - a.index_at(index...), a.data(index...));
+    return arr(a.size() - size_t(a.index_at(index...)), a.data(index...));
 }
 
 arr& mutate_data(arr& a) {
@@ -43,23 +43,23 @@
 
 template<typename... Ix> arr& mutate_data(arr& a, Ix... index) {
     auto ptr = (uint8_t *) a.mutable_data(index...);
-    for (size_t i = 0; i < a.nbytes() - a.offset_at(index...); i++)
+    for (size_t i = 0; i < a.nbytes() - size_t(a.offset_at(index...)); i++)
         ptr[i] = (uint8_t) (ptr[i] * 2);
     return a;
 }
 
 template<typename... Ix> arr_t& mutate_data_t(arr_t& a, Ix... index) {
     auto ptr = a.mutable_data(index...);
-    for (size_t i = 0; i < a.size() - a.index_at(index...); i++)
+    for (size_t i = 0; i < a.size() - size_t(a.index_at(index...)); i++)
         ptr[i]++;
     return a;
 }
 
-template<typename... Ix> size_t index_at(const arr& a, Ix... idx) { return a.index_at(idx...); }
-template<typename... Ix> size_t index_at_t(const arr_t& a, Ix... idx) { return a.index_at(idx...); }
-template<typename... Ix> size_t offset_at(const arr& a, Ix... idx) { return a.offset_at(idx...); }
-template<typename... Ix> size_t offset_at_t(const arr_t& a, Ix... idx) { return a.offset_at(idx...); }
-template<typename... Ix> size_t at_t(const arr_t& a, Ix... idx) { return a.at(idx...); }
+template<typename... Ix> py::ssize_t index_at(const arr& a, Ix... idx) { return a.index_at(idx...); }
+template<typename... Ix> py::ssize_t index_at_t(const arr_t& a, Ix... idx) { return a.index_at(idx...); }
+template<typename... Ix> py::ssize_t offset_at(const arr& a, Ix... idx) { return a.offset_at(idx...); }
+template<typename... Ix> py::ssize_t offset_at_t(const arr_t& a, Ix... idx) { return a.offset_at(idx...); }
+template<typename... Ix> py::ssize_t at_t(const arr_t& a, Ix... idx) { return a.at(idx...); }
 template<typename... Ix> arr_t& mutate_at_t(arr_t& a, Ix... idx) { a.mutable_at(idx...)++; return a; }
 
 #define def_index_fn(name, type) \