Use numpy rather than Eigen for copying

We currently copy by creating an Eigen::Map into the input numpy
array, then assigning that to the basic eigen type, effectively having
Eigen do the copy.  That doesn't work for negative strides, though:
Eigen doesn't allow them.

This commit makes numpy do the copying instead by allocating the eigen
type, then having numpy copy from the input array into a numpy reference
into the eigen object's data.  This also saves a copy when type
conversion is required: numpy can do the conversion on-the-fly as part
of the copy.

Finally, this commit also makes non-reference parameters respect the
convert flag, declining the load when called in a noconvert pass with a
convertible but non-array input, or with an array of the wrong dtype.
diff --git a/tests/test_numpy_array.cpp b/tests/test_numpy_array.cpp
index e431e09..2e7004e 100644
--- a/tests/test_numpy_array.cpp
+++ b/tests/test_numpy_array.cpp
@@ -20,11 +20,11 @@
 static_assert(std::is_same<arr_t::value_type, uint16_t>::value, "");
 
 template<typename... Ix> arr data(const arr& a, Ix... index) {
-    return arr(a.nbytes() - a.offset_at(index...), (const uint8_t *) a.data(index...));
+    return arr(a.nbytes() - size_t(a.offset_at(index...)), (const uint8_t *) a.data(index...));
 }
 
 template<typename... Ix> arr data_t(const arr_t& a, Ix... index) {
-    return arr(a.size() - a.index_at(index...), a.data(index...));
+    return arr(a.size() - size_t(a.index_at(index...)), a.data(index...));
 }
 
 arr& mutate_data(arr& a) {
@@ -43,23 +43,23 @@
 
 template<typename... Ix> arr& mutate_data(arr& a, Ix... index) {
     auto ptr = (uint8_t *) a.mutable_data(index...);
-    for (size_t i = 0; i < a.nbytes() - a.offset_at(index...); i++)
+    for (size_t i = 0; i < a.nbytes() - size_t(a.offset_at(index...)); i++)
         ptr[i] = (uint8_t) (ptr[i] * 2);
     return a;
 }
 
 template<typename... Ix> arr_t& mutate_data_t(arr_t& a, Ix... index) {
     auto ptr = a.mutable_data(index...);
-    for (size_t i = 0; i < a.size() - a.index_at(index...); i++)
+    for (size_t i = 0; i < a.size() - size_t(a.index_at(index...)); i++)
         ptr[i]++;
     return a;
 }
 
-template<typename... Ix> size_t index_at(const arr& a, Ix... idx) { return a.index_at(idx...); }
-template<typename... Ix> size_t index_at_t(const arr_t& a, Ix... idx) { return a.index_at(idx...); }
-template<typename... Ix> size_t offset_at(const arr& a, Ix... idx) { return a.offset_at(idx...); }
-template<typename... Ix> size_t offset_at_t(const arr_t& a, Ix... idx) { return a.offset_at(idx...); }
-template<typename... Ix> size_t at_t(const arr_t& a, Ix... idx) { return a.at(idx...); }
+template<typename... Ix> py::ssize_t index_at(const arr& a, Ix... idx) { return a.index_at(idx...); }
+template<typename... Ix> py::ssize_t index_at_t(const arr_t& a, Ix... idx) { return a.index_at(idx...); }
+template<typename... Ix> py::ssize_t offset_at(const arr& a, Ix... idx) { return a.offset_at(idx...); }
+template<typename... Ix> py::ssize_t offset_at_t(const arr_t& a, Ix... idx) { return a.offset_at(idx...); }
+template<typename... Ix> py::ssize_t at_t(const arr_t& a, Ix... idx) { return a.at(idx...); }
 template<typename... Ix> arr_t& mutate_at_t(arr_t& a, Ix... idx) { a.mutable_at(idx...)++; return a; }
 
 #define def_index_fn(name, type) \