Fix occassional segfault introduced by #960

The fix for #960 could result a type being registered multiple times if
its `__init__` is called multiple times.  This can happen perfectly
ordinarily when python-side multiple inheritance is involved: for
example, with a diamond inheritance pattern with each intermediate
classes invoking the parent constructor.

With the change in #960, the multiple `__init__` calls meant
`register_instance` was called multiple times, but the deletion only
deleted it once.  Thus, if a future instance of the same type was
allocated at the same location, pybind would pick it up as a registered
type.

This fixes the issue by tracking whether a value pointer has been
registered to avoid both double-registering it.  (There's also a slight
optimization of not needing to do a registered_instances lookup when the
type is known not registered, but this is secondary).
diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h
index 9c171ae..5db03e2 100644
--- a/include/pybind11/cast.h
+++ b/include/pybind11/cast.h
@@ -288,13 +288,24 @@
     bool holder_constructed() const {
         return inst->simple_layout
             ? inst->simple_holder_constructed
-            : inst->nonsimple.holder_constructed[index];
+            : inst->nonsimple.status[index] & instance::status_holder_constructed;
     }
     void set_holder_constructed() {
         if (inst->simple_layout)
             inst->simple_holder_constructed = true;
         else
-            inst->nonsimple.holder_constructed[index] = true;
+            inst->nonsimple.status[index] |= instance::status_holder_constructed;
+    }
+    bool instance_registered() const {
+        return inst->simple_layout
+            ? inst->simple_instance_registered
+            : inst->nonsimple.status[index] & instance::status_instance_registered;
+    }
+    void set_instance_registered() {
+        if (inst->simple_layout)
+            inst->simple_instance_registered = true;
+        else
+            inst->nonsimple.status[index] |= instance::status_instance_registered;
     }
 };
 
@@ -395,6 +406,7 @@
     if (simple_layout) {
         simple_value_holder[0] = nullptr;
         simple_holder_constructed = false;
+        simple_instance_registered = false;
     }
     else { // multiple base types or a too-large holder
         // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a value pointer,
@@ -407,7 +419,7 @@
             space += t->holder_size_in_ptrs; // holder instance
         }
         size_t flags_at = space;
-        space += size_in_ptrs(n_types * sizeof(bool)); // holder constructed flags
+        space += size_in_ptrs(n_types); // status bytes (holder_constructed and instance_registered)
 
         // Allocate space for flags, values, and holders, and initialize it to 0 (flags and values,
         // in particular, need to be 0).  Use Python's memory allocation functions: in Python 3.6
@@ -422,7 +434,7 @@
         if (!nonsimple.values_and_holders) throw std::bad_alloc();
         std::memset(nonsimple.values_and_holders, 0, space * sizeof(void *));
 #endif
-        nonsimple.holder_constructed = reinterpret_cast<bool *>(&nonsimple.values_and_holders[flags_at]);
+        nonsimple.status = reinterpret_cast<uint8_t *>(&nonsimple.values_and_holders[flags_at]);
     }
     owned = true;
 }
diff --git a/include/pybind11/class_support.h b/include/pybind11/class_support.h
index aee1591..8e18c4c 100644
--- a/include/pybind11/class_support.h
+++ b/include/pybind11/class_support.h
@@ -234,8 +234,8 @@
 
 /// Instance creation function for all pybind11 types. It only allocates space for the C++ object
 /// (or multiple objects, for Python-side inheritance from multiple pybind11 types), but doesn't
-/// call the constructor -- an `__init__` function must do that.  `register_instance` will need to
-/// be called after the object has been initialized.
+/// call the constructor -- an `__init__` function must do that (followed by an `init_instance`
+/// to set up the holder and register the instance).
 inline PyObject *make_new_instance(PyTypeObject *type, bool allocate_value /*= true (in cast.h)*/) {
 #if defined(PYPY_VERSION)
     // PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when the first inherited
@@ -314,9 +314,11 @@
     // Deallocate any values/holders, if present:
     for (auto &v_h : values_and_holders(instance)) {
         if (v_h) {
-            // This is allowed to fail (the type might have been allocated but not
-            // initialized/registered):
-            deregister_instance(instance, v_h.value_ptr(), v_h.type);
+
+            // We have to deregister before we call dealloc because, for virtual MI types, we still
+            // need to be able to get the parent pointers.
+            if (v_h.instance_registered() && !deregister_instance(instance, v_h.value_ptr(), v_h.type))
+                pybind11_fail("pybind11_object_dealloc(): Tried to deallocate unregistered instance!");
 
             if (instance->owned || v_h.holder_constructed())
                 v_h.type->dealloc(v_h);
diff --git a/include/pybind11/common.h b/include/pybind11/common.h
index d0cc2d6..240f6d8 100644
--- a/include/pybind11/common.h
+++ b/include/pybind11/common.h
@@ -375,7 +375,7 @@
         void *simple_value_holder[1 + instance_simple_holder_in_ptrs()];
         struct {
             void **values_and_holders;
-            bool *holder_constructed;
+            uint8_t *status;
         } nonsimple;
     };
     /// Weak references (needed for keep alive):
@@ -396,14 +396,20 @@
      * python side.  Non-simple layout allocates the required amount of memory to have multiple
      * bound C++ classes as parents.  Under this layout, `nonsimple.values_and_holders` is set to a
      * pointer to allocated space of the required space to hold a a sequence of value pointers and
-     * holders followed by a set of holder-constructed flags (1 byte each), i.e.
+     * holders followed `status`, a set of bit flags (1 byte each), i.e.
      * [val1*][holder1][val2*][holder2]...[bb...]  where each [block] is rounded up to a multiple of
      * `sizeof(void *)`.  `nonsimple.holder_constructed` is, for convenience, a pointer to the
      * beginning of the [bb...] block (but not independently allocated).
+     *
+     * Status bits indicate whether the associated holder is constructed (&
+     * status_holder_constructed) and whether the value pointer is registered (&
+     * status_instance_registered) in `registered_instances`.
      */
     bool simple_layout : 1;
     /// For simple layout, tracks whether the holder has been constructed
     bool simple_holder_constructed : 1;
+    /// For simple layout, tracks whether the instance is registered in `registered_instances`
+    bool simple_instance_registered : 1;
     /// If true, get_internals().patients has an entry for this object
     bool has_patients : 1;
 
@@ -416,6 +422,10 @@
     /// Returns the value_and_holder wrapper for the given type (or the first, if `find_type`
     /// omitted)
     value_and_holder get_value_and_holder(const type_info *find_type = nullptr);
+
+    /// Bit values for the non-simple status flags
+    static constexpr uint8_t status_holder_constructed  = 1;
+    static constexpr uint8_t status_instance_registered = 2;
 };
 
 static_assert(std::is_standard_layout<instance>::value, "Internal error: `pybind11::detail::instance` is not standard layout!");
diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h
index 09d3379..d3f34ee 100644
--- a/include/pybind11/pybind11.h
+++ b/include/pybind11/pybind11.h
@@ -1215,7 +1215,10 @@
     /// `.owned`, a new holder will be constructed to manage the value pointer.
     static void init_instance(detail::instance *inst, const void *holder_ptr) {
         auto v_h = inst->get_value_and_holder(detail::get_type_info(typeid(type)));
-        register_instance(inst, v_h.value_ptr(), v_h.type);
+        if (!v_h.instance_registered()) {
+            register_instance(inst, v_h.value_ptr(), v_h.type);
+            v_h.set_instance_registered();
+        }
         init_holder(inst, v_h, (const holder_type *) holder_ptr, v_h.value_ptr<type>());
     }