bpo-36974: Make tp_call=PyVectorcall_Call work for inherited types (GH-13699)

When a heap subclass inherits from a vectorcall class that sets
`.tp_call=PyVectorcall_Call` (as recommended in PEP 590), the subclass does
not inherit `_Py_TPFLAGS_HAVE_VECTORCALL`, and thus `PyVectorcall_Call` does
not work for it.

This attempts to solve the issue by:
* always inheriting `tp_vectorcall_offset` unless `tp_call` is overridden
  in the subclass
* inheriting `_Py_TPFLAGS_HAVE_VECTORCALL` for static types, unless `tp_call`
  is overridden
* making `PyVectorcall_Call` ignore `_Py_TPFLAGS_HAVE_VECTORCALL`

This means it'll be ever more important to only call `PyVectorcall_Call`
on classes that support vectorcall. In `PyVectorcall_Call`'s intended role
as `tp_call` filler, that's not a problem.
diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py
index fabc821..88bda05 100644
--- a/Lib/test/test_capi.py
+++ b/Lib/test/test_capi.py
@@ -515,9 +515,10 @@
 
     def test_vectorcall(self):
         # Test a bunch of different ways to call objects:
-        # 1. normal call
-        # 2. vectorcall using _PyObject_Vectorcall()
-        # 3. vectorcall using PyVectorcall_Call()
+        # 1. vectorcall using PyVectorcall_Call()
+        #   (only for objects that support vectorcall directly)
+        # 2. normal call
+        # 3. vectorcall using _PyObject_Vectorcall()
         # 4. call as bound method
         # 5. call using functools.partial
 
@@ -543,18 +544,37 @@
 
         for (func, args, kwargs, expected) in calls:
             with self.subTest(str(func)):
+                if not kwargs:
+                    self.assertEqual(expected, pyvectorcall_call(func, args))
+                self.assertEqual(expected, pyvectorcall_call(func, args, kwargs))
+
+        # Add derived classes (which do not support vectorcall directly,
+        # but do support all other ways of calling).
+
+        class MethodDescriptorHeap(_testcapi.MethodDescriptorBase):
+            pass
+
+        class MethodDescriptorOverridden(_testcapi.MethodDescriptorBase):
+            def __call__(self, n):
+                return 'new'
+
+        calls += [
+            (MethodDescriptorHeap(), (0,), {}, True),
+            (MethodDescriptorOverridden(), (0,), {}, 'new'),
+        ]
+
+        for (func, args, kwargs, expected) in calls:
+            with self.subTest(str(func)):
                 args1 = args[1:]
                 meth = MethodType(func, args[0])
                 wrapped = partial(func)
                 if not kwargs:
                     self.assertEqual(expected, func(*args))
                     self.assertEqual(expected, pyobject_vectorcall(func, args, None))
-                    self.assertEqual(expected, pyvectorcall_call(func, args))
                     self.assertEqual(expected, meth(*args1))
                     self.assertEqual(expected, wrapped(*args))
                 self.assertEqual(expected, func(*args, **kwargs))
                 self.assertEqual(expected, vectorcall(func, args, kwargs))
-                self.assertEqual(expected, pyvectorcall_call(func, args, kwargs))
                 self.assertEqual(expected, meth(*args1, **kwargs))
                 self.assertEqual(expected, wrapped(*args, **kwargs))
 
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index bf20e81..eed34c9 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -5854,7 +5854,14 @@
 static PyObject *
 MethodDescriptor_new(PyTypeObject* type, PyObject* args, PyObject *kw)
 {
-    MethodDescriptorObject *op = PyObject_New(MethodDescriptorObject, type);
+    /* Allocate through the type's own tp_alloc slot; `type` may be a
+     * subclass of the static type.  tp_alloc returns PyObject *, so an
+     * explicit cast is required. */
+    MethodDescriptorObject *op = (MethodDescriptorObject *)type->tp_alloc(type, 0);
+    if (op == NULL) {
+        /* Allocation failed: do not dereference op, report the failure. */
+        return NULL;
+    }
     op->vectorcall = MethodDescriptor_vectorcall;
     return (PyObject *)op;
 }
diff --git a/Objects/call.c b/Objects/call.c
index c0d1456..578e1b3 100644
--- a/Objects/call.c
+++ b/Objects/call.c
@@ -173,12 +173,22 @@
 PyObject *
 PyVectorcall_Call(PyObject *callable, PyObject *tuple, PyObject *kwargs)
 {
-    vectorcallfunc func = _PyVectorcall_Function(callable);
+    /* get vectorcallfunc as in _PyVectorcall_Function, but without
+     * the _Py_TPFLAGS_HAVE_VECTORCALL check */
+    Py_ssize_t offset = Py_TYPE(callable)->tp_vectorcall_offset;
+    if ((offset <= 0) || (!Py_TYPE(callable)->tp_call)) {
+        PyErr_Format(PyExc_TypeError, "'%.200s' object does not support vectorcall",
+                     Py_TYPE(callable)->tp_name);
+        return NULL;
+    }
+    vectorcallfunc func = *(vectorcallfunc *)(((char *)callable) + offset);
     if (func == NULL) {
         PyErr_Format(PyExc_TypeError, "'%.200s' object does not support vectorcall",
                      Py_TYPE(callable)->tp_name);
         return NULL;
     }
+
+    /* Convert arguments & call */
     PyObject *const *args;
     Py_ssize_t nargs = PyTuple_GET_SIZE(tuple);
     PyObject *kwnames;
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index b6d925c..76e06aa 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -5145,17 +5145,21 @@
     }
     COPYSLOT(tp_repr);
     /* tp_hash see tp_richcompare */
-    COPYSLOT(tp_call);
-    /* Inherit tp_vectorcall_offset and _Py_TPFLAGS_HAVE_VECTORCALL if tp_call
-     * was inherited, but only for extension types */
-    if ((base->tp_flags & _Py_TPFLAGS_HAVE_VECTORCALL) &&
-        !(type->tp_flags & _Py_TPFLAGS_HAVE_VECTORCALL) &&
-        !(type->tp_flags & Py_TPFLAGS_HEAPTYPE) &&
-        base->tp_call &&
-        type->tp_call == base->tp_call)
     {
-        type->tp_vectorcall_offset = base->tp_vectorcall_offset;
-        type->tp_flags |= _Py_TPFLAGS_HAVE_VECTORCALL;
+        /* Inherit tp_vectorcall_offset only if tp_call is not overridden */
+        if (!type->tp_call) {
+            COPYSLOT(tp_vectorcall_offset);
+        }
+        /* Inherit _Py_TPFLAGS_HAVE_VECTORCALL for non-heap types
+         * if tp_call is not overridden */
+        if (!type->tp_call &&
+            (base->tp_flags & _Py_TPFLAGS_HAVE_VECTORCALL) &&
+            !(type->tp_flags & _Py_TPFLAGS_HAVE_VECTORCALL) &&
+            !(type->tp_flags & Py_TPFLAGS_HEAPTYPE))
+        {
+            type->tp_flags |= _Py_TPFLAGS_HAVE_VECTORCALL;
+        }
+        COPYSLOT(tp_call);
     }
     COPYSLOT(tp_str);
     {