bpo-36974: Make tp_call=PyVectorcall_Call work for inherited types (GH-13699)

When a heap type subclasses a vectorcall class that sets
`.tp_call=PyVectorcall_Call` (as recommended in PEP 590), the subclass does
not inherit `_Py_TPFLAGS_HAVE_VECTORCALL`, and thus `PyVectorcall_Call` does
not work for it.

This attempts to solve the issue by:
* always inheriting `tp_vectorcall_offset` unless `tp_call` is overridden
  in the subclass
* inheriting `_Py_TPFLAGS_HAVE_VECTORCALL` for static types, unless `tp_call`
  is overridden
* making `PyVectorcall_Call` ignore `_Py_TPFLAGS_HAVE_VECTORCALL`

This means it'll be even more important to only call `PyVectorcall_Call`
on classes that support vectorcall. In `PyVectorcall_Call`'s intended role
as a `tp_call` filler, that's not a problem.
diff --git a/Objects/call.c b/Objects/call.c
index c0d1456..578e1b3 100644
--- a/Objects/call.c
+++ b/Objects/call.c
@@ -173,12 +173,22 @@
 PyObject *
 PyVectorcall_Call(PyObject *callable, PyObject *tuple, PyObject *kwargs)
 {
-    vectorcallfunc func = _PyVectorcall_Function(callable);
+    /* get vectorcallfunc as in _PyVectorcall_Function, but without
+     * the _Py_TPFLAGS_HAVE_VECTORCALL check */
+    Py_ssize_t offset = Py_TYPE(callable)->tp_vectorcall_offset;
+    if ((offset <= 0) || (!Py_TYPE(callable)->tp_call)) {
+        PyErr_Format(PyExc_TypeError, "'%.200s' object does not support vectorcall",
+                     Py_TYPE(callable)->tp_name);
+        return NULL;
+    }
+    vectorcallfunc func = *(vectorcallfunc *)(((char *)callable) + offset);
     if (func == NULL) {
         PyErr_Format(PyExc_TypeError, "'%.200s' object does not support vectorcall",
                      Py_TYPE(callable)->tp_name);
         return NULL;
     }
+
+    /* Convert arguments & call */
     PyObject *const *args;
     Py_ssize_t nargs = PyTuple_GET_SIZE(tuple);
     PyObject *kwnames;
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index b6d925c..76e06aa 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -5145,17 +5145,21 @@
     }
     COPYSLOT(tp_repr);
     /* tp_hash see tp_richcompare */
-    COPYSLOT(tp_call);
-    /* Inherit tp_vectorcall_offset and _Py_TPFLAGS_HAVE_VECTORCALL if tp_call
-     * was inherited, but only for extension types */
-    if ((base->tp_flags & _Py_TPFLAGS_HAVE_VECTORCALL) &&
-        !(type->tp_flags & _Py_TPFLAGS_HAVE_VECTORCALL) &&
-        !(type->tp_flags & Py_TPFLAGS_HEAPTYPE) &&
-        base->tp_call &&
-        type->tp_call == base->tp_call)
     {
-        type->tp_vectorcall_offset = base->tp_vectorcall_offset;
-        type->tp_flags |= _Py_TPFLAGS_HAVE_VECTORCALL;
+        /* Inherit tp_vectorcall_offset only if tp_call is not overridden */
+        if (!type->tp_call) {
+            COPYSLOT(tp_vectorcall_offset);
+        }
+        /* Inherit _Py_TPFLAGS_HAVE_VECTORCALL for non-heap types
+         * if tp_call is not overridden */
+        if (!type->tp_call &&
+            (base->tp_flags & _Py_TPFLAGS_HAVE_VECTORCALL) &&
+            !(type->tp_flags & _Py_TPFLAGS_HAVE_VECTORCALL) &&
+            !(type->tp_flags & Py_TPFLAGS_HEAPTYPE))
+        {
+            type->tp_flags |= _Py_TPFLAGS_HAVE_VECTORCALL;
+        }
+        COPYSLOT(tp_call);
     }
     COPYSLOT(tp_str);
     {