bpo-36974: Make tp_call=PyVectorcall_Call work for inherited types (GH-13699)

When inheriting a heap subclass from a vectorcall class that sets
`.tp_call=PyVectorcall_Call` (as recommended in PEP 590), the subclass does
not inherit `_Py_TPFLAGS_HAVE_VECTORCALL`, and thus `PyVectorcall_Call` does
not work for it.

This attempts to solve the issue by:
* always inheriting `tp_vectorcall_offset` unless `tp_call` is overridden
  in the subclass
* inheriting _Py_TPFLAGS_HAVE_VECTORCALL for static types, unless `tp_call`
  is overridden
* making `PyVectorcall_Call` ignore `_Py_TPFLAGS_HAVE_VECTORCALL`

This means it'll be ever more important to only call `PyVectorcall_Call`
on classes that support vectorcall. In `PyVectorcall_Call`'s intended role
as `tp_call` filler, that's not a problem.
diff --git a/Objects/call.c b/Objects/call.c
index c0d1456..578e1b3 100644
--- a/Objects/call.c
+++ b/Objects/call.c
@@ -173,12 +173,22 @@
 PyObject *
 PyVectorcall_Call(PyObject *callable, PyObject *tuple, PyObject *kwargs)
 {
-    vectorcallfunc func = _PyVectorcall_Function(callable);
+    /* get vectorcallfunc as in _PyVectorcall_Function, but without
+     * the _Py_TPFLAGS_HAVE_VECTORCALL check */
+    Py_ssize_t offset = Py_TYPE(callable)->tp_vectorcall_offset;
+    if ((offset <= 0) || (!Py_TYPE(callable)->tp_call)) {
+        PyErr_Format(PyExc_TypeError, "'%.200s' object does not support vectorcall",
+                     Py_TYPE(callable)->tp_name);
+        return NULL;
+    }
+    vectorcallfunc func = *(vectorcallfunc *)(((char *)callable) + offset);
     if (func == NULL) {
         PyErr_Format(PyExc_TypeError, "'%.200s' object does not support vectorcall",
                      Py_TYPE(callable)->tp_name);
         return NULL;
     }
+
+    /* Convert arguments & call */
     PyObject *const *args;
     Py_ssize_t nargs = PyTuple_GET_SIZE(tuple);
     PyObject *kwnames;