bpo-36051: Drop GIL during large bytes.join() (GH-17757)

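Improve multi-threaded performance by dropping the GIL while bytes.join
copies the item data into the result (both with and without a separator).
To avoid increasing overhead for small joins, the GIL is only dropped when
the output size is at least GIL_THRESHOLD (1048576 bytes, i.e. 1 MiB). It
is also kept whenever an item had to be accessed through the buffer
protocol, because such an item may be mutable.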
diff --git a/Objects/stringlib/join.h b/Objects/stringlib/join.h
index 6f314e1..4d023ed 100644
--- a/Objects/stringlib/join.h
+++ b/Objects/stringlib/join.h
@@ -18,6 +18,9 @@
     Py_buffer *buffers = NULL;
 #define NB_STATIC_BUFFERS 10
     Py_buffer static_buffers[NB_STATIC_BUFFERS];
+#define GIL_THRESHOLD 1048576
+    int drop_gil = 1;
+    PyThreadState *save;
 
     seq = PySequence_Fast(iterable, "can only join an iterable");
     if (seq == NULL) {
@@ -65,12 +68,21 @@
             buffers[i].buf = PyBytes_AS_STRING(item);
             buffers[i].len = PyBytes_GET_SIZE(item);
         }
-        else if (PyObject_GetBuffer(item, &buffers[i], PyBUF_SIMPLE) != 0) {
-            PyErr_Format(PyExc_TypeError,
-                         "sequence item %zd: expected a bytes-like object, "
-                         "%.80s found",
-                         i, Py_TYPE(item)->tp_name);
-            goto error;
+        else {
+            if (PyObject_GetBuffer(item, &buffers[i], PyBUF_SIMPLE) != 0) {
+                PyErr_Format(PyExc_TypeError,
+                             "sequence item %zd: expected a bytes-like object, "
+                             "%.80s found",
+                             i, Py_TYPE(item)->tp_name);
+                goto error;
+            }
+            /* If the backing objects are mutable, then dropping the GIL
+             * opens up race conditions where another thread tries to modify
+             * an object on which we hold a buffer. Such code has data
+             * races anyway, but this is a conservative approach that avoids
+             * changing the behaviour of that data race.
+             */
+            drop_gil = 0;
         }
         nbufs = i + 1;  /* for error cleanup */
         itemlen = buffers[i].len;
@@ -102,6 +114,12 @@
 
     /* Catenate everything. */
     p = STRINGLIB_STR(res);
+    if (sz < GIL_THRESHOLD) {
+        drop_gil = 0;   /* Benefits are likely outweighed by the overheads */
+    }
+    if (drop_gil) {
+        save = PyEval_SaveThread();
+    }
     if (!seplen) {
         /* fast path */
         for (i = 0; i < nbufs; i++) {
@@ -110,19 +128,23 @@
             memcpy(p, q, n);
             p += n;
         }
-        goto done;
     }
-    for (i = 0; i < nbufs; i++) {
-        Py_ssize_t n;
-        char *q;
-        if (i) {
-            memcpy(p, sepstr, seplen);
-            p += seplen;
+    else {
+        for (i = 0; i < nbufs; i++) {
+            Py_ssize_t n;
+            char *q;
+            if (i) {
+                memcpy(p, sepstr, seplen);
+                p += seplen;
+            }
+            n = buffers[i].len;
+            q = buffers[i].buf;
+            memcpy(p, q, n);
+            p += n;
         }
-        n = buffers[i].len;
-        q = buffers[i].buf;
-        memcpy(p, q, n);
-        p += n;
+    }
+    if (drop_gil) {
+        PyEval_RestoreThread(save);
     }
     goto done;
 
@@ -138,3 +160,4 @@
 }
 
 #undef NB_STATIC_BUFFERS
+#undef GIL_THRESHOLD