bpo-34454: fix .fromisoformat() methods crashing on inputs with surrogate code points (GH-8862)


The current C implementations **crash** if the input includes a surrogate
Unicode code point, which is not possible to encode in UTF-8.

Important notes:

1.  It is possible to pass a separator that cannot be encoded in UTF-8
    (for example, a lone surrogate code point) to the `.isoformat()`
    methods.
2.  The pure-Python `datetime.fromisoformat()` implementation accepts
    strings with a surrogate as the separator.

In `datetime.fromisoformat()`, in the special case of a separator that
cannot be encoded in UTF-8 (a surrogate), this implementation takes a
performance hit: it makes a copy of the input string and replaces the
separator with 'T' before parsing.

Co-authored-by: Alexey Izbyshev <izbyshev@ispras.ru>
Co-authored-by: Paul Ganssle <paul@ganssle.io>
(cherry picked from commit 096329f0b2bf5e3f0a16363aa631d993ce078737)

Co-authored-by: Paul Ganssle <pganssle@users.noreply.github.com>
diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index 20d2d25..91d6991 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -2883,6 +2883,9 @@
     Py_ssize_t len;
 
     const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
+    if (dt_ptr == NULL) {
+        goto invalid_string_error;
+    }
 
     int year = 0, month = 0, day = 0;
 
@@ -2894,12 +2897,15 @@
     }
 
     if (rv < 0) {
-        PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s",
-                     dt_ptr);
-        return NULL;
+        goto invalid_string_error;
     }
 
     return new_date_subclass_ex(year, month, day, cls);
+
+invalid_string_error:
+    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R",
+                 dtstr);
+    return NULL;
 }
 
 
@@ -4258,6 +4264,10 @@
     Py_ssize_t len;
     const char *p = PyUnicode_AsUTF8AndSize(tstr, &len);
 
+    if (p == NULL) {
+        goto invalid_string_error;
+    }
+
     int hour = 0, minute = 0, second = 0, microsecond = 0;
     int tzoffset, tzimicrosecond = 0;
     int rv = parse_isoformat_time(p, len,
@@ -4265,8 +4275,7 @@
                                   &tzoffset, &tzimicrosecond);
 
     if (rv < 0) {
-        PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", p);
-        return NULL;
+        goto invalid_string_error;
     }
 
     PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset,
@@ -4286,6 +4295,10 @@
 
     Py_DECREF(tzinfo);
     return t;
+
+invalid_string_error:
+    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", tstr);
+    return NULL;
 }
 
 
@@ -4840,6 +4853,33 @@
 }
 
 static PyObject *
+_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) {
+    // `fromisoformat` allows surrogate characters in exactly one position,
+    // the separator; to allow datetime_fromisoformat to make the simplifying
+    // assumption that all valid strings can be encoded in UTF-8, this function
+    // replaces any surrogate character separators with `T`.
+    Py_ssize_t len = PyUnicode_GetLength(dtstr);  // NOTE(review): no error check
+    *needs_decref = 0;  // stays 0 unless a new string is returned below
+    if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
+        return dtstr;  // nothing to replace: hand back the input unchanged
+    }
+
+    PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr));
+    if (str_out == NULL) {
+        return NULL;  // *needs_decref is still 0 on this path
+    }
+
+    if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 ||
+            PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
+        Py_DECREF(str_out);  // discard the copy; *needs_decref is still 0
+        return NULL;
+    }
+
+    *needs_decref = 1;  // the returned copy must be Py_DECREF'd by the caller
+    return str_out;
+}
+
+static PyObject *
 datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
     assert(dtstr != NULL);
 
@@ -4848,9 +4888,20 @@
         return NULL;
     }
 
+    int needs_decref = 0;
+    dtstr = _sanitize_isoformat_str(dtstr, &needs_decref);
+    if (dtstr == NULL) {
+        goto error;
+    }
+
     Py_ssize_t len;
     const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
-    const char * p = dt_ptr;
+
+    if (dt_ptr == NULL) {
+        goto invalid_string_error;
+    }
+
+    const char *p = dt_ptr;
 
     int year = 0, month = 0, day = 0;
     int hour = 0, minute = 0, second = 0, microsecond = 0;
@@ -4883,20 +4934,32 @@
                                   &tzoffset, &tzusec);
     }
     if (rv < 0) {
-        PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", dt_ptr);
-        return NULL;
+        goto invalid_string_error;
     }
 
     PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
     if (tzinfo == NULL) {
-        return NULL;
+        goto error;
     }
 
     PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute,
                                             second, microsecond, tzinfo, cls);
 
     Py_DECREF(tzinfo);
+    if (needs_decref) {
+        Py_DECREF(dtstr);
+    }
     return dt;
+
+invalid_string_error:
+    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
+
+error:
+    if (needs_decref) {
+        Py_DECREF(dtstr);
+    }
+
+    return NULL;
 }