Issue #26331: Implement the parsing part of PEP 515. Thanks to Georg Brandl for the patch.

commit: a721abac299bb6529021000a71847486d531b41a [log] [tgz]
author: Brett Cannon <brett@python.org> Fri Sep 09 14:57:09 2016 -0700
committer: Brett Cannon <brett@python.org> Fri Sep 09 14:57:09 2016 -0700
tree: 8355a69b891cfcdaad8a5fd62870231b7f940696
parent: ee73a657455a908102379d3c9bc254676418e10c [diff] [blame]
diff --git a/Objects/longobject.c b/Objects/longobject.c
index 740b7f5..bbf7e71 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c

@@ -2004,12 +2004,18 @@
  * non-digit (which may be *str!).  A normalized int is returned.
  * The point to this routine is that it takes time linear in the number of
  * string characters.
+ *
+ * Return values:
+ *   -1 on syntax error (caller must set the exception; *res is untouched)
+ *   0 otherwise (an exception may be set; in that case *res is set to NULL)
  */
-static PyLongObject *
-long_from_binary_base(const char **str, int base)
+static int
+long_from_binary_base(const char **str, int base, PyLongObject **res)
 {
     const char *p = *str;
     const char *start = p;
+    char prev = 0;
+    int digits = 0;
     int bits_per_char;
     Py_ssize_t n;
     PyLongObject *z;
@@ -2019,23 +2025,43 @@
 
     assert(base >= 2 && base <= 32 && (base & (base - 1)) == 0);
     n = base;
-    for (bits_per_char = -1; n; ++bits_per_char)
+    for (bits_per_char = -1; n; ++bits_per_char) {
         n >>= 1;
-    /* n <- total # of bits needed, while setting p to end-of-string */
-    while (_PyLong_DigitValue[Py_CHARMASK(*p)] < base)
+    }
+    /* count digits and set p to end-of-string */
+    while (_PyLong_DigitValue[Py_CHARMASK(*p)] < base || *p == '_') {
+        if (*p == '_') {
+            if (prev == '_') {
+                *str = p - 1;
+                return -1;
+            }
+        } else {
+            ++digits;
+        }
+        prev = *p;
         ++p;
+    }
+    if (prev == '_') {
+        /* Trailing underscore not allowed. */
+        *str = p - 1;
+        return -1;
+    }
+
     *str = p;
     /* n <- # of Python digits needed, = ceiling(n/PyLong_SHIFT). */
-    n = (p - start) * bits_per_char + PyLong_SHIFT - 1;
+    n = digits * bits_per_char + PyLong_SHIFT - 1;
     if (n / bits_per_char < p - start) {
         PyErr_SetString(PyExc_ValueError,
                         "int string too large to convert");
-        return NULL;
+        *res = NULL;
+        return 0;
     }
     n = n / PyLong_SHIFT;
     z = _PyLong_New(n);
-    if (z == NULL)
-        return NULL;
+    if (z == NULL) {
+        *res = NULL;
+        return 0;
+    }
     /* Read string from right, and fill in int from left; i.e.,
      * from least to most significant in both.
      */
@@ -2043,7 +2069,11 @@
     bits_in_accum = 0;
     pdigit = z->ob_digit;
     while (--p >= start) {
-        int k = (int)_PyLong_DigitValue[Py_CHARMASK(*p)];
+        int k;
+        if (*p == '_') {
+            continue;
+        }
+        k = (int)_PyLong_DigitValue[Py_CHARMASK(*p)];
         assert(k >= 0 && k < base);
         accum |= (twodigits)k << bits_in_accum;
         bits_in_accum += bits_per_char;
@@ -2062,7 +2092,8 @@
     }
     while (pdigit - z->ob_digit < n)
         *pdigit++ = 0;
-    return long_normalize(z);
+    *res = long_normalize(z);
+    return 0;
 }
 
 /* Parses an int from a bytestring. Leading and trailing whitespace will be
@@ -2087,23 +2118,29 @@
                         "int() arg 2 must be >= 2 and <= 36");
         return NULL;
     }
-    while (*str != '\0' && Py_ISSPACE(Py_CHARMASK(*str)))
+    while (*str != '\0' && Py_ISSPACE(Py_CHARMASK(*str))) {
         str++;
-    if (*str == '+')
+    }
+    if (*str == '+') {
         ++str;
+    }
     else if (*str == '-') {
         ++str;
         sign = -1;
     }
     if (base == 0) {
-        if (str[0] != '0')
+        if (str[0] != '0') {
             base = 10;
-        else if (str[1] == 'x' || str[1] == 'X')
+        }
+        else if (str[1] == 'x' || str[1] == 'X') {
             base = 16;
-        else if (str[1] == 'o' || str[1] == 'O')
+        }
+        else if (str[1] == 'o' || str[1] == 'O') {
             base = 8;
-        else if (str[1] == 'b' || str[1] == 'B')
+        }
+        else if (str[1] == 'b' || str[1] == 'B') {
             base = 2;
+        }
         else {
             /* "old" (C-style) octal literal, now invalid.
                it might still be zero though */
@@ -2114,12 +2151,26 @@
     if (str[0] == '0' &&
         ((base == 16 && (str[1] == 'x' || str[1] == 'X')) ||
          (base == 8  && (str[1] == 'o' || str[1] == 'O')) ||
-         (base == 2  && (str[1] == 'b' || str[1] == 'B'))))
+         (base == 2  && (str[1] == 'b' || str[1] == 'B')))) {
         str += 2;
+        /* One underscore allowed here. */
+        if (*str == '_') {
+            ++str;
+        }
+    }
+    if (str[0] == '_') {
+	    /* May not start with underscores. */
+	    goto onError;
+    }
 
     start = str;
-    if ((base & (base - 1)) == 0)
-        z = long_from_binary_base(&str, base);
+    if ((base & (base - 1)) == 0) {
+        int res = long_from_binary_base(&str, base, &z);
+        if (res < 0) {
+            /* Syntax error. */
+            goto onError;
+        }
+    }
     else {
 /***
 Binary bases can be converted in time linear in the number of digits, because
@@ -2208,11 +2259,13 @@
 ***/
         twodigits c;           /* current input character */
         Py_ssize_t size_z;
+        int digits = 0;
         int i;
         int convwidth;
         twodigits convmultmax, convmult;
         digit *pz, *pzstop;
-        const char* scan;
+        const char *scan, *lastdigit;
+        char prev = 0;
 
         static double log_base_BASE[37] = {0.0e0,};
         static int convwidth_base[37] = {0,};
@@ -2226,8 +2279,9 @@
                                    log((double)PyLong_BASE));
             for (;;) {
                 twodigits next = convmax * base;
-                if (next > PyLong_BASE)
+                if (next > PyLong_BASE) {
                     break;
+                }
                 convmax = next;
                 ++i;
             }
@@ -2238,21 +2292,43 @@
 
         /* Find length of the string of numeric characters. */
         scan = str;
-        while (_PyLong_DigitValue[Py_CHARMASK(*scan)] < base)
+        lastdigit = str;
+
+        while (_PyLong_DigitValue[Py_CHARMASK(*scan)] < base || *scan == '_') {
+            if (*scan == '_') {
+                if (prev == '_') {
+                    /* Only one underscore allowed. */
+                    str = lastdigit + 1;
+                    goto onError;
+                }
+            }
+            else {
+                ++digits;
+                lastdigit = scan;
+            }
+            prev = *scan;
             ++scan;
+        }
+        if (prev == '_') {
+            /* Trailing underscore not allowed. */
+            /* Set error pointer to first underscore. */
+            str = lastdigit + 1;
+            goto onError;
+        }
 
         /* Create an int object that can contain the largest possible
          * integer with this base and length.  Note that there's no
          * need to initialize z->ob_digit -- no slot is read up before
          * being stored into.
          */
-        size_z = (Py_ssize_t)((scan - str) * log_base_BASE[base]) + 1;
+        size_z = (Py_ssize_t)(digits * log_base_BASE[base]) + 1;
         /* Uncomment next line to test exceedingly rare copy code */
         /* size_z = 1; */
         assert(size_z > 0);
         z = _PyLong_New(size_z);
-        if (z == NULL)
+        if (z == NULL) {
             return NULL;
+        }
         Py_SIZE(z) = 0;
 
         /* `convwidth` consecutive input digits are treated as a single
@@ -2263,9 +2339,17 @@
 
         /* Work ;-) */
         while (str < scan) {
+            if (*str == '_') {
+                str++;
+                continue;
+            }
             /* grab up to convwidth digits from the input string */
             c = (digit)_PyLong_DigitValue[Py_CHARMASK(*str++)];
-            for (i = 1; i < convwidth && str != scan; ++i, ++str) {
+            for (i = 1; i < convwidth && str != scan; ++str) {
+                if (*str == '_') {
+                    continue;
+                }
+                i++;
                 c = (twodigits)(c *  base +
                                 (int)_PyLong_DigitValue[Py_CHARMASK(*str)]);
                 assert(c < PyLong_BASE);
@@ -2277,8 +2361,9 @@
              */
             if (i != convwidth) {
                 convmult = base;
-                for ( ; i > 1; --i)
+                for ( ; i > 1; --i) {
                     convmult *= base;
+                }
             }
 
             /* Multiply z by convmult, and add c. */
@@ -2316,41 +2401,51 @@
             }
         }
     }
-    if (z == NULL)
+    if (z == NULL) {
         return NULL;
+    }
     if (error_if_nonzero) {
         /* reset the base to 0, else the exception message
            doesn't make too much sense */
         base = 0;
-        if (Py_SIZE(z) != 0)
+        if (Py_SIZE(z) != 0) {
             goto onError;
+        }
         /* there might still be other problems, therefore base
            remains zero here for the same reason */
     }
-    if (str == start)
+    if (str == start) {
         goto onError;
-    if (sign < 0)
+    }
+    if (sign < 0) {
         Py_SIZE(z) = -(Py_SIZE(z));
-    while (*str && Py_ISSPACE(Py_CHARMASK(*str)))
+    }
+    while (*str && Py_ISSPACE(Py_CHARMASK(*str))) {
         str++;
-    if (*str != '\0')
+    }
+    if (*str != '\0') {
         goto onError;
+    }
     long_normalize(z);
     z = maybe_small_long(z);
-    if (z == NULL)
+    if (z == NULL) {
         return NULL;
-    if (pend != NULL)
+    }
+    if (pend != NULL) {
         *pend = (char *)str;
+    }
     return (PyObject *) z;
 
   onError:
-    if (pend != NULL)
+    if (pend != NULL) {
         *pend = (char *)str;
+    }
     Py_XDECREF(z);
     slen = strlen(orig_str) < 200 ? strlen(orig_str) : 200;
     strobj = PyUnicode_FromStringAndSize(orig_str, slen);
-    if (strobj == NULL)
+    if (strobj == NULL) {
         return NULL;
+    }
     PyErr_Format(PyExc_ValueError,
                  "invalid literal for int() with base %d: %.200R",
                  base, strobj);
commit	a721abac299bb6529021000a71847486d531b41a	[log] [tgz]
author	Brett Cannon <brett@python.org>	Fri Sep 09 14:57:09 2016 -0700
committer	Brett Cannon <brett@python.org>	Fri Sep 09 14:57:09 2016 -0700
tree	8355a69b891cfcdaad8a5fd62870231b7f940696
parent	ee73a657455a908102379d3c9bc254676418e10c [diff] [blame]