blob: b919633ded8d9d321391a70c17db8383e2d17b12 [file] [log] [blame]
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -07001#include <stdbool.h>
2
Pablo Galindoc5fc1562020-04-22 23:29:27 +01003#include <Python.h>
4
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01005#include "tokenizer.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01006#include "pegen.h"
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01007#include "string_parser.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01008
9//// STRING HANDLING FUNCTIONS ////
10
Pablo Galindoc5fc1562020-04-22 23:29:27 +010011static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030012warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010013{
14 PyObject *msg =
15 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
16 if (msg == NULL) {
17 return -1;
18 }
19 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030020 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010021 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
22 /* Replace the DeprecationWarning exception with a SyntaxError
23 to get a more accurate error report */
24 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030025
26 /* This is needed, in order for the SyntaxError to point to the token t,
27 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
28 error location, if p->known_err_token is not set. */
29 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010030 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
31 }
32 Py_DECREF(msg);
33 return -1;
34 }
35 Py_DECREF(msg);
36 return 0;
37}
38
39static PyObject *
40decode_utf8(const char **sPtr, const char *end)
41{
Pablo Galindofb61c422020-06-15 14:23:43 +010042 const char *s;
43 const char *t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010044 t = s = *sPtr;
45 while (s < end && (*s & 0x80)) {
46 s++;
47 }
48 *sPtr = s;
49 return PyUnicode_DecodeUTF8(t, s - t, NULL);
50}
51
52static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030053decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010054{
Pablo Galindofb61c422020-06-15 14:23:43 +010055 PyObject *v;
56 PyObject *u;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010057 char *buf;
58 char *p;
59 const char *end;
60
61 /* check for integer overflow */
62 if (len > SIZE_MAX / 6) {
63 return NULL;
64 }
65 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
66 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
67 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
68 if (u == NULL) {
69 return NULL;
70 }
71 p = buf = PyBytes_AsString(u);
Christian Heimes07f2ade2020-11-18 16:38:53 +010072 if (p == NULL) {
73 return NULL;
74 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +010075 end = s + len;
76 while (s < end) {
77 if (*s == '\\') {
78 *p++ = *s++;
79 if (s >= end || *s & 0x80) {
80 strcpy(p, "u005c");
81 p += 5;
82 if (s >= end) {
83 break;
84 }
85 }
86 }
87 if (*s & 0x80) {
88 PyObject *w;
89 int kind;
90 void *data;
Pablo Galindofb61c422020-06-15 14:23:43 +010091 Py_ssize_t w_len;
92 Py_ssize_t i;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010093 w = decode_utf8(&s, end);
94 if (w == NULL) {
95 Py_DECREF(u);
96 return NULL;
97 }
98 kind = PyUnicode_KIND(w);
99 data = PyUnicode_DATA(w);
Pablo Galindofb61c422020-06-15 14:23:43 +0100100 w_len = PyUnicode_GET_LENGTH(w);
101 for (i = 0; i < w_len; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100102 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
103 sprintf(p, "\\U%08x", chr);
104 p += 10;
105 }
106 /* Should be impossible to overflow */
107 assert(p - buf <= PyBytes_GET_SIZE(u));
108 Py_DECREF(w);
109 }
110 else {
111 *p++ = *s++;
112 }
113 }
114 len = p - buf;
115 s = buf;
116
117 const char *first_invalid_escape;
118 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
119
120 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300121 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100122 /* We have not decref u before because first_invalid_escape points
123 inside u. */
124 Py_XDECREF(u);
125 Py_DECREF(v);
126 return NULL;
127 }
128 }
129 Py_XDECREF(u);
130 return v;
131}
132
133static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300134decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100135{
136 const char *first_invalid_escape;
137 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
138 if (result == NULL) {
139 return NULL;
140 }
141
142 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300143 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100144 Py_DECREF(result);
145 return NULL;
146 }
147 }
148 return result;
149}
150
151/* s must include the bracketing quote characters, and r, b, u,
152 &/or f prefixes (if any), and embedded escape sequences (if any).
153 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
154 If the string is an f-string, set *fstr and *fstrlen to the unparsed
155 string object. Return 0 if no errors occurred. */
156int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
158 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100159{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300160 const char *s = PyBytes_AsString(t->bytes);
161 if (s == NULL) {
162 return -1;
163 }
164
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100165 size_t len;
166 int quote = Py_CHARMASK(*s);
167 int fmode = 0;
168 *bytesmode = 0;
169 *rawmode = 0;
170 *result = NULL;
171 *fstr = NULL;
172 if (Py_ISALPHA(quote)) {
173 while (!*bytesmode || !*rawmode) {
174 if (quote == 'b' || quote == 'B') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100175 quote =(unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100176 *bytesmode = 1;
177 }
178 else if (quote == 'u' || quote == 'U') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100179 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100180 }
181 else if (quote == 'r' || quote == 'R') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100182 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100183 *rawmode = 1;
184 }
185 else if (quote == 'f' || quote == 'F') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100186 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100187 fmode = 1;
188 }
189 else {
190 break;
191 }
192 }
193 }
194
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300195 /* fstrings are only allowed in Python 3.6 and greater */
196 if (fmode && p->feature_version < 6) {
197 p->error_indicator = 1;
198 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
199 return -1;
200 }
201
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100202 if (fmode && *bytesmode) {
203 PyErr_BadInternalCall();
204 return -1;
205 }
206 if (quote != '\'' && quote != '\"') {
207 PyErr_BadInternalCall();
208 return -1;
209 }
210 /* Skip the leading quote char. */
211 s++;
212 len = strlen(s);
213 if (len > INT_MAX) {
214 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
215 return -1;
216 }
217 if (s[--len] != quote) {
218 /* Last quote char must match the first. */
219 PyErr_BadInternalCall();
220 return -1;
221 }
222 if (len >= 4 && s[0] == quote && s[1] == quote) {
223 /* A triple quoted string. We've already skipped one quote at
224 the start and one at the end of the string. Now skip the
225 two at the start. */
226 s += 2;
227 len -= 2;
228 /* And check that the last two match. */
229 if (s[--len] != quote || s[--len] != quote) {
230 PyErr_BadInternalCall();
231 return -1;
232 }
233 }
234
235 if (fmode) {
236 /* Just return the bytes. The caller will parse the resulting
237 string. */
238 *fstr = s;
239 *fstrlen = len;
240 return 0;
241 }
242
243 /* Not an f-string. */
244 /* Avoid invoking escape decoding routines if possible. */
245 *rawmode = *rawmode || strchr(s, '\\') == NULL;
246 if (*bytesmode) {
247 /* Disallow non-ASCII characters. */
248 const char *ch;
249 for (ch = s; *ch; ch++) {
250 if (Py_CHARMASK(*ch) >= 0x80) {
251 RAISE_SYNTAX_ERROR(
252 "bytes can only contain ASCII "
numbermaniacbf9239b2021-01-24 09:56:57 +1100253 "literal characters");
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100254 return -1;
255 }
256 }
257 if (*rawmode) {
258 *result = PyBytes_FromStringAndSize(s, len);
259 }
260 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300261 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100262 }
263 }
264 else {
265 if (*rawmode) {
266 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
267 }
268 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300269 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100270 }
271 }
272 return *result == NULL ? -1 : 0;
273}
274
275
276
277// FSTRING STUFF
278
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100279/* Fix locations for the given node and its children.
280
281 `parent` is the enclosing node.
282 `n` is the node which locations are going to be fixed relative to parent.
283 `expr_str` is the child node's string representation, including braces.
284*/
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700285static bool
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300286fstring_find_expr_location(Token *parent, char *expr_str, int *p_lines, int *p_cols)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100287{
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700288 *p_lines = 0;
289 *p_cols = 0;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100290 if (parent && parent->bytes) {
291 char *parent_str = PyBytes_AsString(parent->bytes);
292 if (!parent_str) {
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700293 return false;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100294 }
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700295 char *substr = strstr(parent_str, expr_str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100296 if (substr) {
297 // The following is needed, in order to correctly shift the column
298 // offset, in the case that (disregarding any whitespace) a newline
299 // immediately follows the opening curly brace of the fstring expression.
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700300 bool newline_after_brace = 1;
301 char *start = substr + 1;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100302 while (start && *start != '}' && *start != '\n') {
303 if (*start != ' ' && *start != '\t' && *start != '\f') {
304 newline_after_brace = 0;
305 break;
306 }
307 start++;
308 }
309
310 // Account for the characters from the last newline character to our
311 // left until the beginning of substr.
312 if (!newline_after_brace) {
313 start = substr;
314 while (start > parent_str && *start != '\n') {
315 start--;
316 }
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700317 *p_cols += (int)(substr - start);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100318 }
319 /* adjust the start based on the number of newlines encountered
320 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100321 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100322 if (*p == '\n') {
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700323 (*p_lines)++;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100324 }
325 }
326 }
327 }
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700328 return true;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100329}
330
331
332/* Compile this expression in to an expr_ty. Add parens around the
333 expression, in order to allow leading spaces in the expression. */
334static expr_ty
335fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
336 Token *t)
337{
338 expr_ty expr = NULL;
339 char *str;
340 Py_ssize_t len;
341 const char *s;
342 expr_ty result = NULL;
343
344 assert(expr_end >= expr_start);
345 assert(*(expr_start-1) == '{');
346 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
347 *expr_end == '=');
348
349 /* If the substring is all whitespace, it's an error. We need to catch this
350 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
351 because turning the expression '' in to '()' would go from being invalid
352 to valid. */
353 for (s = expr_start; s != expr_end; s++) {
354 char c = *s;
355 /* The Python parser ignores only the following whitespace
356 characters (\r already is converted to \n). */
357 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
358 break;
359 }
360 }
361 if (s == expr_end) {
362 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
363 return NULL;
364 }
365
366 len = expr_end - expr_start;
367 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300368 str = PyMem_Malloc(len + 3);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100369 if (str == NULL) {
370 PyErr_NoMemory();
371 return NULL;
372 }
373
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300374 // The call to fstring_find_expr_location is responsible for finding the column offset
375 // the generated AST nodes need to be shifted to the right, which is equal to the number
376 // of the f-string characters before the expression starts. In order to correctly compute
377 // this offset, strstr gets called in fstring_find_expr_location which only succeeds
378 // if curly braces appear before and after the f-string expression (exactly like they do
379 // in the f-string itself), hence the following lines.
380 str[0] = '{';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100381 memcpy(str+1, expr_start, len);
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300382 str[len+1] = '}';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100383 str[len+2] = 0;
384
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300385 int lines, cols;
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700386 if (!fstring_find_expr_location(t, str, &lines, &cols)) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100387 PyMem_Free(str);
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700388 return NULL;
389 }
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300390
Eric V. Smith0275e042020-07-16 12:10:23 -0400391 // The parentheses are needed in order to allow for leading whitespace within
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300392 // the f-string expression. This consequently gets parsed as a group (see the
393 // group rule in python.gram).
394 str[0] = '(';
395 str[len+1] = ')';
396
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100397 struct tok_state* tok = PyTokenizer_FromString(str, 1);
398 if (tok == NULL) {
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300399 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100400 return NULL;
401 }
Lysandros Nikolaouf7b1e462020-05-26 03:32:18 +0300402 Py_INCREF(p->tok->filename);
403 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100404
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300405 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
406 NULL, p->arena);
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300407 p2->starting_lineno = t->lineno + lines - 1;
Pablo Galindobd2728b2021-01-03 01:11:41 +0000408 p2->starting_col_offset = t->col_offset + cols;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100409
410 expr = _PyPegen_run_parser(p2);
411
412 if (expr == NULL) {
413 goto exit;
414 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100415 result = expr;
416
417exit:
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300418 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100419 _PyPegen_Parser_Free(p2);
420 PyTokenizer_Free(tok);
421 return result;
422}
423
424/* Return -1 on error.
425
426 Return 0 if we reached the end of the literal.
427
428 Return 1 if we haven't reached the end of the literal, but we want
429 the caller to process the literal up to this point. Used for
430 doubled braces.
431*/
432static int
433fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300434 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100435{
436 /* Get any literal string. It ends when we hit an un-doubled left
437 brace (which isn't part of a unicode name escape such as
438 "\N{EULER CONSTANT}"), or the end of the string. */
439
440 const char *s = *str;
441 const char *literal_start = s;
442 int result = 0;
443
444 assert(*literal == NULL);
445 while (s < end) {
446 char ch = *s++;
447 if (!raw && ch == '\\' && s < end) {
448 ch = *s++;
449 if (ch == 'N') {
450 if (s < end && *s++ == '{') {
451 while (s < end && *s++ != '}') {
452 }
453 continue;
454 }
455 break;
456 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300457 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100458 return -1;
459 }
460 }
461 if (ch == '{' || ch == '}') {
462 /* Check for doubled braces, but only at the top level. If
463 we checked at every level, then f'{0:{3}}' would fail
464 with the two closing braces. */
465 if (recurse_lvl == 0) {
466 if (s < end && *s == ch) {
467 /* We're going to tell the caller that the literal ends
468 here, but that they should continue scanning. But also
469 skip over the second brace when we resume scanning. */
470 *str = s + 1;
471 result = 1;
472 goto done;
473 }
474
475 /* Where a single '{' is the start of a new expression, a
476 single '}' is not allowed. */
477 if (ch == '}') {
478 *str = s - 1;
479 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
480 return -1;
481 }
482 }
483 /* We're either at a '{', which means we're starting another
484 expression; or a '}', which means we're at the end of this
485 f-string (for a nested format_spec). */
486 s--;
487 break;
488 }
489 }
490 *str = s;
491 assert(s <= end);
492 assert(s == end || *s == '{' || *s == '}');
493done:
494 if (literal_start != s) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100495 if (raw) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100496 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
497 s - literal_start,
498 NULL, NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100499 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100500 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300501 s - literal_start, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100502 }
503 if (!*literal) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100504 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100505 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100506 }
507 return result;
508}
509
/* Forward declaration because parsing is recursive: fstring_find_expr()
   calls fstring_parse() to handle a format spec, so the prototype has to be
   visible before fstring_find_expr() is defined. */
static expr_ty
fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
              Token *first_token, Token* t, Token *last_token);
514
515/* Parse the f-string at *str, ending at end. We know *str starts an
516 expression (so it must be a '{'). Returns the FormattedValue node, which
517 includes the expression, conversion character, format_spec expression, and
518 optionally the text of the expression (if = is used).
519
520 Note that I don't do a perfect job here: I don't make sure that a
521 closing brace doesn't match an opening paren, for example. It
522 doesn't need to error on all invalid expressions, just correctly
523 find the end of all valid ones. Any errors inside the expression
524 will be caught when we parse it later.
525
526 *expression is set to the expression. For an '=' "debug" expression,
527 *expr_text is set to the debug text (the original text of the expression,
528 including the '=' and any whitespace around it, as a string object). If
529 not a debug expression, *expr_text set to NULL. */
530static int
531fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
532 PyObject **expr_text, expr_ty *expression, Token *first_token,
533 Token *t, Token *last_token)
534{
535 /* Return -1 on error, else 0. */
536
537 const char *expr_start;
538 const char *expr_end;
539 expr_ty simple_expression;
540 expr_ty format_spec = NULL; /* Optional format specifier. */
541 int conversion = -1; /* The conversion char. Use default if not
542 specified, or !r if using = and no format
543 spec. */
544
545 /* 0 if we're not in a string, else the quote char we're trying to
546 match (single or double quote). */
547 char quote_char = 0;
548
549 /* If we're inside a string, 1=normal, 3=triple-quoted. */
550 int string_type = 0;
551
552 /* Keep track of nesting level for braces/parens/brackets in
553 expressions. */
554 Py_ssize_t nested_depth = 0;
555 char parenstack[MAXLEVEL];
556
557 *expr_text = NULL;
558
559 /* Can only nest one level deep. */
560 if (recurse_lvl >= 2) {
561 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
562 goto error;
563 }
564
565 /* The first char must be a left brace, or we wouldn't have gotten
566 here. Skip over it. */
567 assert(**str == '{');
568 *str += 1;
569
570 expr_start = *str;
571 for (; *str < end; (*str)++) {
572 char ch;
573
574 /* Loop invariants. */
575 assert(nested_depth >= 0);
576 assert(*str >= expr_start && *str < end);
Pablo Galindofb61c422020-06-15 14:23:43 +0100577 if (quote_char) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100578 assert(string_type == 1 || string_type == 3);
Pablo Galindofb61c422020-06-15 14:23:43 +0100579 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100580 assert(string_type == 0);
Pablo Galindofb61c422020-06-15 14:23:43 +0100581 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100582
583 ch = **str;
584 /* Nowhere inside an expression is a backslash allowed. */
585 if (ch == '\\') {
586 /* Error: can't include a backslash character, inside
587 parens or strings or not. */
588 RAISE_SYNTAX_ERROR(
589 "f-string expression part "
590 "cannot include a backslash");
591 goto error;
592 }
593 if (quote_char) {
594 /* We're inside a string. See if we're at the end. */
595 /* This code needs to implement the same non-error logic
596 as tok_get from tokenizer.c, at the letter_quote
597 label. To actually share that code would be a
598 nightmare. But, it's unlikely to change and is small,
599 so duplicate it here. Note we don't need to catch all
600 of the errors, since they'll be caught when parsing the
601 expression. We just need to match the non-error
602 cases. Thus we can ignore \n in single-quoted strings,
603 for example. Or non-terminated strings. */
604 if (ch == quote_char) {
605 /* Does this match the string_type (single or triple
606 quoted)? */
607 if (string_type == 3) {
608 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
609 /* We're at the end of a triple quoted string. */
610 *str += 2;
611 string_type = 0;
612 quote_char = 0;
613 continue;
614 }
615 } else {
616 /* We're at the end of a normal string. */
617 quote_char = 0;
618 string_type = 0;
619 continue;
620 }
621 }
622 } else if (ch == '\'' || ch == '"') {
623 /* Is this a triple quoted string? */
624 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
625 string_type = 3;
626 *str += 2;
627 } else {
628 /* Start of a normal string. */
629 string_type = 1;
630 }
631 /* Start looking for the end of the string. */
632 quote_char = ch;
633 } else if (ch == '[' || ch == '{' || ch == '(') {
634 if (nested_depth >= MAXLEVEL) {
635 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
636 goto error;
637 }
638 parenstack[nested_depth] = ch;
639 nested_depth++;
640 } else if (ch == '#') {
641 /* Error: can't include a comment character, inside parens
642 or not. */
643 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
644 goto error;
645 } else if (nested_depth == 0 &&
646 (ch == '!' || ch == ':' || ch == '}' ||
647 ch == '=' || ch == '>' || ch == '<')) {
648 /* See if there's a next character. */
649 if (*str+1 < end) {
650 char next = *(*str+1);
651
652 /* For "!=". since '=' is not an allowed conversion character,
653 nothing is lost in this test. */
654 if ((ch == '!' && next == '=') || /* != */
655 (ch == '=' && next == '=') || /* == */
656 (ch == '<' && next == '=') || /* <= */
657 (ch == '>' && next == '=') /* >= */
658 ) {
659 *str += 1;
660 continue;
661 }
662 /* Don't get out of the loop for these, if they're single
663 chars (not part of 2-char tokens). If by themselves, they
664 don't end an expression (unlike say '!'). */
665 if (ch == '>' || ch == '<') {
666 continue;
667 }
668 }
669
670 /* Normal way out of this loop. */
671 break;
672 } else if (ch == ']' || ch == '}' || ch == ')') {
673 if (!nested_depth) {
674 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
675 goto error;
676 }
677 nested_depth--;
Pablo Galindofb61c422020-06-15 14:23:43 +0100678 int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100679 if (!((opening == '(' && ch == ')') ||
680 (opening == '[' && ch == ']') ||
681 (opening == '{' && ch == '}')))
682 {
683 RAISE_SYNTAX_ERROR(
684 "f-string: closing parenthesis '%c' "
685 "does not match opening parenthesis '%c'",
686 ch, opening);
687 goto error;
688 }
689 } else {
690 /* Just consume this char and loop around. */
691 }
692 }
693 expr_end = *str;
694 /* If we leave this loop in a string or with mismatched parens, we
695 don't care. We'll get a syntax error when compiling the
696 expression. But, we can produce a better error message, so
697 let's just do that.*/
698 if (quote_char) {
699 RAISE_SYNTAX_ERROR("f-string: unterminated string");
700 goto error;
701 }
702 if (nested_depth) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100703 int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100704 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
705 goto error;
706 }
707
Pablo Galindofb61c422020-06-15 14:23:43 +0100708 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100709 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100710 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100711
712 /* Compile the expression as soon as possible, so we show errors
713 related to the expression before errors related to the
714 conversion or format_spec. */
715 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100716 if (!simple_expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100717 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100718 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100719
720 /* Check for =, which puts the text value of the expression in
721 expr_text. */
722 if (**str == '=') {
Shantanuc116c942020-05-27 13:30:38 -0700723 if (p->feature_version < 8) {
724 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
725 "only supported in Python 3.8 and greater");
726 goto error;
727 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100728 *str += 1;
729
730 /* Skip over ASCII whitespace. No need to test for end of string
731 here, since we know there's at least a trailing quote somewhere
732 ahead. */
733 while (Py_ISSPACE(**str)) {
734 *str += 1;
735 }
736
737 /* Set *expr_text to the text of the expression. */
738 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
739 if (!*expr_text) {
740 goto error;
741 }
742 }
743
744 /* Check for a conversion char, if present. */
745 if (**str == '!') {
746 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100747 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100748 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100749 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100750
Pablo Galindofb61c422020-06-15 14:23:43 +0100751 conversion = (unsigned char)**str;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100752 *str += 1;
753
754 /* Validate the conversion. */
755 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
756 RAISE_SYNTAX_ERROR(
757 "f-string: invalid conversion character: "
758 "expected 's', 'r', or 'a'");
759 goto error;
760 }
761
762 }
763
764 /* Check for the format spec, if present. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100765 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100766 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100767 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100768 if (**str == ':') {
769 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100770 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100771 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100772 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100773
774 /* Parse the format spec. */
775 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
776 first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +0100777 if (!format_spec) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100778 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100779 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100780 }
781
Pablo Galindofb61c422020-06-15 14:23:43 +0100782 if (*str >= end || **str != '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100783 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100784 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100785
786 /* We're at a right brace. Consume it. */
787 assert(*str < end);
788 assert(**str == '}');
789 *str += 1;
790
791 /* If we're in = mode (detected by non-NULL expr_text), and have no format
792 spec and no explicit conversion, set the conversion to 'r'. */
793 if (*expr_text && format_spec == NULL && conversion == -1) {
794 conversion = 'r';
795 }
796
797 /* And now create the FormattedValue node that represents this
798 entire expression with the conversion and format spec. */
799 //TODO: Fix this
Victor Stinnerd27f8d22021-04-07 21:34:22 +0200800 *expression = _PyAST_FormattedValue(simple_expression, conversion,
801 format_spec, first_token->lineno,
802 first_token->col_offset,
803 last_token->end_lineno,
804 last_token->end_col_offset, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +0100805 if (!*expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100806 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100807 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100808
809 return 0;
810
811unexpected_end_of_string:
812 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
813 /* Falls through to error. */
814
815error:
816 Py_XDECREF(*expr_text);
817 return -1;
818
819}
820
821/* Return -1 on error.
822
823 Return 0 if we have a literal (possible zero length) and an
824 expression (zero length if at the end of the string.
825
826 Return 1 if we have a literal, but no expression, and we want the
827 caller to call us again. This is used to deal with doubled
828 braces.
829
830 When called multiple times on the string 'a{{b{0}c', this function
831 will return:
832
833 1. the literal 'a{' with no expression, and a return value
834 of 1. Despite the fact that there's no expression, the return
835 value of 1 means we're not finished yet.
836
837 2. the literal 'b' and the expression '0', with a return value of
838 0. The fact that there's an expression means we're not finished.
839
840 3. literal 'c' with no expression and a return value of 0. The
841 combination of the return value of 0 with no expression means
842 we're finished.
843*/
844static int
845fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
846 int recurse_lvl, PyObject **literal,
847 PyObject **expr_text, expr_ty *expression,
848 Token *first_token, Token *t, Token *last_token)
849{
850 int result;
851
852 assert(*literal == NULL && *expression == NULL);
853
854 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300855 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100856 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100857 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100858 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100859
860 assert(result == 0 || result == 1);
861
Pablo Galindofb61c422020-06-15 14:23:43 +0100862 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100863 /* We have a literal, but don't look at the expression. */
864 return 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100865 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100866
Pablo Galindofb61c422020-06-15 14:23:43 +0100867 if (*str >= end || **str == '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100868 /* We're at the end of the string or the end of a nested
869 f-string: no expression. The top-level error case where we
870 expect to be at the end of the string but we're at a '}' is
871 handled later. */
872 return 0;
Pablo Galindofb61c422020-06-15 14:23:43 +0100873 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100874
875 /* We must now be the start of an expression, on a '{'. */
876 assert(**str == '{');
877
878 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindofb61c422020-06-15 14:23:43 +0100879 expression, first_token, t, last_token) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100880 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100881 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100882
883 return 0;
884
885error:
886 Py_CLEAR(*literal);
887 return -1;
888}
889
890#ifdef NDEBUG
891#define ExprList_check_invariants(l)
892#else
893static void
894ExprList_check_invariants(ExprList *l)
895{
896 /* Check our invariants. Make sure this object is "live", and
897 hasn't been deallocated. */
898 assert(l->size >= 0);
899 assert(l->p != NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100900 if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100901 assert(l->data == l->p);
Pablo Galindofb61c422020-06-15 14:23:43 +0100902 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100903}
904#endif
905
906static void
907ExprList_Init(ExprList *l)
908{
909 l->allocated = EXPRLIST_N_CACHED;
910 l->size = 0;
911
912 /* Until we start allocating dynamically, p points to data. */
913 l->p = l->data;
914
915 ExprList_check_invariants(l);
916}
917
918static int
919ExprList_Append(ExprList *l, expr_ty exp)
920{
921 ExprList_check_invariants(l);
922 if (l->size >= l->allocated) {
923 /* We need to alloc (or realloc) the memory. */
924 Py_ssize_t new_size = l->allocated * 2;
925
926 /* See if we've ever allocated anything dynamically. */
927 if (l->p == l->data) {
928 Py_ssize_t i;
929 /* We're still using the cached data. Switch to
930 alloc-ing. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300931 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindofb61c422020-06-15 14:23:43 +0100932 if (!l->p) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100933 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100934 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100935 /* Copy the cached data into the new buffer. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100936 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100937 l->p[i] = l->data[i];
Pablo Galindofb61c422020-06-15 14:23:43 +0100938 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100939 } else {
940 /* Just realloc. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300941 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100942 if (!tmp) {
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300943 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100944 l->p = NULL;
945 return -1;
946 }
947 l->p = tmp;
948 }
949
950 l->allocated = new_size;
951 assert(l->allocated == 2 * l->size);
952 }
953
954 l->p[l->size++] = exp;
955
956 ExprList_check_invariants(l);
957 return 0;
958}
959
960static void
961ExprList_Dealloc(ExprList *l)
962{
963 ExprList_check_invariants(l);
964
965 /* If there's been an error, or we've never dynamically allocated,
966 do nothing. */
967 if (!l->p || l->p == l->data) {
968 /* Do nothing. */
969 } else {
970 /* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300971 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100972 }
973 l->p = NULL;
974 l->size = -1;
975}
976
Pablo Galindoa5634c42020-09-16 19:42:00 +0100977static asdl_expr_seq *
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100978ExprList_Finish(ExprList *l, PyArena *arena)
979{
Pablo Galindoa5634c42020-09-16 19:42:00 +0100980 asdl_expr_seq *seq;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100981
982 ExprList_check_invariants(l);
983
984 /* Allocate the asdl_seq and copy the expressions in to it. */
Pablo Galindoa5634c42020-09-16 19:42:00 +0100985 seq = _Py_asdl_expr_seq_new(l->size, arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100986 if (seq) {
987 Py_ssize_t i;
Pablo Galindofb61c422020-06-15 14:23:43 +0100988 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100989 asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindofb61c422020-06-15 14:23:43 +0100990 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100991 }
992 ExprList_Dealloc(l);
993 return seq;
994}
995
996#ifdef NDEBUG
997#define FstringParser_check_invariants(state)
998#else
999static void
1000FstringParser_check_invariants(FstringParser *state)
1001{
Pablo Galindofb61c422020-06-15 14:23:43 +01001002 if (state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001003 assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindofb61c422020-06-15 14:23:43 +01001004 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001005 ExprList_check_invariants(&state->expr_list);
1006}
1007#endif
1008
1009void
1010_PyPegen_FstringParser_Init(FstringParser *state)
1011{
1012 state->last_str = NULL;
1013 state->fmode = 0;
1014 ExprList_Init(&state->expr_list);
1015 FstringParser_check_invariants(state);
1016}
1017
1018void
1019_PyPegen_FstringParser_Dealloc(FstringParser *state)
1020{
1021 FstringParser_check_invariants(state);
1022
1023 Py_XDECREF(state->last_str);
1024 ExprList_Dealloc(&state->expr_list);
1025}
1026
1027/* Make a Constant node, but decref the PyUnicode object being added. */
1028static expr_ty
1029make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1030{
1031 PyObject *s = *str;
1032 PyObject *kind = NULL;
1033 *str = NULL;
1034 assert(PyUnicode_CheckExact(s));
Victor Stinner8370e072021-03-24 02:23:01 +01001035 if (_PyArena_AddPyObject(p->arena, s) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001036 Py_DECREF(s);
1037 return NULL;
1038 }
1039 const char* the_str = PyBytes_AsString(first_token->bytes);
1040 if (the_str && the_str[0] == 'u') {
1041 kind = _PyPegen_new_identifier(p, "u");
1042 }
1043
1044 if (kind == NULL && PyErr_Occurred()) {
1045 return NULL;
1046 }
1047
Victor Stinnerd27f8d22021-04-07 21:34:22 +02001048 return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1049 last_token->end_lineno, last_token->end_col_offset,
1050 p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001051
1052}
1053
1054
1055/* Add a non-f-string (that is, a regular literal string). str is
1056 decref'd. */
1057int
1058_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1059{
1060 FstringParser_check_invariants(state);
1061
1062 assert(PyUnicode_CheckExact(str));
1063
1064 if (PyUnicode_GET_LENGTH(str) == 0) {
1065 Py_DECREF(str);
1066 return 0;
1067 }
1068
1069 if (!state->last_str) {
1070 /* We didn't have a string before, so just remember this one. */
1071 state->last_str = str;
1072 } else {
1073 /* Concatenate this with the previous string. */
1074 PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindofb61c422020-06-15 14:23:43 +01001075 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001076 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001077 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001078 }
1079 FstringParser_check_invariants(state);
1080 return 0;
1081}
1082
1083/* Parse an f-string. The f-string is in *str to end, with no
1084 'f' or quotes. */
1085int
1086_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1087 const char *end, int raw, int recurse_lvl,
1088 Token *first_token, Token* t, Token *last_token)
1089{
1090 FstringParser_check_invariants(state);
1091 state->fmode = 1;
1092
1093 /* Parse the f-string. */
1094 while (1) {
1095 PyObject *literal = NULL;
1096 PyObject *expr_text = NULL;
1097 expr_ty expression = NULL;
1098
1099 /* If there's a zero length literal in front of the
1100 expression, literal will be NULL. If we're at the end of
1101 the f-string, expression will be NULL (unless result == 1,
1102 see below). */
1103 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1104 &literal, &expr_text,
1105 &expression, first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001106 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001107 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001108 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001109
1110 /* Add the literal, if any. */
1111 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1112 Py_XDECREF(expr_text);
1113 return -1;
1114 }
1115 /* Add the expr_text, if any. */
1116 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1117 return -1;
1118 }
1119
1120 /* We've dealt with the literal and expr_text, their ownership has
1121 been transferred to the state object. Don't look at them again. */
1122
1123 /* See if we should just loop around to get the next literal
1124 and expression, while ignoring the expression this
1125 time. This is used for un-doubling braces, as an
1126 optimization. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001127 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001128 continue;
Pablo Galindofb61c422020-06-15 14:23:43 +01001129 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001130
Pablo Galindofb61c422020-06-15 14:23:43 +01001131 if (!expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001132 /* We're done with this f-string. */
1133 break;
Pablo Galindofb61c422020-06-15 14:23:43 +01001134 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001135
1136 /* We know we have an expression. Convert any existing string
1137 to a Constant node. */
1138 if (!state->last_str) {
1139 /* Do nothing. No previous literal. */
1140 } else {
1141 /* Convert the existing last_str literal to a Constant node. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001142 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1143 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001144 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001145 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001146 }
1147
Pablo Galindofb61c422020-06-15 14:23:43 +01001148 if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001149 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001150 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001151 }
1152
1153 /* If recurse_lvl is zero, then we must be at the end of the
1154 string. Otherwise, we must be at a right brace. */
1155
1156 if (recurse_lvl == 0 && *str < end-1) {
1157 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1158 return -1;
1159 }
1160 if (recurse_lvl != 0 && **str != '}') {
1161 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1162 return -1;
1163 }
1164
1165 FstringParser_check_invariants(state);
1166 return 0;
1167}
1168
1169/* Convert the partial state reflected in last_str and expr_list to an
1170 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1171expr_ty
1172_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1173 Token *last_token)
1174{
Pablo Galindoa5634c42020-09-16 19:42:00 +01001175 asdl_expr_seq *seq;
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001176
1177 FstringParser_check_invariants(state);
1178
1179 /* If we're just a constant string with no expressions, return
1180 that. */
1181 if (!state->fmode) {
1182 assert(!state->expr_list.size);
1183 if (!state->last_str) {
1184 /* Create a zero length string. */
1185 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindofb61c422020-06-15 14:23:43 +01001186 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001187 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001188 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001189 }
1190 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1191 }
1192
1193 /* Create a Constant node out of last_str, if needed. It will be the
1194 last node in our expression list. */
1195 if (state->last_str) {
1196 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001197 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001198 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001199 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001200 }
1201 /* This has already been freed. */
1202 assert(state->last_str == NULL);
1203
1204 seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +01001205 if (!seq) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001206 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001207 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001208
Victor Stinnerd27f8d22021-04-07 21:34:22 +02001209 return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1210 last_token->end_lineno, last_token->end_col_offset,
1211 p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001212
1213error:
1214 _PyPegen_FstringParser_Dealloc(state);
1215 return NULL;
1216}
1217
1218/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1219 at end, parse it into an expr_ty. Return NULL on error. Adjust
1220 str to point past the parsed portion. */
1221static expr_ty
1222fstring_parse(Parser *p, const char **str, const char *end, int raw,
1223 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1224{
1225 FstringParser state;
1226
1227 _PyPegen_FstringParser_Init(&state);
1228 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1229 first_token, t, last_token) < 0) {
1230 _PyPegen_FstringParser_Dealloc(&state);
1231 return NULL;
1232 }
1233
1234 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1235}