blob: dcd298cb358ee7a85c1094034af06e2edde92cfd [file] [log] [blame]
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -07001#include <stdbool.h>
2
Pablo Galindoc5fc1562020-04-22 23:29:27 +01003#include <Python.h>
4
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01005#include "tokenizer.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01006#include "pegen.h"
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01007#include "string_parser.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01008
9//// STRING HANDLING FUNCTIONS ////
10
Pablo Galindoc5fc1562020-04-22 23:29:27 +010011static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030012warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010013{
14 PyObject *msg =
Miss Islington (bot)d80f4262021-06-07 17:36:19 -070015 PyUnicode_FromFormat("invalid escape sequence '\\%c'", first_invalid_escape_char);
Pablo Galindoc5fc1562020-04-22 23:29:27 +010016 if (msg == NULL) {
17 return -1;
18 }
19 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030020 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010021 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
22 /* Replace the DeprecationWarning exception with a SyntaxError
23 to get a more accurate error report */
24 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030025
26 /* This is needed, in order for the SyntaxError to point to the token t,
27 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
28 error location, if p->known_err_token is not set. */
29 p->known_err_token = t;
Miss Islington (bot)d80f4262021-06-07 17:36:19 -070030 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", first_invalid_escape_char);
Pablo Galindoc5fc1562020-04-22 23:29:27 +010031 }
32 Py_DECREF(msg);
33 return -1;
34 }
35 Py_DECREF(msg);
36 return 0;
37}
38
39static PyObject *
40decode_utf8(const char **sPtr, const char *end)
41{
Pablo Galindofb61c422020-06-15 14:23:43 +010042 const char *s;
43 const char *t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010044 t = s = *sPtr;
45 while (s < end && (*s & 0x80)) {
46 s++;
47 }
48 *sPtr = s;
49 return PyUnicode_DecodeUTF8(t, s - t, NULL);
50}
51
52static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030053decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010054{
Pablo Galindofb61c422020-06-15 14:23:43 +010055 PyObject *v;
56 PyObject *u;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010057 char *buf;
58 char *p;
59 const char *end;
60
61 /* check for integer overflow */
62 if (len > SIZE_MAX / 6) {
63 return NULL;
64 }
65 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
66 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
67 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
68 if (u == NULL) {
69 return NULL;
70 }
71 p = buf = PyBytes_AsString(u);
Christian Heimes07f2ade2020-11-18 16:38:53 +010072 if (p == NULL) {
73 return NULL;
74 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +010075 end = s + len;
76 while (s < end) {
77 if (*s == '\\') {
78 *p++ = *s++;
79 if (s >= end || *s & 0x80) {
80 strcpy(p, "u005c");
81 p += 5;
82 if (s >= end) {
83 break;
84 }
85 }
86 }
87 if (*s & 0x80) {
88 PyObject *w;
89 int kind;
Serhiy Storchakac43317d2021-06-12 20:44:32 +030090 const void *data;
Pablo Galindofb61c422020-06-15 14:23:43 +010091 Py_ssize_t w_len;
92 Py_ssize_t i;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010093 w = decode_utf8(&s, end);
94 if (w == NULL) {
95 Py_DECREF(u);
96 return NULL;
97 }
98 kind = PyUnicode_KIND(w);
99 data = PyUnicode_DATA(w);
Pablo Galindofb61c422020-06-15 14:23:43 +0100100 w_len = PyUnicode_GET_LENGTH(w);
101 for (i = 0; i < w_len; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100102 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
103 sprintf(p, "\\U%08x", chr);
104 p += 10;
105 }
106 /* Should be impossible to overflow */
107 assert(p - buf <= PyBytes_GET_SIZE(u));
108 Py_DECREF(w);
109 }
110 else {
111 *p++ = *s++;
112 }
113 }
114 len = p - buf;
115 s = buf;
116
117 const char *first_invalid_escape;
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -0700118 v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100119
120 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300121 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100122 /* We have not decref u before because first_invalid_escape points
123 inside u. */
124 Py_XDECREF(u);
125 Py_DECREF(v);
126 return NULL;
127 }
128 }
129 Py_XDECREF(u);
130 return v;
131}
132
133static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300134decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100135{
136 const char *first_invalid_escape;
137 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
138 if (result == NULL) {
139 return NULL;
140 }
141
142 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300143 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100144 Py_DECREF(result);
145 return NULL;
146 }
147 }
148 return result;
149}
150
151/* s must include the bracketing quote characters, and r, b, u,
152 &/or f prefixes (if any), and embedded escape sequences (if any).
153 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
154 If the string is an f-string, set *fstr and *fstrlen to the unparsed
155 string object. Return 0 if no errors occurred. */
156int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
158 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100159{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300160 const char *s = PyBytes_AsString(t->bytes);
161 if (s == NULL) {
162 return -1;
163 }
164
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100165 size_t len;
166 int quote = Py_CHARMASK(*s);
167 int fmode = 0;
168 *bytesmode = 0;
169 *rawmode = 0;
170 *result = NULL;
171 *fstr = NULL;
172 if (Py_ISALPHA(quote)) {
173 while (!*bytesmode || !*rawmode) {
174 if (quote == 'b' || quote == 'B') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100175 quote =(unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100176 *bytesmode = 1;
177 }
178 else if (quote == 'u' || quote == 'U') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100179 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100180 }
181 else if (quote == 'r' || quote == 'R') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100182 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100183 *rawmode = 1;
184 }
185 else if (quote == 'f' || quote == 'F') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100186 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100187 fmode = 1;
188 }
189 else {
190 break;
191 }
192 }
193 }
194
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300195 /* fstrings are only allowed in Python 3.6 and greater */
196 if (fmode && p->feature_version < 6) {
197 p->error_indicator = 1;
198 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
199 return -1;
200 }
201
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100202 if (fmode && *bytesmode) {
203 PyErr_BadInternalCall();
204 return -1;
205 }
206 if (quote != '\'' && quote != '\"') {
207 PyErr_BadInternalCall();
208 return -1;
209 }
210 /* Skip the leading quote char. */
211 s++;
212 len = strlen(s);
213 if (len > INT_MAX) {
214 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
215 return -1;
216 }
217 if (s[--len] != quote) {
218 /* Last quote char must match the first. */
219 PyErr_BadInternalCall();
220 return -1;
221 }
222 if (len >= 4 && s[0] == quote && s[1] == quote) {
223 /* A triple quoted string. We've already skipped one quote at
224 the start and one at the end of the string. Now skip the
225 two at the start. */
226 s += 2;
227 len -= 2;
228 /* And check that the last two match. */
229 if (s[--len] != quote || s[--len] != quote) {
230 PyErr_BadInternalCall();
231 return -1;
232 }
233 }
234
235 if (fmode) {
236 /* Just return the bytes. The caller will parse the resulting
237 string. */
238 *fstr = s;
239 *fstrlen = len;
240 return 0;
241 }
242
243 /* Not an f-string. */
244 /* Avoid invoking escape decoding routines if possible. */
245 *rawmode = *rawmode || strchr(s, '\\') == NULL;
246 if (*bytesmode) {
247 /* Disallow non-ASCII characters. */
248 const char *ch;
249 for (ch = s; *ch; ch++) {
250 if (Py_CHARMASK(*ch) >= 0x80) {
251 RAISE_SYNTAX_ERROR(
252 "bytes can only contain ASCII "
numbermaniacbf9239b2021-01-24 09:56:57 +1100253 "literal characters");
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100254 return -1;
255 }
256 }
257 if (*rawmode) {
258 *result = PyBytes_FromStringAndSize(s, len);
259 }
260 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300261 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100262 }
263 }
264 else {
265 if (*rawmode) {
266 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
267 }
268 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300269 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100270 }
271 }
272 return *result == NULL ? -1 : 0;
273}
274
275
276
277// FSTRING STUFF
278
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100279/* Fix locations for the given node and its children.
280
281 `parent` is the enclosing node.
Pablo Galindo Salgadoc28c2e12021-08-12 17:41:21 +0100282 `expr_start` is the starting position of the expression (pointing to the open brace).
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100283 `n` is the node which locations are going to be fixed relative to parent.
284 `expr_str` is the child node's string representation, including braces.
285*/
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700286static bool
Pablo Galindo Salgadoc28c2e12021-08-12 17:41:21 +0100287fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100288{
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700289 *p_lines = 0;
290 *p_cols = 0;
Pablo Galindo Salgadoc28c2e12021-08-12 17:41:21 +0100291 assert(expr_start != NULL && *expr_start == '{');
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100292 if (parent && parent->bytes) {
Serhiy Storchakac43317d2021-06-12 20:44:32 +0300293 const char *parent_str = PyBytes_AsString(parent->bytes);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100294 if (!parent_str) {
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700295 return false;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100296 }
Pablo Galindo Salgadoc28c2e12021-08-12 17:41:21 +0100297 // The following is needed, in order to correctly shift the column
298 // offset, in the case that (disregarding any whitespace) a newline
299 // immediately follows the opening curly brace of the fstring expression.
300 bool newline_after_brace = 1;
301 const char *start = expr_start + 1;
302 while (start && *start != '}' && *start != '\n') {
303 if (*start != ' ' && *start != '\t' && *start != '\f') {
304 newline_after_brace = 0;
305 break;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100306 }
Pablo Galindo Salgadoc28c2e12021-08-12 17:41:21 +0100307 start++;
308 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100309
Pablo Galindo Salgadoc28c2e12021-08-12 17:41:21 +0100310 // Account for the characters from the last newline character to our
311 // left until the beginning of expr_start.
312 if (!newline_after_brace) {
313 start = expr_start;
314 while (start > parent_str && *start != '\n') {
315 start--;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100316 }
Pablo Galindo Salgadoc28c2e12021-08-12 17:41:21 +0100317 *p_cols += (int)(expr_start - start);
318 }
319 /* adjust the start based on the number of newlines encountered
320 before the f-string expression */
321 for (const char *p = parent_str; p < expr_start; p++) {
322 if (*p == '\n') {
323 (*p_lines)++;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100324 }
325 }
326 }
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700327 return true;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100328}
329
330
331/* Compile this expression in to an expr_ty. Add parens around the
332 expression, in order to allow leading spaces in the expression. */
333static expr_ty
334fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
335 Token *t)
336{
337 expr_ty expr = NULL;
338 char *str;
339 Py_ssize_t len;
340 const char *s;
341 expr_ty result = NULL;
342
343 assert(expr_end >= expr_start);
344 assert(*(expr_start-1) == '{');
345 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
346 *expr_end == '=');
347
348 /* If the substring is all whitespace, it's an error. We need to catch this
349 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
350 because turning the expression '' in to '()' would go from being invalid
351 to valid. */
352 for (s = expr_start; s != expr_end; s++) {
353 char c = *s;
354 /* The Python parser ignores only the following whitespace
355 characters (\r already is converted to \n). */
356 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
357 break;
358 }
359 }
360 if (s == expr_end) {
361 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
362 return NULL;
363 }
364
365 len = expr_end - expr_start;
366 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
Pablo Galindo Salgadoc28c2e12021-08-12 17:41:21 +0100367 str = PyMem_Calloc(len + 3, sizeof(char));
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100368 if (str == NULL) {
369 PyErr_NoMemory();
370 return NULL;
371 }
372
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300373 // The call to fstring_find_expr_location is responsible for finding the column offset
374 // the generated AST nodes need to be shifted to the right, which is equal to the number
Pablo Galindo Salgadoc28c2e12021-08-12 17:41:21 +0100375 // of the f-string characters before the expression starts.
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100376 memcpy(str+1, expr_start, len);
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300377 int lines, cols;
Pablo Galindo Salgadoc28c2e12021-08-12 17:41:21 +0100378 if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100379 PyMem_Free(str);
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700380 return NULL;
381 }
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300382
Eric V. Smith0275e042020-07-16 12:10:23 -0400383 // The parentheses are needed in order to allow for leading whitespace within
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300384 // the f-string expression. This consequently gets parsed as a group (see the
385 // group rule in python.gram).
386 str[0] = '(';
387 str[len+1] = ')';
388
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100389 struct tok_state* tok = PyTokenizer_FromString(str, 1);
390 if (tok == NULL) {
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300391 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100392 return NULL;
393 }
Lysandros Nikolaouf7b1e462020-05-26 03:32:18 +0300394 Py_INCREF(p->tok->filename);
395 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100396
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300397 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
398 NULL, p->arena);
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300399 p2->starting_lineno = t->lineno + lines - 1;
Pablo Galindobd2728b2021-01-03 01:11:41 +0000400 p2->starting_col_offset = t->col_offset + cols;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100401
402 expr = _PyPegen_run_parser(p2);
403
404 if (expr == NULL) {
405 goto exit;
406 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100407 result = expr;
408
409exit:
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300410 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100411 _PyPegen_Parser_Free(p2);
412 PyTokenizer_Free(tok);
413 return result;
414}
415
416/* Return -1 on error.
417
418 Return 0 if we reached the end of the literal.
419
420 Return 1 if we haven't reached the end of the literal, but we want
421 the caller to process the literal up to this point. Used for
422 doubled braces.
423*/
424static int
425fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300426 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100427{
428 /* Get any literal string. It ends when we hit an un-doubled left
429 brace (which isn't part of a unicode name escape such as
430 "\N{EULER CONSTANT}"), or the end of the string. */
431
432 const char *s = *str;
433 const char *literal_start = s;
434 int result = 0;
435
436 assert(*literal == NULL);
437 while (s < end) {
438 char ch = *s++;
439 if (!raw && ch == '\\' && s < end) {
440 ch = *s++;
441 if (ch == 'N') {
442 if (s < end && *s++ == '{') {
443 while (s < end && *s++ != '}') {
444 }
445 continue;
446 }
447 break;
448 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300449 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100450 return -1;
451 }
452 }
453 if (ch == '{' || ch == '}') {
454 /* Check for doubled braces, but only at the top level. If
455 we checked at every level, then f'{0:{3}}' would fail
456 with the two closing braces. */
457 if (recurse_lvl == 0) {
458 if (s < end && *s == ch) {
459 /* We're going to tell the caller that the literal ends
460 here, but that they should continue scanning. But also
461 skip over the second brace when we resume scanning. */
462 *str = s + 1;
463 result = 1;
464 goto done;
465 }
466
467 /* Where a single '{' is the start of a new expression, a
468 single '}' is not allowed. */
469 if (ch == '}') {
470 *str = s - 1;
471 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
472 return -1;
473 }
474 }
475 /* We're either at a '{', which means we're starting another
476 expression; or a '}', which means we're at the end of this
477 f-string (for a nested format_spec). */
478 s--;
479 break;
480 }
481 }
482 *str = s;
483 assert(s <= end);
484 assert(s == end || *s == '{' || *s == '}');
485done:
486 if (literal_start != s) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100487 if (raw) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100488 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
489 s - literal_start,
490 NULL, NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100491 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100492 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300493 s - literal_start, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100494 }
495 if (!*literal) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100496 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100497 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100498 }
499 return result;
500}
501
502/* Forward declaration because parsing is recursive. */
503static expr_ty
504fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
505 Token *first_token, Token* t, Token *last_token);
506
507/* Parse the f-string at *str, ending at end. We know *str starts an
508 expression (so it must be a '{'). Returns the FormattedValue node, which
509 includes the expression, conversion character, format_spec expression, and
510 optionally the text of the expression (if = is used).
511
512 Note that I don't do a perfect job here: I don't make sure that a
513 closing brace doesn't match an opening paren, for example. It
514 doesn't need to error on all invalid expressions, just correctly
515 find the end of all valid ones. Any errors inside the expression
516 will be caught when we parse it later.
517
518 *expression is set to the expression. For an '=' "debug" expression,
519 *expr_text is set to the debug text (the original text of the expression,
520 including the '=' and any whitespace around it, as a string object). If
521 not a debug expression, *expr_text set to NULL. */
522static int
523fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
524 PyObject **expr_text, expr_ty *expression, Token *first_token,
525 Token *t, Token *last_token)
526{
527 /* Return -1 on error, else 0. */
528
529 const char *expr_start;
530 const char *expr_end;
531 expr_ty simple_expression;
532 expr_ty format_spec = NULL; /* Optional format specifier. */
533 int conversion = -1; /* The conversion char. Use default if not
534 specified, or !r if using = and no format
535 spec. */
536
537 /* 0 if we're not in a string, else the quote char we're trying to
538 match (single or double quote). */
539 char quote_char = 0;
540
541 /* If we're inside a string, 1=normal, 3=triple-quoted. */
542 int string_type = 0;
543
544 /* Keep track of nesting level for braces/parens/brackets in
545 expressions. */
546 Py_ssize_t nested_depth = 0;
547 char parenstack[MAXLEVEL];
548
549 *expr_text = NULL;
550
551 /* Can only nest one level deep. */
552 if (recurse_lvl >= 2) {
553 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
554 goto error;
555 }
556
557 /* The first char must be a left brace, or we wouldn't have gotten
558 here. Skip over it. */
559 assert(**str == '{');
560 *str += 1;
561
562 expr_start = *str;
563 for (; *str < end; (*str)++) {
564 char ch;
565
566 /* Loop invariants. */
567 assert(nested_depth >= 0);
568 assert(*str >= expr_start && *str < end);
Pablo Galindofb61c422020-06-15 14:23:43 +0100569 if (quote_char) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100570 assert(string_type == 1 || string_type == 3);
Pablo Galindofb61c422020-06-15 14:23:43 +0100571 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100572 assert(string_type == 0);
Pablo Galindofb61c422020-06-15 14:23:43 +0100573 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100574
575 ch = **str;
576 /* Nowhere inside an expression is a backslash allowed. */
577 if (ch == '\\') {
578 /* Error: can't include a backslash character, inside
579 parens or strings or not. */
580 RAISE_SYNTAX_ERROR(
581 "f-string expression part "
582 "cannot include a backslash");
583 goto error;
584 }
585 if (quote_char) {
586 /* We're inside a string. See if we're at the end. */
587 /* This code needs to implement the same non-error logic
588 as tok_get from tokenizer.c, at the letter_quote
589 label. To actually share that code would be a
590 nightmare. But, it's unlikely to change and is small,
591 so duplicate it here. Note we don't need to catch all
592 of the errors, since they'll be caught when parsing the
593 expression. We just need to match the non-error
594 cases. Thus we can ignore \n in single-quoted strings,
595 for example. Or non-terminated strings. */
596 if (ch == quote_char) {
597 /* Does this match the string_type (single or triple
598 quoted)? */
599 if (string_type == 3) {
600 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
601 /* We're at the end of a triple quoted string. */
602 *str += 2;
603 string_type = 0;
604 quote_char = 0;
605 continue;
606 }
607 } else {
608 /* We're at the end of a normal string. */
609 quote_char = 0;
610 string_type = 0;
611 continue;
612 }
613 }
614 } else if (ch == '\'' || ch == '"') {
615 /* Is this a triple quoted string? */
616 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
617 string_type = 3;
618 *str += 2;
619 } else {
620 /* Start of a normal string. */
621 string_type = 1;
622 }
623 /* Start looking for the end of the string. */
624 quote_char = ch;
625 } else if (ch == '[' || ch == '{' || ch == '(') {
626 if (nested_depth >= MAXLEVEL) {
627 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
628 goto error;
629 }
630 parenstack[nested_depth] = ch;
631 nested_depth++;
632 } else if (ch == '#') {
633 /* Error: can't include a comment character, inside parens
634 or not. */
635 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
636 goto error;
637 } else if (nested_depth == 0 &&
638 (ch == '!' || ch == ':' || ch == '}' ||
639 ch == '=' || ch == '>' || ch == '<')) {
640 /* See if there's a next character. */
641 if (*str+1 < end) {
642 char next = *(*str+1);
643
644 /* For "!=". since '=' is not an allowed conversion character,
645 nothing is lost in this test. */
646 if ((ch == '!' && next == '=') || /* != */
647 (ch == '=' && next == '=') || /* == */
648 (ch == '<' && next == '=') || /* <= */
649 (ch == '>' && next == '=') /* >= */
650 ) {
651 *str += 1;
652 continue;
653 }
654 /* Don't get out of the loop for these, if they're single
655 chars (not part of 2-char tokens). If by themselves, they
656 don't end an expression (unlike say '!'). */
657 if (ch == '>' || ch == '<') {
658 continue;
659 }
660 }
661
662 /* Normal way out of this loop. */
663 break;
664 } else if (ch == ']' || ch == '}' || ch == ')') {
665 if (!nested_depth) {
666 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
667 goto error;
668 }
669 nested_depth--;
Pablo Galindofb61c422020-06-15 14:23:43 +0100670 int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100671 if (!((opening == '(' && ch == ')') ||
672 (opening == '[' && ch == ']') ||
673 (opening == '{' && ch == '}')))
674 {
675 RAISE_SYNTAX_ERROR(
676 "f-string: closing parenthesis '%c' "
677 "does not match opening parenthesis '%c'",
678 ch, opening);
679 goto error;
680 }
681 } else {
682 /* Just consume this char and loop around. */
683 }
684 }
685 expr_end = *str;
686 /* If we leave this loop in a string or with mismatched parens, we
687 don't care. We'll get a syntax error when compiling the
688 expression. But, we can produce a better error message, so
689 let's just do that.*/
690 if (quote_char) {
691 RAISE_SYNTAX_ERROR("f-string: unterminated string");
692 goto error;
693 }
694 if (nested_depth) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100695 int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100696 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
697 goto error;
698 }
699
Pablo Galindofb61c422020-06-15 14:23:43 +0100700 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100701 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100702 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100703
704 /* Compile the expression as soon as possible, so we show errors
705 related to the expression before errors related to the
706 conversion or format_spec. */
707 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100708 if (!simple_expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100709 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100710 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100711
712 /* Check for =, which puts the text value of the expression in
713 expr_text. */
714 if (**str == '=') {
Shantanuc116c942020-05-27 13:30:38 -0700715 if (p->feature_version < 8) {
716 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
717 "only supported in Python 3.8 and greater");
718 goto error;
719 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100720 *str += 1;
721
722 /* Skip over ASCII whitespace. No need to test for end of string
723 here, since we know there's at least a trailing quote somewhere
724 ahead. */
725 while (Py_ISSPACE(**str)) {
726 *str += 1;
727 }
728
729 /* Set *expr_text to the text of the expression. */
730 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
731 if (!*expr_text) {
732 goto error;
733 }
734 }
735
736 /* Check for a conversion char, if present. */
737 if (**str == '!') {
738 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100739 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100740 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100741 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100742
Pablo Galindofb61c422020-06-15 14:23:43 +0100743 conversion = (unsigned char)**str;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100744 *str += 1;
745
746 /* Validate the conversion. */
747 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
748 RAISE_SYNTAX_ERROR(
749 "f-string: invalid conversion character: "
750 "expected 's', 'r', or 'a'");
751 goto error;
752 }
753
754 }
755
756 /* Check for the format spec, if present. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100757 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100758 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100759 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100760 if (**str == ':') {
761 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100762 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100763 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100764 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100765
766 /* Parse the format spec. */
767 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
768 first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +0100769 if (!format_spec) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100770 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100771 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100772 }
773
Pablo Galindofb61c422020-06-15 14:23:43 +0100774 if (*str >= end || **str != '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100775 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100776 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100777
778 /* We're at a right brace. Consume it. */
779 assert(*str < end);
780 assert(**str == '}');
781 *str += 1;
782
783 /* If we're in = mode (detected by non-NULL expr_text), and have no format
784 spec and no explicit conversion, set the conversion to 'r'. */
785 if (*expr_text && format_spec == NULL && conversion == -1) {
786 conversion = 'r';
787 }
788
789 /* And now create the FormattedValue node that represents this
790 entire expression with the conversion and format spec. */
791 //TODO: Fix this
Victor Stinnerd27f8d22021-04-07 21:34:22 +0200792 *expression = _PyAST_FormattedValue(simple_expression, conversion,
793 format_spec, first_token->lineno,
794 first_token->col_offset,
795 last_token->end_lineno,
796 last_token->end_col_offset, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +0100797 if (!*expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100798 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100799 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100800
801 return 0;
802
803unexpected_end_of_string:
804 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
805 /* Falls through to error. */
806
807error:
808 Py_XDECREF(*expr_text);
809 return -1;
810
811}
812
/* Return -1 on error.

   Return 0 if we have a literal (possibly zero length) and an
   expression (zero length if at the end of the string).

   Return 1 if we have a literal, but no expression, and we want the
   caller to call us again. This is used to deal with doubled
   braces.

   When called multiple times on the string 'a{{b{0}c', this function
   will return:

   1. the literal 'a{' with no expression, and a return value
      of 1. Despite the fact that there's no expression, the return
      value of 1 means we're not finished yet.

   2. the literal 'b' and the expression '0', with a return value of
      0. The fact that there's an expression means we're not finished.

   3. literal 'c' with no expression and a return value of 0. The
      combination of the return value of 0 with no expression means
      we're finished.
*/
836static int
837fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
838 int recurse_lvl, PyObject **literal,
839 PyObject **expr_text, expr_ty *expression,
840 Token *first_token, Token *t, Token *last_token)
841{
842 int result;
843
844 assert(*literal == NULL && *expression == NULL);
845
846 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300847 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100848 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100849 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100850 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100851
852 assert(result == 0 || result == 1);
853
Pablo Galindofb61c422020-06-15 14:23:43 +0100854 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100855 /* We have a literal, but don't look at the expression. */
856 return 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100857 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100858
Pablo Galindofb61c422020-06-15 14:23:43 +0100859 if (*str >= end || **str == '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100860 /* We're at the end of the string or the end of a nested
861 f-string: no expression. The top-level error case where we
862 expect to be at the end of the string but we're at a '}' is
863 handled later. */
864 return 0;
Pablo Galindofb61c422020-06-15 14:23:43 +0100865 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100866
867 /* We must now be the start of an expression, on a '{'. */
868 assert(**str == '{');
869
870 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindofb61c422020-06-15 14:23:43 +0100871 expression, first_token, t, last_token) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100872 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100873 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100874
875 return 0;
876
877error:
878 Py_CLEAR(*literal);
879 return -1;
880}
881
882#ifdef NDEBUG
883#define ExprList_check_invariants(l)
884#else
885static void
886ExprList_check_invariants(ExprList *l)
887{
888 /* Check our invariants. Make sure this object is "live", and
889 hasn't been deallocated. */
890 assert(l->size >= 0);
891 assert(l->p != NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100892 if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100893 assert(l->data == l->p);
Pablo Galindofb61c422020-06-15 14:23:43 +0100894 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100895}
896#endif
897
898static void
899ExprList_Init(ExprList *l)
900{
901 l->allocated = EXPRLIST_N_CACHED;
902 l->size = 0;
903
904 /* Until we start allocating dynamically, p points to data. */
905 l->p = l->data;
906
907 ExprList_check_invariants(l);
908}
909
910static int
911ExprList_Append(ExprList *l, expr_ty exp)
912{
913 ExprList_check_invariants(l);
914 if (l->size >= l->allocated) {
915 /* We need to alloc (or realloc) the memory. */
916 Py_ssize_t new_size = l->allocated * 2;
917
918 /* See if we've ever allocated anything dynamically. */
919 if (l->p == l->data) {
920 Py_ssize_t i;
921 /* We're still using the cached data. Switch to
922 alloc-ing. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300923 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindofb61c422020-06-15 14:23:43 +0100924 if (!l->p) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100925 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100926 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100927 /* Copy the cached data into the new buffer. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100928 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100929 l->p[i] = l->data[i];
Pablo Galindofb61c422020-06-15 14:23:43 +0100930 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100931 } else {
932 /* Just realloc. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300933 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100934 if (!tmp) {
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300935 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100936 l->p = NULL;
937 return -1;
938 }
939 l->p = tmp;
940 }
941
942 l->allocated = new_size;
943 assert(l->allocated == 2 * l->size);
944 }
945
946 l->p[l->size++] = exp;
947
948 ExprList_check_invariants(l);
949 return 0;
950}
951
952static void
953ExprList_Dealloc(ExprList *l)
954{
955 ExprList_check_invariants(l);
956
957 /* If there's been an error, or we've never dynamically allocated,
958 do nothing. */
959 if (!l->p || l->p == l->data) {
960 /* Do nothing. */
961 } else {
962 /* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300963 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100964 }
965 l->p = NULL;
966 l->size = -1;
967}
968
Pablo Galindoa5634c42020-09-16 19:42:00 +0100969static asdl_expr_seq *
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100970ExprList_Finish(ExprList *l, PyArena *arena)
971{
Pablo Galindoa5634c42020-09-16 19:42:00 +0100972 asdl_expr_seq *seq;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100973
974 ExprList_check_invariants(l);
975
976 /* Allocate the asdl_seq and copy the expressions in to it. */
Pablo Galindoa5634c42020-09-16 19:42:00 +0100977 seq = _Py_asdl_expr_seq_new(l->size, arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100978 if (seq) {
979 Py_ssize_t i;
Pablo Galindofb61c422020-06-15 14:23:43 +0100980 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100981 asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindofb61c422020-06-15 14:23:43 +0100982 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100983 }
984 ExprList_Dealloc(l);
985 return seq;
986}
987
988#ifdef NDEBUG
989#define FstringParser_check_invariants(state)
990#else
991static void
992FstringParser_check_invariants(FstringParser *state)
993{
Pablo Galindofb61c422020-06-15 14:23:43 +0100994 if (state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100995 assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindofb61c422020-06-15 14:23:43 +0100996 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100997 ExprList_check_invariants(&state->expr_list);
998}
999#endif
1000
1001void
1002_PyPegen_FstringParser_Init(FstringParser *state)
1003{
1004 state->last_str = NULL;
1005 state->fmode = 0;
1006 ExprList_Init(&state->expr_list);
1007 FstringParser_check_invariants(state);
1008}
1009
1010void
1011_PyPegen_FstringParser_Dealloc(FstringParser *state)
1012{
1013 FstringParser_check_invariants(state);
1014
1015 Py_XDECREF(state->last_str);
1016 ExprList_Dealloc(&state->expr_list);
1017}
1018
1019/* Make a Constant node, but decref the PyUnicode object being added. */
1020static expr_ty
1021make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1022{
1023 PyObject *s = *str;
1024 PyObject *kind = NULL;
1025 *str = NULL;
1026 assert(PyUnicode_CheckExact(s));
Victor Stinner8370e072021-03-24 02:23:01 +01001027 if (_PyArena_AddPyObject(p->arena, s) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001028 Py_DECREF(s);
1029 return NULL;
1030 }
1031 const char* the_str = PyBytes_AsString(first_token->bytes);
1032 if (the_str && the_str[0] == 'u') {
1033 kind = _PyPegen_new_identifier(p, "u");
1034 }
1035
1036 if (kind == NULL && PyErr_Occurred()) {
1037 return NULL;
1038 }
1039
Victor Stinnerd27f8d22021-04-07 21:34:22 +02001040 return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1041 last_token->end_lineno, last_token->end_col_offset,
1042 p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001043
1044}
1045
1046
1047/* Add a non-f-string (that is, a regular literal string). str is
1048 decref'd. */
1049int
1050_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1051{
1052 FstringParser_check_invariants(state);
1053
1054 assert(PyUnicode_CheckExact(str));
1055
1056 if (PyUnicode_GET_LENGTH(str) == 0) {
1057 Py_DECREF(str);
1058 return 0;
1059 }
1060
1061 if (!state->last_str) {
1062 /* We didn't have a string before, so just remember this one. */
1063 state->last_str = str;
1064 } else {
1065 /* Concatenate this with the previous string. */
1066 PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindofb61c422020-06-15 14:23:43 +01001067 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001068 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001069 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001070 }
1071 FstringParser_check_invariants(state);
1072 return 0;
1073}
1074
1075/* Parse an f-string. The f-string is in *str to end, with no
1076 'f' or quotes. */
1077int
1078_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1079 const char *end, int raw, int recurse_lvl,
1080 Token *first_token, Token* t, Token *last_token)
1081{
1082 FstringParser_check_invariants(state);
1083 state->fmode = 1;
1084
1085 /* Parse the f-string. */
1086 while (1) {
1087 PyObject *literal = NULL;
1088 PyObject *expr_text = NULL;
1089 expr_ty expression = NULL;
1090
1091 /* If there's a zero length literal in front of the
1092 expression, literal will be NULL. If we're at the end of
1093 the f-string, expression will be NULL (unless result == 1,
1094 see below). */
1095 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1096 &literal, &expr_text,
1097 &expression, first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001098 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001099 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001100 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001101
1102 /* Add the literal, if any. */
1103 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1104 Py_XDECREF(expr_text);
1105 return -1;
1106 }
1107 /* Add the expr_text, if any. */
1108 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1109 return -1;
1110 }
1111
1112 /* We've dealt with the literal and expr_text, their ownership has
1113 been transferred to the state object. Don't look at them again. */
1114
1115 /* See if we should just loop around to get the next literal
1116 and expression, while ignoring the expression this
1117 time. This is used for un-doubling braces, as an
1118 optimization. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001119 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001120 continue;
Pablo Galindofb61c422020-06-15 14:23:43 +01001121 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001122
Pablo Galindofb61c422020-06-15 14:23:43 +01001123 if (!expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001124 /* We're done with this f-string. */
1125 break;
Pablo Galindofb61c422020-06-15 14:23:43 +01001126 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001127
1128 /* We know we have an expression. Convert any existing string
1129 to a Constant node. */
1130 if (!state->last_str) {
1131 /* Do nothing. No previous literal. */
1132 } else {
1133 /* Convert the existing last_str literal to a Constant node. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001134 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1135 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001136 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001137 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001138 }
1139
Pablo Galindofb61c422020-06-15 14:23:43 +01001140 if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001141 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001142 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001143 }
1144
1145 /* If recurse_lvl is zero, then we must be at the end of the
1146 string. Otherwise, we must be at a right brace. */
1147
1148 if (recurse_lvl == 0 && *str < end-1) {
1149 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1150 return -1;
1151 }
1152 if (recurse_lvl != 0 && **str != '}') {
1153 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1154 return -1;
1155 }
1156
1157 FstringParser_check_invariants(state);
1158 return 0;
1159}
1160
1161/* Convert the partial state reflected in last_str and expr_list to an
1162 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1163expr_ty
1164_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1165 Token *last_token)
1166{
Pablo Galindoa5634c42020-09-16 19:42:00 +01001167 asdl_expr_seq *seq;
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001168
1169 FstringParser_check_invariants(state);
1170
1171 /* If we're just a constant string with no expressions, return
1172 that. */
1173 if (!state->fmode) {
1174 assert(!state->expr_list.size);
1175 if (!state->last_str) {
1176 /* Create a zero length string. */
1177 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindofb61c422020-06-15 14:23:43 +01001178 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001179 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001180 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001181 }
1182 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1183 }
1184
1185 /* Create a Constant node out of last_str, if needed. It will be the
1186 last node in our expression list. */
1187 if (state->last_str) {
1188 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001189 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001190 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001191 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001192 }
1193 /* This has already been freed. */
1194 assert(state->last_str == NULL);
1195
1196 seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +01001197 if (!seq) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001198 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001199 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001200
Victor Stinnerd27f8d22021-04-07 21:34:22 +02001201 return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1202 last_token->end_lineno, last_token->end_col_offset,
1203 p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001204
1205error:
1206 _PyPegen_FstringParser_Dealloc(state);
1207 return NULL;
1208}
1209
1210/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1211 at end, parse it into an expr_ty. Return NULL on error. Adjust
1212 str to point past the parsed portion. */
1213static expr_ty
1214fstring_parse(Parser *p, const char **str, const char *end, int raw,
1215 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1216{
1217 FstringParser state;
1218
1219 _PyPegen_FstringParser_Init(&state);
1220 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1221 first_token, t, last_token) < 0) {
1222 _PyPegen_FstringParser_Dealloc(&state);
1223 return NULL;
1224 }
1225
1226 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1227}