blob: 7b02bdde645e80f66fe562520854b16d422f554f [file] [log] [blame]
Miss Islington (bot)961703c2020-07-16 06:25:31 -07001#include <stdbool.h>
2
Pablo Galindoc5fc1562020-04-22 23:29:27 +01003#include <Python.h>
4
5#include "../tokenizer.h"
6#include "pegen.h"
7#include "parse_string.h"
8
9//// STRING HANDLING FUNCTIONS ////
10
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser nodes
// to pass around and the usage of some specialized APIs present only in this
// file (like "_PyPegen_raise_syntax_error").
15
16static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030017warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010018{
19 PyObject *msg =
20 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
21 if (msg == NULL) {
22 return -1;
23 }
24 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030025 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010026 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
27 /* Replace the DeprecationWarning exception with a SyntaxError
28 to get a more accurate error report */
29 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030030
31 /* This is needed, in order for the SyntaxError to point to the token t,
32 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
33 error location, if p->known_err_token is not set. */
34 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010035 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
36 }
37 Py_DECREF(msg);
38 return -1;
39 }
40 Py_DECREF(msg);
41 return 0;
42}
43
44static PyObject *
45decode_utf8(const char **sPtr, const char *end)
46{
Pablo Galindo30b59fd2020-06-15 15:08:00 +010047 const char *s;
48 const char *t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010049 t = s = *sPtr;
50 while (s < end && (*s & 0x80)) {
51 s++;
52 }
53 *sPtr = s;
54 return PyUnicode_DecodeUTF8(t, s - t, NULL);
55}
56
57static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030058decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010059{
Pablo Galindo30b59fd2020-06-15 15:08:00 +010060 PyObject *v;
61 PyObject *u;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010062 char *buf;
63 char *p;
64 const char *end;
65
66 /* check for integer overflow */
67 if (len > SIZE_MAX / 6) {
68 return NULL;
69 }
70 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
71 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
72 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
73 if (u == NULL) {
74 return NULL;
75 }
76 p = buf = PyBytes_AsString(u);
77 end = s + len;
78 while (s < end) {
79 if (*s == '\\') {
80 *p++ = *s++;
81 if (s >= end || *s & 0x80) {
82 strcpy(p, "u005c");
83 p += 5;
84 if (s >= end) {
85 break;
86 }
87 }
88 }
89 if (*s & 0x80) {
90 PyObject *w;
91 int kind;
92 void *data;
Pablo Galindo30b59fd2020-06-15 15:08:00 +010093 Py_ssize_t w_len;
94 Py_ssize_t i;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010095 w = decode_utf8(&s, end);
96 if (w == NULL) {
97 Py_DECREF(u);
98 return NULL;
99 }
100 kind = PyUnicode_KIND(w);
101 data = PyUnicode_DATA(w);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100102 w_len = PyUnicode_GET_LENGTH(w);
103 for (i = 0; i < w_len; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100104 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
105 sprintf(p, "\\U%08x", chr);
106 p += 10;
107 }
108 /* Should be impossible to overflow */
109 assert(p - buf <= PyBytes_GET_SIZE(u));
110 Py_DECREF(w);
111 }
112 else {
113 *p++ = *s++;
114 }
115 }
116 len = p - buf;
117 s = buf;
118
119 const char *first_invalid_escape;
120 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
121
122 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300123 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100124 /* We have not decref u before because first_invalid_escape points
125 inside u. */
126 Py_XDECREF(u);
127 Py_DECREF(v);
128 return NULL;
129 }
130 }
131 Py_XDECREF(u);
132 return v;
133}
134
135static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300136decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100137{
138 const char *first_invalid_escape;
139 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
140 if (result == NULL) {
141 return NULL;
142 }
143
144 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300145 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100146 Py_DECREF(result);
147 return NULL;
148 }
149 }
150 return result;
151}
152
153/* s must include the bracketing quote characters, and r, b, u,
154 &/or f prefixes (if any), and embedded escape sequences (if any).
155 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
156 If the string is an f-string, set *fstr and *fstrlen to the unparsed
157 string object. Return 0 if no errors occurred. */
158int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300159_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
160 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100161{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300162 const char *s = PyBytes_AsString(t->bytes);
163 if (s == NULL) {
164 return -1;
165 }
166
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100167 size_t len;
168 int quote = Py_CHARMASK(*s);
169 int fmode = 0;
170 *bytesmode = 0;
171 *rawmode = 0;
172 *result = NULL;
173 *fstr = NULL;
174 if (Py_ISALPHA(quote)) {
175 while (!*bytesmode || !*rawmode) {
176 if (quote == 'b' || quote == 'B') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100177 quote =(unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100178 *bytesmode = 1;
179 }
180 else if (quote == 'u' || quote == 'U') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100181 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100182 }
183 else if (quote == 'r' || quote == 'R') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100184 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100185 *rawmode = 1;
186 }
187 else if (quote == 'f' || quote == 'F') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100188 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100189 fmode = 1;
190 }
191 else {
192 break;
193 }
194 }
195 }
196
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300197 /* fstrings are only allowed in Python 3.6 and greater */
198 if (fmode && p->feature_version < 6) {
199 p->error_indicator = 1;
200 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
201 return -1;
202 }
203
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100204 if (fmode && *bytesmode) {
205 PyErr_BadInternalCall();
206 return -1;
207 }
208 if (quote != '\'' && quote != '\"') {
209 PyErr_BadInternalCall();
210 return -1;
211 }
212 /* Skip the leading quote char. */
213 s++;
214 len = strlen(s);
215 if (len > INT_MAX) {
216 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
217 return -1;
218 }
219 if (s[--len] != quote) {
220 /* Last quote char must match the first. */
221 PyErr_BadInternalCall();
222 return -1;
223 }
224 if (len >= 4 && s[0] == quote && s[1] == quote) {
225 /* A triple quoted string. We've already skipped one quote at
226 the start and one at the end of the string. Now skip the
227 two at the start. */
228 s += 2;
229 len -= 2;
230 /* And check that the last two match. */
231 if (s[--len] != quote || s[--len] != quote) {
232 PyErr_BadInternalCall();
233 return -1;
234 }
235 }
236
237 if (fmode) {
238 /* Just return the bytes. The caller will parse the resulting
239 string. */
240 *fstr = s;
241 *fstrlen = len;
242 return 0;
243 }
244
245 /* Not an f-string. */
246 /* Avoid invoking escape decoding routines if possible. */
247 *rawmode = *rawmode || strchr(s, '\\') == NULL;
248 if (*bytesmode) {
249 /* Disallow non-ASCII characters. */
250 const char *ch;
251 for (ch = s; *ch; ch++) {
252 if (Py_CHARMASK(*ch) >= 0x80) {
253 RAISE_SYNTAX_ERROR(
254 "bytes can only contain ASCII "
255 "literal characters.");
256 return -1;
257 }
258 }
259 if (*rawmode) {
260 *result = PyBytes_FromStringAndSize(s, len);
261 }
262 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300263 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100264 }
265 }
266 else {
267 if (*rawmode) {
268 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
269 }
270 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300271 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100272 }
273 }
274 return *result == NULL ? -1 : 0;
275}
276
277
278
279// FSTRING STUFF
280
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100281/* Fix locations for the given node and its children.
282
283 `parent` is the enclosing node.
284 `n` is the node which locations are going to be fixed relative to parent.
285 `expr_str` is the child node's string representation, including braces.
286*/
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700287static bool
Pablo Galindodab533d2020-06-28 01:15:28 +0100288fstring_find_expr_location(Token *parent, char *expr_str, int *p_lines, int *p_cols)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100289{
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700290 *p_lines = 0;
291 *p_cols = 0;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100292 if (parent && parent->bytes) {
293 char *parent_str = PyBytes_AsString(parent->bytes);
294 if (!parent_str) {
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700295 return false;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100296 }
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700297 char *substr = strstr(parent_str, expr_str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100298 if (substr) {
299 // The following is needed, in order to correctly shift the column
300 // offset, in the case that (disregarding any whitespace) a newline
301 // immediately follows the opening curly brace of the fstring expression.
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700302 bool newline_after_brace = 1;
303 char *start = substr + 1;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100304 while (start && *start != '}' && *start != '\n') {
305 if (*start != ' ' && *start != '\t' && *start != '\f') {
306 newline_after_brace = 0;
307 break;
308 }
309 start++;
310 }
311
312 // Account for the characters from the last newline character to our
313 // left until the beginning of substr.
314 if (!newline_after_brace) {
315 start = substr;
316 while (start > parent_str && *start != '\n') {
317 start--;
318 }
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700319 *p_cols += (int)(substr - start);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100320 }
321 /* adjust the start based on the number of newlines encountered
322 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100323 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100324 if (*p == '\n') {
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700325 (*p_lines)++;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100326 }
327 }
328 }
329 }
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700330 return true;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100331}
332
333
334/* Compile this expression in to an expr_ty. Add parens around the
335 expression, in order to allow leading spaces in the expression. */
336static expr_ty
337fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
338 Token *t)
339{
340 expr_ty expr = NULL;
341 char *str;
342 Py_ssize_t len;
343 const char *s;
344 expr_ty result = NULL;
345
346 assert(expr_end >= expr_start);
347 assert(*(expr_start-1) == '{');
348 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
349 *expr_end == '=');
350
351 /* If the substring is all whitespace, it's an error. We need to catch this
352 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
353 because turning the expression '' in to '()' would go from being invalid
354 to valid. */
355 for (s = expr_start; s != expr_end; s++) {
356 char c = *s;
357 /* The Python parser ignores only the following whitespace
358 characters (\r already is converted to \n). */
359 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
360 break;
361 }
362 }
363 if (s == expr_end) {
364 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
365 return NULL;
366 }
367
368 len = expr_end - expr_start;
369 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300370 str = PyMem_Malloc(len + 3);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100371 if (str == NULL) {
372 PyErr_NoMemory();
373 return NULL;
374 }
375
Pablo Galindodab533d2020-06-28 01:15:28 +0100376 // The call to fstring_find_expr_location is responsible for finding the column offset
377 // the generated AST nodes need to be shifted to the right, which is equal to the number
378 // of the f-string characters before the expression starts. In order to correctly compute
379 // this offset, strstr gets called in fstring_find_expr_location which only succeeds
380 // if curly braces appear before and after the f-string expression (exactly like they do
381 // in the f-string itself), hence the following lines.
382 str[0] = '{';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100383 memcpy(str+1, expr_start, len);
Pablo Galindodab533d2020-06-28 01:15:28 +0100384 str[len+1] = '}';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100385 str[len+2] = 0;
386
Pablo Galindodab533d2020-06-28 01:15:28 +0100387 int lines, cols;
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700388 if (!fstring_find_expr_location(t, str, &lines, &cols)) {
389 PyMem_FREE(str);
390 return NULL;
391 }
Pablo Galindodab533d2020-06-28 01:15:28 +0100392
Miss Islington (bot)9d8b8c32020-07-16 09:30:19 -0700393 // The parentheses are needed in order to allow for leading whitespace within
Pablo Galindodab533d2020-06-28 01:15:28 +0100394 // the f-string expression. This consequently gets parsed as a group (see the
395 // group rule in python.gram).
396 str[0] = '(';
397 str[len+1] = ')';
398
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100399 struct tok_state* tok = PyTokenizer_FromString(str, 1);
400 if (tok == NULL) {
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300401 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100402 return NULL;
403 }
Lysandros Nikolaou791a46e2020-05-26 04:24:31 +0300404 Py_INCREF(p->tok->filename);
405 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100406
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300407 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
408 NULL, p->arena);
Pablo Galindodab533d2020-06-28 01:15:28 +0100409 p2->starting_lineno = t->lineno + lines - 1;
410 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100411
412 expr = _PyPegen_run_parser(p2);
413
414 if (expr == NULL) {
415 goto exit;
416 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100417 result = expr;
418
419exit:
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300420 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100421 _PyPegen_Parser_Free(p2);
422 PyTokenizer_Free(tok);
423 return result;
424}
425
426/* Return -1 on error.
427
428 Return 0 if we reached the end of the literal.
429
430 Return 1 if we haven't reached the end of the literal, but we want
431 the caller to process the literal up to this point. Used for
432 doubled braces.
433*/
434static int
435fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300436 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100437{
438 /* Get any literal string. It ends when we hit an un-doubled left
439 brace (which isn't part of a unicode name escape such as
440 "\N{EULER CONSTANT}"), or the end of the string. */
441
442 const char *s = *str;
443 const char *literal_start = s;
444 int result = 0;
445
446 assert(*literal == NULL);
447 while (s < end) {
448 char ch = *s++;
449 if (!raw && ch == '\\' && s < end) {
450 ch = *s++;
451 if (ch == 'N') {
452 if (s < end && *s++ == '{') {
453 while (s < end && *s++ != '}') {
454 }
455 continue;
456 }
457 break;
458 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300459 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100460 return -1;
461 }
462 }
463 if (ch == '{' || ch == '}') {
464 /* Check for doubled braces, but only at the top level. If
465 we checked at every level, then f'{0:{3}}' would fail
466 with the two closing braces. */
467 if (recurse_lvl == 0) {
468 if (s < end && *s == ch) {
469 /* We're going to tell the caller that the literal ends
470 here, but that they should continue scanning. But also
471 skip over the second brace when we resume scanning. */
472 *str = s + 1;
473 result = 1;
474 goto done;
475 }
476
477 /* Where a single '{' is the start of a new expression, a
478 single '}' is not allowed. */
479 if (ch == '}') {
480 *str = s - 1;
481 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
482 return -1;
483 }
484 }
485 /* We're either at a '{', which means we're starting another
486 expression; or a '}', which means we're at the end of this
487 f-string (for a nested format_spec). */
488 s--;
489 break;
490 }
491 }
492 *str = s;
493 assert(s <= end);
494 assert(s == end || *s == '{' || *s == '}');
495done:
496 if (literal_start != s) {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100497 if (raw) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100498 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
499 s - literal_start,
500 NULL, NULL);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100501 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100502 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300503 s - literal_start, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100504 }
505 if (!*literal) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100506 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100507 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100508 }
509 return result;
510}
511
512/* Forward declaration because parsing is recursive. */
513static expr_ty
514fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
515 Token *first_token, Token* t, Token *last_token);
516
517/* Parse the f-string at *str, ending at end. We know *str starts an
518 expression (so it must be a '{'). Returns the FormattedValue node, which
519 includes the expression, conversion character, format_spec expression, and
520 optionally the text of the expression (if = is used).
521
522 Note that I don't do a perfect job here: I don't make sure that a
523 closing brace doesn't match an opening paren, for example. It
524 doesn't need to error on all invalid expressions, just correctly
525 find the end of all valid ones. Any errors inside the expression
526 will be caught when we parse it later.
527
528 *expression is set to the expression. For an '=' "debug" expression,
529 *expr_text is set to the debug text (the original text of the expression,
530 including the '=' and any whitespace around it, as a string object). If
531 not a debug expression, *expr_text set to NULL. */
532static int
533fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
534 PyObject **expr_text, expr_ty *expression, Token *first_token,
535 Token *t, Token *last_token)
536{
537 /* Return -1 on error, else 0. */
538
539 const char *expr_start;
540 const char *expr_end;
541 expr_ty simple_expression;
542 expr_ty format_spec = NULL; /* Optional format specifier. */
543 int conversion = -1; /* The conversion char. Use default if not
544 specified, or !r if using = and no format
545 spec. */
546
547 /* 0 if we're not in a string, else the quote char we're trying to
548 match (single or double quote). */
549 char quote_char = 0;
550
551 /* If we're inside a string, 1=normal, 3=triple-quoted. */
552 int string_type = 0;
553
554 /* Keep track of nesting level for braces/parens/brackets in
555 expressions. */
556 Py_ssize_t nested_depth = 0;
557 char parenstack[MAXLEVEL];
558
559 *expr_text = NULL;
560
561 /* Can only nest one level deep. */
562 if (recurse_lvl >= 2) {
563 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
564 goto error;
565 }
566
567 /* The first char must be a left brace, or we wouldn't have gotten
568 here. Skip over it. */
569 assert(**str == '{');
570 *str += 1;
571
572 expr_start = *str;
573 for (; *str < end; (*str)++) {
574 char ch;
575
576 /* Loop invariants. */
577 assert(nested_depth >= 0);
578 assert(*str >= expr_start && *str < end);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100579 if (quote_char) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100580 assert(string_type == 1 || string_type == 3);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100581 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100582 assert(string_type == 0);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100583 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100584
585 ch = **str;
586 /* Nowhere inside an expression is a backslash allowed. */
587 if (ch == '\\') {
588 /* Error: can't include a backslash character, inside
589 parens or strings or not. */
590 RAISE_SYNTAX_ERROR(
591 "f-string expression part "
592 "cannot include a backslash");
593 goto error;
594 }
595 if (quote_char) {
596 /* We're inside a string. See if we're at the end. */
597 /* This code needs to implement the same non-error logic
598 as tok_get from tokenizer.c, at the letter_quote
599 label. To actually share that code would be a
600 nightmare. But, it's unlikely to change and is small,
601 so duplicate it here. Note we don't need to catch all
602 of the errors, since they'll be caught when parsing the
603 expression. We just need to match the non-error
604 cases. Thus we can ignore \n in single-quoted strings,
605 for example. Or non-terminated strings. */
606 if (ch == quote_char) {
607 /* Does this match the string_type (single or triple
608 quoted)? */
609 if (string_type == 3) {
610 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
611 /* We're at the end of a triple quoted string. */
612 *str += 2;
613 string_type = 0;
614 quote_char = 0;
615 continue;
616 }
617 } else {
618 /* We're at the end of a normal string. */
619 quote_char = 0;
620 string_type = 0;
621 continue;
622 }
623 }
624 } else if (ch == '\'' || ch == '"') {
625 /* Is this a triple quoted string? */
626 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
627 string_type = 3;
628 *str += 2;
629 } else {
630 /* Start of a normal string. */
631 string_type = 1;
632 }
633 /* Start looking for the end of the string. */
634 quote_char = ch;
635 } else if (ch == '[' || ch == '{' || ch == '(') {
636 if (nested_depth >= MAXLEVEL) {
637 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
638 goto error;
639 }
640 parenstack[nested_depth] = ch;
641 nested_depth++;
642 } else if (ch == '#') {
643 /* Error: can't include a comment character, inside parens
644 or not. */
645 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
646 goto error;
647 } else if (nested_depth == 0 &&
648 (ch == '!' || ch == ':' || ch == '}' ||
649 ch == '=' || ch == '>' || ch == '<')) {
650 /* See if there's a next character. */
651 if (*str+1 < end) {
652 char next = *(*str+1);
653
654 /* For "!=". since '=' is not an allowed conversion character,
655 nothing is lost in this test. */
656 if ((ch == '!' && next == '=') || /* != */
657 (ch == '=' && next == '=') || /* == */
658 (ch == '<' && next == '=') || /* <= */
659 (ch == '>' && next == '=') /* >= */
660 ) {
661 *str += 1;
662 continue;
663 }
664 /* Don't get out of the loop for these, if they're single
665 chars (not part of 2-char tokens). If by themselves, they
666 don't end an expression (unlike say '!'). */
667 if (ch == '>' || ch == '<') {
668 continue;
669 }
670 }
671
672 /* Normal way out of this loop. */
673 break;
674 } else if (ch == ']' || ch == '}' || ch == ')') {
675 if (!nested_depth) {
676 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
677 goto error;
678 }
679 nested_depth--;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100680 int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100681 if (!((opening == '(' && ch == ')') ||
682 (opening == '[' && ch == ']') ||
683 (opening == '{' && ch == '}')))
684 {
685 RAISE_SYNTAX_ERROR(
686 "f-string: closing parenthesis '%c' "
687 "does not match opening parenthesis '%c'",
688 ch, opening);
689 goto error;
690 }
691 } else {
692 /* Just consume this char and loop around. */
693 }
694 }
695 expr_end = *str;
696 /* If we leave this loop in a string or with mismatched parens, we
697 don't care. We'll get a syntax error when compiling the
698 expression. But, we can produce a better error message, so
699 let's just do that.*/
700 if (quote_char) {
701 RAISE_SYNTAX_ERROR("f-string: unterminated string");
702 goto error;
703 }
704 if (nested_depth) {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100705 int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100706 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
707 goto error;
708 }
709
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100710 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100711 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100712 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100713
714 /* Compile the expression as soon as possible, so we show errors
715 related to the expression before errors related to the
716 conversion or format_spec. */
717 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100718 if (!simple_expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100719 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100720 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100721
722 /* Check for =, which puts the text value of the expression in
723 expr_text. */
724 if (**str == '=') {
Pablo Galindo9b838292020-05-27 22:01:11 +0100725 if (p->feature_version < 8) {
726 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
727 "only supported in Python 3.8 and greater");
728 goto error;
729 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100730 *str += 1;
731
732 /* Skip over ASCII whitespace. No need to test for end of string
733 here, since we know there's at least a trailing quote somewhere
734 ahead. */
735 while (Py_ISSPACE(**str)) {
736 *str += 1;
737 }
738
739 /* Set *expr_text to the text of the expression. */
740 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
741 if (!*expr_text) {
742 goto error;
743 }
744 }
745
746 /* Check for a conversion char, if present. */
747 if (**str == '!') {
748 *str += 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100749 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100750 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100751 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100752
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100753 conversion = (unsigned char)**str;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100754 *str += 1;
755
756 /* Validate the conversion. */
757 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
758 RAISE_SYNTAX_ERROR(
759 "f-string: invalid conversion character: "
760 "expected 's', 'r', or 'a'");
761 goto error;
762 }
763
764 }
765
766 /* Check for the format spec, if present. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100767 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100768 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100769 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100770 if (**str == ':') {
771 *str += 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100772 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100773 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100774 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100775
776 /* Parse the format spec. */
777 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
778 first_token, t, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100779 if (!format_spec) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100780 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100781 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100782 }
783
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100784 if (*str >= end || **str != '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100785 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100786 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100787
788 /* We're at a right brace. Consume it. */
789 assert(*str < end);
790 assert(**str == '}');
791 *str += 1;
792
793 /* If we're in = mode (detected by non-NULL expr_text), and have no format
794 spec and no explicit conversion, set the conversion to 'r'. */
795 if (*expr_text && format_spec == NULL && conversion == -1) {
796 conversion = 'r';
797 }
798
799 /* And now create the FormattedValue node that represents this
800 entire expression with the conversion and format spec. */
801 //TODO: Fix this
802 *expression = FormattedValue(simple_expression, conversion,
803 format_spec, first_token->lineno,
804 first_token->col_offset, last_token->end_lineno,
805 last_token->end_col_offset, p->arena);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100806 if (!*expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100807 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100808 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100809
810 return 0;
811
812unexpected_end_of_string:
813 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
814 /* Falls through to error. */
815
816error:
817 Py_XDECREF(*expr_text);
818 return -1;
819
820}
821
822/* Return -1 on error.
823
824 Return 0 if we have a literal (possible zero length) and an
825 expression (zero length if at the end of the string.
826
827 Return 1 if we have a literal, but no expression, and we want the
828 caller to call us again. This is used to deal with doubled
829 braces.
830
831 When called multiple times on the string 'a{{b{0}c', this function
832 will return:
833
834 1. the literal 'a{' with no expression, and a return value
835 of 1. Despite the fact that there's no expression, the return
836 value of 1 means we're not finished yet.
837
838 2. the literal 'b' and the expression '0', with a return value of
839 0. The fact that there's an expression means we're not finished.
840
841 3. literal 'c' with no expression and a return value of 0. The
842 combination of the return value of 0 with no expression means
843 we're finished.
844*/
845static int
846fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
847 int recurse_lvl, PyObject **literal,
848 PyObject **expr_text, expr_ty *expression,
849 Token *first_token, Token *t, Token *last_token)
850{
851 int result;
852
853 assert(*literal == NULL && *expression == NULL);
854
855 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300856 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100857 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100858 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100859 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100860
861 assert(result == 0 || result == 1);
862
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100863 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100864 /* We have a literal, but don't look at the expression. */
865 return 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100866 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100867
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100868 if (*str >= end || **str == '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100869 /* We're at the end of the string or the end of a nested
870 f-string: no expression. The top-level error case where we
871 expect to be at the end of the string but we're at a '}' is
872 handled later. */
873 return 0;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100874 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100875
876 /* We must now be the start of an expression, on a '{'. */
877 assert(**str == '{');
878
879 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100880 expression, first_token, t, last_token) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100881 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100882 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100883
884 return 0;
885
886error:
887 Py_CLEAR(*literal);
888 return -1;
889}
890
891#ifdef NDEBUG
892#define ExprList_check_invariants(l)
893#else
894static void
895ExprList_check_invariants(ExprList *l)
896{
897 /* Check our invariants. Make sure this object is "live", and
898 hasn't been deallocated. */
899 assert(l->size >= 0);
900 assert(l->p != NULL);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100901 if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100902 assert(l->data == l->p);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100903 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100904}
905#endif
906
907static void
908ExprList_Init(ExprList *l)
909{
910 l->allocated = EXPRLIST_N_CACHED;
911 l->size = 0;
912
913 /* Until we start allocating dynamically, p points to data. */
914 l->p = l->data;
915
916 ExprList_check_invariants(l);
917}
918
919static int
920ExprList_Append(ExprList *l, expr_ty exp)
921{
922 ExprList_check_invariants(l);
923 if (l->size >= l->allocated) {
924 /* We need to alloc (or realloc) the memory. */
925 Py_ssize_t new_size = l->allocated * 2;
926
927 /* See if we've ever allocated anything dynamically. */
928 if (l->p == l->data) {
929 Py_ssize_t i;
930 /* We're still using the cached data. Switch to
931 alloc-ing. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300932 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100933 if (!l->p) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100934 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100935 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100936 /* Copy the cached data into the new buffer. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100937 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100938 l->p[i] = l->data[i];
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100939 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100940 } else {
941 /* Just realloc. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300942 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100943 if (!tmp) {
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300944 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100945 l->p = NULL;
946 return -1;
947 }
948 l->p = tmp;
949 }
950
951 l->allocated = new_size;
952 assert(l->allocated == 2 * l->size);
953 }
954
955 l->p[l->size++] = exp;
956
957 ExprList_check_invariants(l);
958 return 0;
959}
960
961static void
962ExprList_Dealloc(ExprList *l)
963{
964 ExprList_check_invariants(l);
965
966 /* If there's been an error, or we've never dynamically allocated,
967 do nothing. */
968 if (!l->p || l->p == l->data) {
969 /* Do nothing. */
970 } else {
971 /* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300972 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100973 }
974 l->p = NULL;
975 l->size = -1;
976}
977
978static asdl_seq *
979ExprList_Finish(ExprList *l, PyArena *arena)
980{
981 asdl_seq *seq;
982
983 ExprList_check_invariants(l);
984
985 /* Allocate the asdl_seq and copy the expressions in to it. */
986 seq = _Py_asdl_seq_new(l->size, arena);
987 if (seq) {
988 Py_ssize_t i;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100989 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100990 asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100991 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100992 }
993 ExprList_Dealloc(l);
994 return seq;
995}
996
997#ifdef NDEBUG
998#define FstringParser_check_invariants(state)
999#else
1000static void
1001FstringParser_check_invariants(FstringParser *state)
1002{
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001003 if (state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001004 assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001005 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001006 ExprList_check_invariants(&state->expr_list);
1007}
1008#endif
1009
1010void
1011_PyPegen_FstringParser_Init(FstringParser *state)
1012{
1013 state->last_str = NULL;
1014 state->fmode = 0;
1015 ExprList_Init(&state->expr_list);
1016 FstringParser_check_invariants(state);
1017}
1018
1019void
1020_PyPegen_FstringParser_Dealloc(FstringParser *state)
1021{
1022 FstringParser_check_invariants(state);
1023
1024 Py_XDECREF(state->last_str);
1025 ExprList_Dealloc(&state->expr_list);
1026}
1027
1028/* Make a Constant node, but decref the PyUnicode object being added. */
1029static expr_ty
1030make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1031{
1032 PyObject *s = *str;
1033 PyObject *kind = NULL;
1034 *str = NULL;
1035 assert(PyUnicode_CheckExact(s));
1036 if (PyArena_AddPyObject(p->arena, s) < 0) {
1037 Py_DECREF(s);
1038 return NULL;
1039 }
1040 const char* the_str = PyBytes_AsString(first_token->bytes);
1041 if (the_str && the_str[0] == 'u') {
1042 kind = _PyPegen_new_identifier(p, "u");
1043 }
1044
1045 if (kind == NULL && PyErr_Occurred()) {
1046 return NULL;
1047 }
1048
1049 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1050 last_token->end_lineno, last_token->end_col_offset, p->arena);
1051
1052}
1053
1054
1055/* Add a non-f-string (that is, a regular literal string). str is
1056 decref'd. */
1057int
1058_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1059{
1060 FstringParser_check_invariants(state);
1061
1062 assert(PyUnicode_CheckExact(str));
1063
1064 if (PyUnicode_GET_LENGTH(str) == 0) {
1065 Py_DECREF(str);
1066 return 0;
1067 }
1068
1069 if (!state->last_str) {
1070 /* We didn't have a string before, so just remember this one. */
1071 state->last_str = str;
1072 } else {
1073 /* Concatenate this with the previous string. */
1074 PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001075 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001076 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001077 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001078 }
1079 FstringParser_check_invariants(state);
1080 return 0;
1081}
1082
1083/* Parse an f-string. The f-string is in *str to end, with no
1084 'f' or quotes. */
1085int
1086_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1087 const char *end, int raw, int recurse_lvl,
1088 Token *first_token, Token* t, Token *last_token)
1089{
1090 FstringParser_check_invariants(state);
1091 state->fmode = 1;
1092
1093 /* Parse the f-string. */
1094 while (1) {
1095 PyObject *literal = NULL;
1096 PyObject *expr_text = NULL;
1097 expr_ty expression = NULL;
1098
1099 /* If there's a zero length literal in front of the
1100 expression, literal will be NULL. If we're at the end of
1101 the f-string, expression will be NULL (unless result == 1,
1102 see below). */
1103 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1104 &literal, &expr_text,
1105 &expression, first_token, t, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001106 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001107 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001108 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001109
1110 /* Add the literal, if any. */
1111 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1112 Py_XDECREF(expr_text);
1113 return -1;
1114 }
1115 /* Add the expr_text, if any. */
1116 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1117 return -1;
1118 }
1119
1120 /* We've dealt with the literal and expr_text, their ownership has
1121 been transferred to the state object. Don't look at them again. */
1122
1123 /* See if we should just loop around to get the next literal
1124 and expression, while ignoring the expression this
1125 time. This is used for un-doubling braces, as an
1126 optimization. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001127 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001128 continue;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001129 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001130
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001131 if (!expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001132 /* We're done with this f-string. */
1133 break;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001134 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001135
1136 /* We know we have an expression. Convert any existing string
1137 to a Constant node. */
1138 if (!state->last_str) {
1139 /* Do nothing. No previous literal. */
1140 } else {
1141 /* Convert the existing last_str literal to a Constant node. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001142 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1143 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001144 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001145 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001146 }
1147
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001148 if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001149 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001150 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001151 }
1152
1153 /* If recurse_lvl is zero, then we must be at the end of the
1154 string. Otherwise, we must be at a right brace. */
1155
1156 if (recurse_lvl == 0 && *str < end-1) {
1157 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1158 return -1;
1159 }
1160 if (recurse_lvl != 0 && **str != '}') {
1161 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1162 return -1;
1163 }
1164
1165 FstringParser_check_invariants(state);
1166 return 0;
1167}
1168
1169/* Convert the partial state reflected in last_str and expr_list to an
1170 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1171expr_ty
1172_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1173 Token *last_token)
1174{
1175 asdl_seq *seq;
1176
1177 FstringParser_check_invariants(state);
1178
1179 /* If we're just a constant string with no expressions, return
1180 that. */
1181 if (!state->fmode) {
1182 assert(!state->expr_list.size);
1183 if (!state->last_str) {
1184 /* Create a zero length string. */
1185 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001186 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001187 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001188 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001189 }
1190 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1191 }
1192
1193 /* Create a Constant node out of last_str, if needed. It will be the
1194 last node in our expression list. */
1195 if (state->last_str) {
1196 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001197 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001198 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001199 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001200 }
1201 /* This has already been freed. */
1202 assert(state->last_str == NULL);
1203
1204 seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001205 if (!seq) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001206 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001207 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001208
1209 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1210 last_token->end_lineno, last_token->end_col_offset, p->arena);
1211
1212error:
1213 _PyPegen_FstringParser_Dealloc(state);
1214 return NULL;
1215}
1216
1217/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1218 at end, parse it into an expr_ty. Return NULL on error. Adjust
1219 str to point past the parsed portion. */
1220static expr_ty
1221fstring_parse(Parser *p, const char **str, const char *end, int raw,
1222 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1223{
1224 FstringParser state;
1225
1226 _PyPegen_FstringParser_Init(&state);
1227 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1228 first_token, t, last_token) < 0) {
1229 _PyPegen_FstringParser_Dealloc(&state);
1230 return NULL;
1231 }
1232
1233 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1234}