blob: fb0c4aff9d3d000720d3692be2238ec4612fb852 [file] [log] [blame]
Miss Islington (bot)961703c2020-07-16 06:25:31 -07001#include <stdbool.h>
2
Pablo Galindoc5fc1562020-04-22 23:29:27 +01003#include <Python.h>
4
5#include "../tokenizer.h"
6#include "pegen.h"
7#include "parse_string.h"
8
9//// STRING HANDLING FUNCTIONS ////
10
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser nodes
// to pass around and the usage of some specialized APIs present only in this
// file (like "_PyPegen_raise_syntax_error").
15
16static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030017warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010018{
19 PyObject *msg =
20 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
21 if (msg == NULL) {
22 return -1;
23 }
24 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030025 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010026 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
27 /* Replace the DeprecationWarning exception with a SyntaxError
28 to get a more accurate error report */
29 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030030
31 /* This is needed, in order for the SyntaxError to point to the token t,
32 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
33 error location, if p->known_err_token is not set. */
34 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010035 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
36 }
37 Py_DECREF(msg);
38 return -1;
39 }
40 Py_DECREF(msg);
41 return 0;
42}
43
44static PyObject *
45decode_utf8(const char **sPtr, const char *end)
46{
Pablo Galindo30b59fd2020-06-15 15:08:00 +010047 const char *s;
48 const char *t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010049 t = s = *sPtr;
50 while (s < end && (*s & 0x80)) {
51 s++;
52 }
53 *sPtr = s;
54 return PyUnicode_DecodeUTF8(t, s - t, NULL);
55}
56
57static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030058decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010059{
Pablo Galindo30b59fd2020-06-15 15:08:00 +010060 PyObject *v;
61 PyObject *u;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010062 char *buf;
63 char *p;
64 const char *end;
65
66 /* check for integer overflow */
67 if (len > SIZE_MAX / 6) {
68 return NULL;
69 }
70 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
71 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
72 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
73 if (u == NULL) {
74 return NULL;
75 }
76 p = buf = PyBytes_AsString(u);
Miss Islington (bot)994c68f2020-11-18 08:01:48 -080077 if (p == NULL) {
78 return NULL;
79 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +010080 end = s + len;
81 while (s < end) {
82 if (*s == '\\') {
83 *p++ = *s++;
84 if (s >= end || *s & 0x80) {
85 strcpy(p, "u005c");
86 p += 5;
87 if (s >= end) {
88 break;
89 }
90 }
91 }
92 if (*s & 0x80) {
93 PyObject *w;
94 int kind;
95 void *data;
Pablo Galindo30b59fd2020-06-15 15:08:00 +010096 Py_ssize_t w_len;
97 Py_ssize_t i;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010098 w = decode_utf8(&s, end);
99 if (w == NULL) {
100 Py_DECREF(u);
101 return NULL;
102 }
103 kind = PyUnicode_KIND(w);
104 data = PyUnicode_DATA(w);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100105 w_len = PyUnicode_GET_LENGTH(w);
106 for (i = 0; i < w_len; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100107 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
108 sprintf(p, "\\U%08x", chr);
109 p += 10;
110 }
111 /* Should be impossible to overflow */
112 assert(p - buf <= PyBytes_GET_SIZE(u));
113 Py_DECREF(w);
114 }
115 else {
116 *p++ = *s++;
117 }
118 }
119 len = p - buf;
120 s = buf;
121
122 const char *first_invalid_escape;
123 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
124
125 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300126 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100127 /* We have not decref u before because first_invalid_escape points
128 inside u. */
129 Py_XDECREF(u);
130 Py_DECREF(v);
131 return NULL;
132 }
133 }
134 Py_XDECREF(u);
135 return v;
136}
137
138static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300139decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100140{
141 const char *first_invalid_escape;
142 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
143 if (result == NULL) {
144 return NULL;
145 }
146
147 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300148 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100149 Py_DECREF(result);
150 return NULL;
151 }
152 }
153 return result;
154}
155
156/* s must include the bracketing quote characters, and r, b, u,
157 &/or f prefixes (if any), and embedded escape sequences (if any).
158 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
159 If the string is an f-string, set *fstr and *fstrlen to the unparsed
160 string object. Return 0 if no errors occurred. */
161int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300162_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
163 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100164{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300165 const char *s = PyBytes_AsString(t->bytes);
166 if (s == NULL) {
167 return -1;
168 }
169
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100170 size_t len;
171 int quote = Py_CHARMASK(*s);
172 int fmode = 0;
173 *bytesmode = 0;
174 *rawmode = 0;
175 *result = NULL;
176 *fstr = NULL;
177 if (Py_ISALPHA(quote)) {
178 while (!*bytesmode || !*rawmode) {
179 if (quote == 'b' || quote == 'B') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100180 quote =(unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100181 *bytesmode = 1;
182 }
183 else if (quote == 'u' || quote == 'U') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100184 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100185 }
186 else if (quote == 'r' || quote == 'R') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100187 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100188 *rawmode = 1;
189 }
190 else if (quote == 'f' || quote == 'F') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100191 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100192 fmode = 1;
193 }
194 else {
195 break;
196 }
197 }
198 }
199
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300200 /* fstrings are only allowed in Python 3.6 and greater */
201 if (fmode && p->feature_version < 6) {
202 p->error_indicator = 1;
203 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
204 return -1;
205 }
206
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100207 if (fmode && *bytesmode) {
208 PyErr_BadInternalCall();
209 return -1;
210 }
211 if (quote != '\'' && quote != '\"') {
212 PyErr_BadInternalCall();
213 return -1;
214 }
215 /* Skip the leading quote char. */
216 s++;
217 len = strlen(s);
218 if (len > INT_MAX) {
219 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
220 return -1;
221 }
222 if (s[--len] != quote) {
223 /* Last quote char must match the first. */
224 PyErr_BadInternalCall();
225 return -1;
226 }
227 if (len >= 4 && s[0] == quote && s[1] == quote) {
228 /* A triple quoted string. We've already skipped one quote at
229 the start and one at the end of the string. Now skip the
230 two at the start. */
231 s += 2;
232 len -= 2;
233 /* And check that the last two match. */
234 if (s[--len] != quote || s[--len] != quote) {
235 PyErr_BadInternalCall();
236 return -1;
237 }
238 }
239
240 if (fmode) {
241 /* Just return the bytes. The caller will parse the resulting
242 string. */
243 *fstr = s;
244 *fstrlen = len;
245 return 0;
246 }
247
248 /* Not an f-string. */
249 /* Avoid invoking escape decoding routines if possible. */
250 *rawmode = *rawmode || strchr(s, '\\') == NULL;
251 if (*bytesmode) {
252 /* Disallow non-ASCII characters. */
253 const char *ch;
254 for (ch = s; *ch; ch++) {
255 if (Py_CHARMASK(*ch) >= 0x80) {
256 RAISE_SYNTAX_ERROR(
257 "bytes can only contain ASCII "
258 "literal characters.");
259 return -1;
260 }
261 }
262 if (*rawmode) {
263 *result = PyBytes_FromStringAndSize(s, len);
264 }
265 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300266 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100267 }
268 }
269 else {
270 if (*rawmode) {
271 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
272 }
273 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300274 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100275 }
276 }
277 return *result == NULL ? -1 : 0;
278}
279
280
281
282// FSTRING STUFF
283
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100284/* Fix locations for the given node and its children.
285
286 `parent` is the enclosing node.
287 `n` is the node which locations are going to be fixed relative to parent.
288 `expr_str` is the child node's string representation, including braces.
289*/
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700290static bool
Pablo Galindodab533d2020-06-28 01:15:28 +0100291fstring_find_expr_location(Token *parent, char *expr_str, int *p_lines, int *p_cols)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100292{
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700293 *p_lines = 0;
294 *p_cols = 0;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100295 if (parent && parent->bytes) {
296 char *parent_str = PyBytes_AsString(parent->bytes);
297 if (!parent_str) {
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700298 return false;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100299 }
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700300 char *substr = strstr(parent_str, expr_str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100301 if (substr) {
302 // The following is needed, in order to correctly shift the column
303 // offset, in the case that (disregarding any whitespace) a newline
304 // immediately follows the opening curly brace of the fstring expression.
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700305 bool newline_after_brace = 1;
306 char *start = substr + 1;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100307 while (start && *start != '}' && *start != '\n') {
308 if (*start != ' ' && *start != '\t' && *start != '\f') {
309 newline_after_brace = 0;
310 break;
311 }
312 start++;
313 }
314
315 // Account for the characters from the last newline character to our
316 // left until the beginning of substr.
317 if (!newline_after_brace) {
318 start = substr;
319 while (start > parent_str && *start != '\n') {
320 start--;
321 }
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700322 *p_cols += (int)(substr - start);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100323 }
324 /* adjust the start based on the number of newlines encountered
325 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100326 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100327 if (*p == '\n') {
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700328 (*p_lines)++;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100329 }
330 }
331 }
332 }
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700333 return true;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100334}
335
336
337/* Compile this expression in to an expr_ty. Add parens around the
338 expression, in order to allow leading spaces in the expression. */
339static expr_ty
340fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
341 Token *t)
342{
343 expr_ty expr = NULL;
344 char *str;
345 Py_ssize_t len;
346 const char *s;
347 expr_ty result = NULL;
348
349 assert(expr_end >= expr_start);
350 assert(*(expr_start-1) == '{');
351 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
352 *expr_end == '=');
353
354 /* If the substring is all whitespace, it's an error. We need to catch this
355 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
356 because turning the expression '' in to '()' would go from being invalid
357 to valid. */
358 for (s = expr_start; s != expr_end; s++) {
359 char c = *s;
360 /* The Python parser ignores only the following whitespace
361 characters (\r already is converted to \n). */
362 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
363 break;
364 }
365 }
366 if (s == expr_end) {
367 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
368 return NULL;
369 }
370
371 len = expr_end - expr_start;
372 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300373 str = PyMem_Malloc(len + 3);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100374 if (str == NULL) {
375 PyErr_NoMemory();
376 return NULL;
377 }
378
Pablo Galindodab533d2020-06-28 01:15:28 +0100379 // The call to fstring_find_expr_location is responsible for finding the column offset
380 // the generated AST nodes need to be shifted to the right, which is equal to the number
381 // of the f-string characters before the expression starts. In order to correctly compute
382 // this offset, strstr gets called in fstring_find_expr_location which only succeeds
383 // if curly braces appear before and after the f-string expression (exactly like they do
384 // in the f-string itself), hence the following lines.
385 str[0] = '{';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100386 memcpy(str+1, expr_start, len);
Pablo Galindodab533d2020-06-28 01:15:28 +0100387 str[len+1] = '}';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100388 str[len+2] = 0;
389
Pablo Galindodab533d2020-06-28 01:15:28 +0100390 int lines, cols;
Miss Islington (bot)961703c2020-07-16 06:25:31 -0700391 if (!fstring_find_expr_location(t, str, &lines, &cols)) {
392 PyMem_FREE(str);
393 return NULL;
394 }
Pablo Galindodab533d2020-06-28 01:15:28 +0100395
Miss Islington (bot)9d8b8c32020-07-16 09:30:19 -0700396 // The parentheses are needed in order to allow for leading whitespace within
Pablo Galindodab533d2020-06-28 01:15:28 +0100397 // the f-string expression. This consequently gets parsed as a group (see the
398 // group rule in python.gram).
399 str[0] = '(';
400 str[len+1] = ')';
401
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100402 struct tok_state* tok = PyTokenizer_FromString(str, 1);
403 if (tok == NULL) {
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300404 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100405 return NULL;
406 }
Lysandros Nikolaou791a46e2020-05-26 04:24:31 +0300407 Py_INCREF(p->tok->filename);
408 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100409
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300410 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
411 NULL, p->arena);
Pablo Galindodab533d2020-06-28 01:15:28 +0100412 p2->starting_lineno = t->lineno + lines - 1;
413 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100414
415 expr = _PyPegen_run_parser(p2);
416
417 if (expr == NULL) {
418 goto exit;
419 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100420 result = expr;
421
422exit:
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300423 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100424 _PyPegen_Parser_Free(p2);
425 PyTokenizer_Free(tok);
426 return result;
427}
428
429/* Return -1 on error.
430
431 Return 0 if we reached the end of the literal.
432
433 Return 1 if we haven't reached the end of the literal, but we want
434 the caller to process the literal up to this point. Used for
435 doubled braces.
436*/
437static int
438fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300439 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100440{
441 /* Get any literal string. It ends when we hit an un-doubled left
442 brace (which isn't part of a unicode name escape such as
443 "\N{EULER CONSTANT}"), or the end of the string. */
444
445 const char *s = *str;
446 const char *literal_start = s;
447 int result = 0;
448
449 assert(*literal == NULL);
450 while (s < end) {
451 char ch = *s++;
452 if (!raw && ch == '\\' && s < end) {
453 ch = *s++;
454 if (ch == 'N') {
455 if (s < end && *s++ == '{') {
456 while (s < end && *s++ != '}') {
457 }
458 continue;
459 }
460 break;
461 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300462 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100463 return -1;
464 }
465 }
466 if (ch == '{' || ch == '}') {
467 /* Check for doubled braces, but only at the top level. If
468 we checked at every level, then f'{0:{3}}' would fail
469 with the two closing braces. */
470 if (recurse_lvl == 0) {
471 if (s < end && *s == ch) {
472 /* We're going to tell the caller that the literal ends
473 here, but that they should continue scanning. But also
474 skip over the second brace when we resume scanning. */
475 *str = s + 1;
476 result = 1;
477 goto done;
478 }
479
480 /* Where a single '{' is the start of a new expression, a
481 single '}' is not allowed. */
482 if (ch == '}') {
483 *str = s - 1;
484 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
485 return -1;
486 }
487 }
488 /* We're either at a '{', which means we're starting another
489 expression; or a '}', which means we're at the end of this
490 f-string (for a nested format_spec). */
491 s--;
492 break;
493 }
494 }
495 *str = s;
496 assert(s <= end);
497 assert(s == end || *s == '{' || *s == '}');
498done:
499 if (literal_start != s) {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100500 if (raw) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100501 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
502 s - literal_start,
503 NULL, NULL);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100504 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100505 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300506 s - literal_start, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100507 }
508 if (!*literal) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100509 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100510 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100511 }
512 return result;
513}
514
515/* Forward declaration because parsing is recursive. */
516static expr_ty
517fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
518 Token *first_token, Token* t, Token *last_token);
519
520/* Parse the f-string at *str, ending at end. We know *str starts an
521 expression (so it must be a '{'). Returns the FormattedValue node, which
522 includes the expression, conversion character, format_spec expression, and
523 optionally the text of the expression (if = is used).
524
525 Note that I don't do a perfect job here: I don't make sure that a
526 closing brace doesn't match an opening paren, for example. It
527 doesn't need to error on all invalid expressions, just correctly
528 find the end of all valid ones. Any errors inside the expression
529 will be caught when we parse it later.
530
531 *expression is set to the expression. For an '=' "debug" expression,
532 *expr_text is set to the debug text (the original text of the expression,
533 including the '=' and any whitespace around it, as a string object). If
534 not a debug expression, *expr_text set to NULL. */
535static int
536fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
537 PyObject **expr_text, expr_ty *expression, Token *first_token,
538 Token *t, Token *last_token)
539{
540 /* Return -1 on error, else 0. */
541
542 const char *expr_start;
543 const char *expr_end;
544 expr_ty simple_expression;
545 expr_ty format_spec = NULL; /* Optional format specifier. */
546 int conversion = -1; /* The conversion char. Use default if not
547 specified, or !r if using = and no format
548 spec. */
549
550 /* 0 if we're not in a string, else the quote char we're trying to
551 match (single or double quote). */
552 char quote_char = 0;
553
554 /* If we're inside a string, 1=normal, 3=triple-quoted. */
555 int string_type = 0;
556
557 /* Keep track of nesting level for braces/parens/brackets in
558 expressions. */
559 Py_ssize_t nested_depth = 0;
560 char parenstack[MAXLEVEL];
561
562 *expr_text = NULL;
563
564 /* Can only nest one level deep. */
565 if (recurse_lvl >= 2) {
566 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
567 goto error;
568 }
569
570 /* The first char must be a left brace, or we wouldn't have gotten
571 here. Skip over it. */
572 assert(**str == '{');
573 *str += 1;
574
575 expr_start = *str;
576 for (; *str < end; (*str)++) {
577 char ch;
578
579 /* Loop invariants. */
580 assert(nested_depth >= 0);
581 assert(*str >= expr_start && *str < end);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100582 if (quote_char) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100583 assert(string_type == 1 || string_type == 3);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100584 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100585 assert(string_type == 0);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100586 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100587
588 ch = **str;
589 /* Nowhere inside an expression is a backslash allowed. */
590 if (ch == '\\') {
591 /* Error: can't include a backslash character, inside
592 parens or strings or not. */
593 RAISE_SYNTAX_ERROR(
594 "f-string expression part "
595 "cannot include a backslash");
596 goto error;
597 }
598 if (quote_char) {
599 /* We're inside a string. See if we're at the end. */
600 /* This code needs to implement the same non-error logic
601 as tok_get from tokenizer.c, at the letter_quote
602 label. To actually share that code would be a
603 nightmare. But, it's unlikely to change and is small,
604 so duplicate it here. Note we don't need to catch all
605 of the errors, since they'll be caught when parsing the
606 expression. We just need to match the non-error
607 cases. Thus we can ignore \n in single-quoted strings,
608 for example. Or non-terminated strings. */
609 if (ch == quote_char) {
610 /* Does this match the string_type (single or triple
611 quoted)? */
612 if (string_type == 3) {
613 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
614 /* We're at the end of a triple quoted string. */
615 *str += 2;
616 string_type = 0;
617 quote_char = 0;
618 continue;
619 }
620 } else {
621 /* We're at the end of a normal string. */
622 quote_char = 0;
623 string_type = 0;
624 continue;
625 }
626 }
627 } else if (ch == '\'' || ch == '"') {
628 /* Is this a triple quoted string? */
629 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
630 string_type = 3;
631 *str += 2;
632 } else {
633 /* Start of a normal string. */
634 string_type = 1;
635 }
636 /* Start looking for the end of the string. */
637 quote_char = ch;
638 } else if (ch == '[' || ch == '{' || ch == '(') {
639 if (nested_depth >= MAXLEVEL) {
640 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
641 goto error;
642 }
643 parenstack[nested_depth] = ch;
644 nested_depth++;
645 } else if (ch == '#') {
646 /* Error: can't include a comment character, inside parens
647 or not. */
648 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
649 goto error;
650 } else if (nested_depth == 0 &&
651 (ch == '!' || ch == ':' || ch == '}' ||
652 ch == '=' || ch == '>' || ch == '<')) {
653 /* See if there's a next character. */
654 if (*str+1 < end) {
655 char next = *(*str+1);
656
657 /* For "!=". since '=' is not an allowed conversion character,
658 nothing is lost in this test. */
659 if ((ch == '!' && next == '=') || /* != */
660 (ch == '=' && next == '=') || /* == */
661 (ch == '<' && next == '=') || /* <= */
662 (ch == '>' && next == '=') /* >= */
663 ) {
664 *str += 1;
665 continue;
666 }
667 /* Don't get out of the loop for these, if they're single
668 chars (not part of 2-char tokens). If by themselves, they
669 don't end an expression (unlike say '!'). */
670 if (ch == '>' || ch == '<') {
671 continue;
672 }
673 }
674
675 /* Normal way out of this loop. */
676 break;
677 } else if (ch == ']' || ch == '}' || ch == ')') {
678 if (!nested_depth) {
679 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
680 goto error;
681 }
682 nested_depth--;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100683 int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100684 if (!((opening == '(' && ch == ')') ||
685 (opening == '[' && ch == ']') ||
686 (opening == '{' && ch == '}')))
687 {
688 RAISE_SYNTAX_ERROR(
689 "f-string: closing parenthesis '%c' "
690 "does not match opening parenthesis '%c'",
691 ch, opening);
692 goto error;
693 }
694 } else {
695 /* Just consume this char and loop around. */
696 }
697 }
698 expr_end = *str;
699 /* If we leave this loop in a string or with mismatched parens, we
700 don't care. We'll get a syntax error when compiling the
701 expression. But, we can produce a better error message, so
702 let's just do that.*/
703 if (quote_char) {
704 RAISE_SYNTAX_ERROR("f-string: unterminated string");
705 goto error;
706 }
707 if (nested_depth) {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100708 int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100709 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
710 goto error;
711 }
712
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100713 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100714 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100715 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100716
717 /* Compile the expression as soon as possible, so we show errors
718 related to the expression before errors related to the
719 conversion or format_spec. */
720 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100721 if (!simple_expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100722 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100723 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100724
725 /* Check for =, which puts the text value of the expression in
726 expr_text. */
727 if (**str == '=') {
Pablo Galindo9b838292020-05-27 22:01:11 +0100728 if (p->feature_version < 8) {
729 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
730 "only supported in Python 3.8 and greater");
731 goto error;
732 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100733 *str += 1;
734
735 /* Skip over ASCII whitespace. No need to test for end of string
736 here, since we know there's at least a trailing quote somewhere
737 ahead. */
738 while (Py_ISSPACE(**str)) {
739 *str += 1;
740 }
741
742 /* Set *expr_text to the text of the expression. */
743 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
744 if (!*expr_text) {
745 goto error;
746 }
747 }
748
749 /* Check for a conversion char, if present. */
750 if (**str == '!') {
751 *str += 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100752 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100753 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100754 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100755
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100756 conversion = (unsigned char)**str;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100757 *str += 1;
758
759 /* Validate the conversion. */
760 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
761 RAISE_SYNTAX_ERROR(
762 "f-string: invalid conversion character: "
763 "expected 's', 'r', or 'a'");
764 goto error;
765 }
766
767 }
768
769 /* Check for the format spec, if present. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100770 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100771 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100772 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100773 if (**str == ':') {
774 *str += 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100775 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100776 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100777 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100778
779 /* Parse the format spec. */
780 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
781 first_token, t, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100782 if (!format_spec) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100783 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100784 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100785 }
786
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100787 if (*str >= end || **str != '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100788 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100789 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100790
791 /* We're at a right brace. Consume it. */
792 assert(*str < end);
793 assert(**str == '}');
794 *str += 1;
795
796 /* If we're in = mode (detected by non-NULL expr_text), and have no format
797 spec and no explicit conversion, set the conversion to 'r'. */
798 if (*expr_text && format_spec == NULL && conversion == -1) {
799 conversion = 'r';
800 }
801
802 /* And now create the FormattedValue node that represents this
803 entire expression with the conversion and format spec. */
804 //TODO: Fix this
805 *expression = FormattedValue(simple_expression, conversion,
806 format_spec, first_token->lineno,
807 first_token->col_offset, last_token->end_lineno,
808 last_token->end_col_offset, p->arena);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100809 if (!*expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100810 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100811 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100812
813 return 0;
814
815unexpected_end_of_string:
816 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
817 /* Falls through to error. */
818
819error:
820 Py_XDECREF(*expr_text);
821 return -1;
822
823}
824
825/* Return -1 on error.
826
827 Return 0 if we have a literal (possible zero length) and an
828 expression (zero length if at the end of the string.
829
830 Return 1 if we have a literal, but no expression, and we want the
831 caller to call us again. This is used to deal with doubled
832 braces.
833
834 When called multiple times on the string 'a{{b{0}c', this function
835 will return:
836
837 1. the literal 'a{' with no expression, and a return value
838 of 1. Despite the fact that there's no expression, the return
839 value of 1 means we're not finished yet.
840
841 2. the literal 'b' and the expression '0', with a return value of
842 0. The fact that there's an expression means we're not finished.
843
844 3. literal 'c' with no expression and a return value of 0. The
845 combination of the return value of 0 with no expression means
846 we're finished.
847*/
848static int
849fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
850 int recurse_lvl, PyObject **literal,
851 PyObject **expr_text, expr_ty *expression,
852 Token *first_token, Token *t, Token *last_token)
853{
854 int result;
855
856 assert(*literal == NULL && *expression == NULL);
857
858 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300859 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100860 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100861 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100862 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100863
864 assert(result == 0 || result == 1);
865
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100866 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100867 /* We have a literal, but don't look at the expression. */
868 return 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100869 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100870
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100871 if (*str >= end || **str == '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100872 /* We're at the end of the string or the end of a nested
873 f-string: no expression. The top-level error case where we
874 expect to be at the end of the string but we're at a '}' is
875 handled later. */
876 return 0;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100877 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100878
879 /* We must now be the start of an expression, on a '{'. */
880 assert(**str == '{');
881
882 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100883 expression, first_token, t, last_token) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100884 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100885 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100886
887 return 0;
888
889error:
890 Py_CLEAR(*literal);
891 return -1;
892}
893
894#ifdef NDEBUG
895#define ExprList_check_invariants(l)
896#else
897static void
898ExprList_check_invariants(ExprList *l)
899{
900 /* Check our invariants. Make sure this object is "live", and
901 hasn't been deallocated. */
902 assert(l->size >= 0);
903 assert(l->p != NULL);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100904 if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100905 assert(l->data == l->p);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100906 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100907}
908#endif
909
910static void
911ExprList_Init(ExprList *l)
912{
913 l->allocated = EXPRLIST_N_CACHED;
914 l->size = 0;
915
916 /* Until we start allocating dynamically, p points to data. */
917 l->p = l->data;
918
919 ExprList_check_invariants(l);
920}
921
922static int
923ExprList_Append(ExprList *l, expr_ty exp)
924{
925 ExprList_check_invariants(l);
926 if (l->size >= l->allocated) {
927 /* We need to alloc (or realloc) the memory. */
928 Py_ssize_t new_size = l->allocated * 2;
929
930 /* See if we've ever allocated anything dynamically. */
931 if (l->p == l->data) {
932 Py_ssize_t i;
933 /* We're still using the cached data. Switch to
934 alloc-ing. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300935 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100936 if (!l->p) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100937 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100938 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100939 /* Copy the cached data into the new buffer. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100940 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100941 l->p[i] = l->data[i];
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100942 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100943 } else {
944 /* Just realloc. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300945 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100946 if (!tmp) {
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300947 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100948 l->p = NULL;
949 return -1;
950 }
951 l->p = tmp;
952 }
953
954 l->allocated = new_size;
955 assert(l->allocated == 2 * l->size);
956 }
957
958 l->p[l->size++] = exp;
959
960 ExprList_check_invariants(l);
961 return 0;
962}
963
964static void
965ExprList_Dealloc(ExprList *l)
966{
967 ExprList_check_invariants(l);
968
969 /* If there's been an error, or we've never dynamically allocated,
970 do nothing. */
971 if (!l->p || l->p == l->data) {
972 /* Do nothing. */
973 } else {
974 /* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300975 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100976 }
977 l->p = NULL;
978 l->size = -1;
979}
980
981static asdl_seq *
982ExprList_Finish(ExprList *l, PyArena *arena)
983{
984 asdl_seq *seq;
985
986 ExprList_check_invariants(l);
987
988 /* Allocate the asdl_seq and copy the expressions in to it. */
989 seq = _Py_asdl_seq_new(l->size, arena);
990 if (seq) {
991 Py_ssize_t i;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100992 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100993 asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100994 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100995 }
996 ExprList_Dealloc(l);
997 return seq;
998}
999
1000#ifdef NDEBUG
1001#define FstringParser_check_invariants(state)
1002#else
1003static void
1004FstringParser_check_invariants(FstringParser *state)
1005{
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001006 if (state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001007 assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001008 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001009 ExprList_check_invariants(&state->expr_list);
1010}
1011#endif
1012
1013void
1014_PyPegen_FstringParser_Init(FstringParser *state)
1015{
1016 state->last_str = NULL;
1017 state->fmode = 0;
1018 ExprList_Init(&state->expr_list);
1019 FstringParser_check_invariants(state);
1020}
1021
1022void
1023_PyPegen_FstringParser_Dealloc(FstringParser *state)
1024{
1025 FstringParser_check_invariants(state);
1026
1027 Py_XDECREF(state->last_str);
1028 ExprList_Dealloc(&state->expr_list);
1029}
1030
1031/* Make a Constant node, but decref the PyUnicode object being added. */
1032static expr_ty
1033make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1034{
1035 PyObject *s = *str;
1036 PyObject *kind = NULL;
1037 *str = NULL;
1038 assert(PyUnicode_CheckExact(s));
1039 if (PyArena_AddPyObject(p->arena, s) < 0) {
1040 Py_DECREF(s);
1041 return NULL;
1042 }
1043 const char* the_str = PyBytes_AsString(first_token->bytes);
1044 if (the_str && the_str[0] == 'u') {
1045 kind = _PyPegen_new_identifier(p, "u");
1046 }
1047
1048 if (kind == NULL && PyErr_Occurred()) {
1049 return NULL;
1050 }
1051
1052 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1053 last_token->end_lineno, last_token->end_col_offset, p->arena);
1054
1055}
1056
1057
1058/* Add a non-f-string (that is, a regular literal string). str is
1059 decref'd. */
1060int
1061_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1062{
1063 FstringParser_check_invariants(state);
1064
1065 assert(PyUnicode_CheckExact(str));
1066
1067 if (PyUnicode_GET_LENGTH(str) == 0) {
1068 Py_DECREF(str);
1069 return 0;
1070 }
1071
1072 if (!state->last_str) {
1073 /* We didn't have a string before, so just remember this one. */
1074 state->last_str = str;
1075 } else {
1076 /* Concatenate this with the previous string. */
1077 PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001078 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001079 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001080 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001081 }
1082 FstringParser_check_invariants(state);
1083 return 0;
1084}
1085
1086/* Parse an f-string. The f-string is in *str to end, with no
1087 'f' or quotes. */
1088int
1089_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1090 const char *end, int raw, int recurse_lvl,
1091 Token *first_token, Token* t, Token *last_token)
1092{
1093 FstringParser_check_invariants(state);
1094 state->fmode = 1;
1095
1096 /* Parse the f-string. */
1097 while (1) {
1098 PyObject *literal = NULL;
1099 PyObject *expr_text = NULL;
1100 expr_ty expression = NULL;
1101
1102 /* If there's a zero length literal in front of the
1103 expression, literal will be NULL. If we're at the end of
1104 the f-string, expression will be NULL (unless result == 1,
1105 see below). */
1106 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1107 &literal, &expr_text,
1108 &expression, first_token, t, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001109 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001110 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001111 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001112
1113 /* Add the literal, if any. */
1114 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1115 Py_XDECREF(expr_text);
1116 return -1;
1117 }
1118 /* Add the expr_text, if any. */
1119 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1120 return -1;
1121 }
1122
1123 /* We've dealt with the literal and expr_text, their ownership has
1124 been transferred to the state object. Don't look at them again. */
1125
1126 /* See if we should just loop around to get the next literal
1127 and expression, while ignoring the expression this
1128 time. This is used for un-doubling braces, as an
1129 optimization. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001130 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001131 continue;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001132 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001133
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001134 if (!expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001135 /* We're done with this f-string. */
1136 break;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001137 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001138
1139 /* We know we have an expression. Convert any existing string
1140 to a Constant node. */
1141 if (!state->last_str) {
1142 /* Do nothing. No previous literal. */
1143 } else {
1144 /* Convert the existing last_str literal to a Constant node. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001145 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1146 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001147 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001148 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001149 }
1150
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001151 if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001152 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001153 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001154 }
1155
1156 /* If recurse_lvl is zero, then we must be at the end of the
1157 string. Otherwise, we must be at a right brace. */
1158
1159 if (recurse_lvl == 0 && *str < end-1) {
1160 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1161 return -1;
1162 }
1163 if (recurse_lvl != 0 && **str != '}') {
1164 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1165 return -1;
1166 }
1167
1168 FstringParser_check_invariants(state);
1169 return 0;
1170}
1171
1172/* Convert the partial state reflected in last_str and expr_list to an
1173 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1174expr_ty
1175_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1176 Token *last_token)
1177{
1178 asdl_seq *seq;
1179
1180 FstringParser_check_invariants(state);
1181
1182 /* If we're just a constant string with no expressions, return
1183 that. */
1184 if (!state->fmode) {
1185 assert(!state->expr_list.size);
1186 if (!state->last_str) {
1187 /* Create a zero length string. */
1188 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001189 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001190 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001191 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001192 }
1193 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1194 }
1195
1196 /* Create a Constant node out of last_str, if needed. It will be the
1197 last node in our expression list. */
1198 if (state->last_str) {
1199 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001200 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001201 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001202 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001203 }
1204 /* This has already been freed. */
1205 assert(state->last_str == NULL);
1206
1207 seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001208 if (!seq) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001209 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001210 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001211
1212 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1213 last_token->end_lineno, last_token->end_col_offset, p->arena);
1214
1215error:
1216 _PyPegen_FstringParser_Dealloc(state);
1217 return NULL;
1218}
1219
1220/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1221 at end, parse it into an expr_ty. Return NULL on error. Adjust
1222 str to point past the parsed portion. */
1223static expr_ty
1224fstring_parse(Parser *p, const char **str, const char *end, int raw,
1225 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1226{
1227 FstringParser state;
1228
1229 _PyPegen_FstringParser_Init(&state);
1230 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1231 first_token, t, last_token) < 0) {
1232 _PyPegen_FstringParser_Dealloc(&state);
1233 return NULL;
1234 }
1235
1236 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1237}