blob: 88b10c3f494ccee363df00c4783ade4fcf950a45 [file] [log] [blame]
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001#include <Python.h>
2
3#include "../tokenizer.h"
4#include "pegen.h"
5#include "parse_string.h"
6
//// STRING HANDLING FUNCTIONS ////

// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser nodes
// to pass around, and the usage of some specialized APIs present only in this
// file (like "_PyPegen_raise_syntax_error").
13
14static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030015warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010016{
17 PyObject *msg =
18 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
19 if (msg == NULL) {
20 return -1;
21 }
22 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030023 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010024 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
25 /* Replace the DeprecationWarning exception with a SyntaxError
26 to get a more accurate error report */
27 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030028
29 /* This is needed, in order for the SyntaxError to point to the token t,
30 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
31 error location, if p->known_err_token is not set. */
32 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010033 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
34 }
35 Py_DECREF(msg);
36 return -1;
37 }
38 Py_DECREF(msg);
39 return 0;
40}
41
42static PyObject *
43decode_utf8(const char **sPtr, const char *end)
44{
Pablo Galindo30b59fd2020-06-15 15:08:00 +010045 const char *s;
46 const char *t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010047 t = s = *sPtr;
48 while (s < end && (*s & 0x80)) {
49 s++;
50 }
51 *sPtr = s;
52 return PyUnicode_DecodeUTF8(t, s - t, NULL);
53}
54
55static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030056decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010057{
Pablo Galindo30b59fd2020-06-15 15:08:00 +010058 PyObject *v;
59 PyObject *u;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010060 char *buf;
61 char *p;
62 const char *end;
63
64 /* check for integer overflow */
65 if (len > SIZE_MAX / 6) {
66 return NULL;
67 }
68 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
69 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
70 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
71 if (u == NULL) {
72 return NULL;
73 }
74 p = buf = PyBytes_AsString(u);
75 end = s + len;
76 while (s < end) {
77 if (*s == '\\') {
78 *p++ = *s++;
79 if (s >= end || *s & 0x80) {
80 strcpy(p, "u005c");
81 p += 5;
82 if (s >= end) {
83 break;
84 }
85 }
86 }
87 if (*s & 0x80) {
88 PyObject *w;
89 int kind;
90 void *data;
Pablo Galindo30b59fd2020-06-15 15:08:00 +010091 Py_ssize_t w_len;
92 Py_ssize_t i;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010093 w = decode_utf8(&s, end);
94 if (w == NULL) {
95 Py_DECREF(u);
96 return NULL;
97 }
98 kind = PyUnicode_KIND(w);
99 data = PyUnicode_DATA(w);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100100 w_len = PyUnicode_GET_LENGTH(w);
101 for (i = 0; i < w_len; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100102 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
103 sprintf(p, "\\U%08x", chr);
104 p += 10;
105 }
106 /* Should be impossible to overflow */
107 assert(p - buf <= PyBytes_GET_SIZE(u));
108 Py_DECREF(w);
109 }
110 else {
111 *p++ = *s++;
112 }
113 }
114 len = p - buf;
115 s = buf;
116
117 const char *first_invalid_escape;
118 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
119
120 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300121 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100122 /* We have not decref u before because first_invalid_escape points
123 inside u. */
124 Py_XDECREF(u);
125 Py_DECREF(v);
126 return NULL;
127 }
128 }
129 Py_XDECREF(u);
130 return v;
131}
132
133static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300134decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100135{
136 const char *first_invalid_escape;
137 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
138 if (result == NULL) {
139 return NULL;
140 }
141
142 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300143 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100144 Py_DECREF(result);
145 return NULL;
146 }
147 }
148 return result;
149}
150
151/* s must include the bracketing quote characters, and r, b, u,
152 &/or f prefixes (if any), and embedded escape sequences (if any).
153 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
154 If the string is an f-string, set *fstr and *fstrlen to the unparsed
155 string object. Return 0 if no errors occurred. */
156int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
158 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100159{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300160 const char *s = PyBytes_AsString(t->bytes);
161 if (s == NULL) {
162 return -1;
163 }
164
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100165 size_t len;
166 int quote = Py_CHARMASK(*s);
167 int fmode = 0;
168 *bytesmode = 0;
169 *rawmode = 0;
170 *result = NULL;
171 *fstr = NULL;
172 if (Py_ISALPHA(quote)) {
173 while (!*bytesmode || !*rawmode) {
174 if (quote == 'b' || quote == 'B') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100175 quote =(unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100176 *bytesmode = 1;
177 }
178 else if (quote == 'u' || quote == 'U') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100179 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100180 }
181 else if (quote == 'r' || quote == 'R') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100182 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100183 *rawmode = 1;
184 }
185 else if (quote == 'f' || quote == 'F') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100186 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100187 fmode = 1;
188 }
189 else {
190 break;
191 }
192 }
193 }
194
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300195 /* fstrings are only allowed in Python 3.6 and greater */
196 if (fmode && p->feature_version < 6) {
197 p->error_indicator = 1;
198 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
199 return -1;
200 }
201
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100202 if (fmode && *bytesmode) {
203 PyErr_BadInternalCall();
204 return -1;
205 }
206 if (quote != '\'' && quote != '\"') {
207 PyErr_BadInternalCall();
208 return -1;
209 }
210 /* Skip the leading quote char. */
211 s++;
212 len = strlen(s);
213 if (len > INT_MAX) {
214 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
215 return -1;
216 }
217 if (s[--len] != quote) {
218 /* Last quote char must match the first. */
219 PyErr_BadInternalCall();
220 return -1;
221 }
222 if (len >= 4 && s[0] == quote && s[1] == quote) {
223 /* A triple quoted string. We've already skipped one quote at
224 the start and one at the end of the string. Now skip the
225 two at the start. */
226 s += 2;
227 len -= 2;
228 /* And check that the last two match. */
229 if (s[--len] != quote || s[--len] != quote) {
230 PyErr_BadInternalCall();
231 return -1;
232 }
233 }
234
235 if (fmode) {
236 /* Just return the bytes. The caller will parse the resulting
237 string. */
238 *fstr = s;
239 *fstrlen = len;
240 return 0;
241 }
242
243 /* Not an f-string. */
244 /* Avoid invoking escape decoding routines if possible. */
245 *rawmode = *rawmode || strchr(s, '\\') == NULL;
246 if (*bytesmode) {
247 /* Disallow non-ASCII characters. */
248 const char *ch;
249 for (ch = s; *ch; ch++) {
250 if (Py_CHARMASK(*ch) >= 0x80) {
251 RAISE_SYNTAX_ERROR(
252 "bytes can only contain ASCII "
253 "literal characters.");
254 return -1;
255 }
256 }
257 if (*rawmode) {
258 *result = PyBytes_FromStringAndSize(s, len);
259 }
260 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300261 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100262 }
263 }
264 else {
265 if (*rawmode) {
266 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
267 }
268 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300269 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100270 }
271 }
272 return *result == NULL ? -1 : 0;
273}
274
275
276
277// FSTRING STUFF
278
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100279/* Fix locations for the given node and its children.
280
281 `parent` is the enclosing node.
282 `n` is the node which locations are going to be fixed relative to parent.
283 `expr_str` is the child node's string representation, including braces.
284*/
285static void
Pablo Galindodab533d2020-06-28 01:15:28 +0100286fstring_find_expr_location(Token *parent, char *expr_str, int *p_lines, int *p_cols)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100287{
288 char *substr = NULL;
289 char *start;
290 int lines = 0;
291 int cols = 0;
292
293 if (parent && parent->bytes) {
294 char *parent_str = PyBytes_AsString(parent->bytes);
295 if (!parent_str) {
296 return;
297 }
298 substr = strstr(parent_str, expr_str);
299 if (substr) {
300 // The following is needed, in order to correctly shift the column
301 // offset, in the case that (disregarding any whitespace) a newline
302 // immediately follows the opening curly brace of the fstring expression.
303 int newline_after_brace = 1;
304 start = substr + 1;
305 while (start && *start != '}' && *start != '\n') {
306 if (*start != ' ' && *start != '\t' && *start != '\f') {
307 newline_after_brace = 0;
308 break;
309 }
310 start++;
311 }
312
313 // Account for the characters from the last newline character to our
314 // left until the beginning of substr.
315 if (!newline_after_brace) {
316 start = substr;
317 while (start > parent_str && *start != '\n') {
318 start--;
319 }
320 cols += (int)(substr - start);
321 }
322 /* adjust the start based on the number of newlines encountered
323 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100324 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100325 if (*p == '\n') {
326 lines++;
327 }
328 }
329 }
330 }
Pablo Galindodab533d2020-06-28 01:15:28 +0100331 *p_lines = lines;
332 *p_cols = cols;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100333}
334
335
336/* Compile this expression in to an expr_ty. Add parens around the
337 expression, in order to allow leading spaces in the expression. */
338static expr_ty
339fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
340 Token *t)
341{
342 expr_ty expr = NULL;
343 char *str;
344 Py_ssize_t len;
345 const char *s;
346 expr_ty result = NULL;
347
348 assert(expr_end >= expr_start);
349 assert(*(expr_start-1) == '{');
350 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
351 *expr_end == '=');
352
353 /* If the substring is all whitespace, it's an error. We need to catch this
354 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
355 because turning the expression '' in to '()' would go from being invalid
356 to valid. */
357 for (s = expr_start; s != expr_end; s++) {
358 char c = *s;
359 /* The Python parser ignores only the following whitespace
360 characters (\r already is converted to \n). */
361 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
362 break;
363 }
364 }
365 if (s == expr_end) {
366 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
367 return NULL;
368 }
369
370 len = expr_end - expr_start;
371 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300372 str = PyMem_Malloc(len + 3);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100373 if (str == NULL) {
374 PyErr_NoMemory();
375 return NULL;
376 }
377
Pablo Galindodab533d2020-06-28 01:15:28 +0100378 // The call to fstring_find_expr_location is responsible for finding the column offset
379 // the generated AST nodes need to be shifted to the right, which is equal to the number
380 // of the f-string characters before the expression starts. In order to correctly compute
381 // this offset, strstr gets called in fstring_find_expr_location which only succeeds
382 // if curly braces appear before and after the f-string expression (exactly like they do
383 // in the f-string itself), hence the following lines.
384 str[0] = '{';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100385 memcpy(str+1, expr_start, len);
Pablo Galindodab533d2020-06-28 01:15:28 +0100386 str[len+1] = '}';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100387 str[len+2] = 0;
388
Pablo Galindodab533d2020-06-28 01:15:28 +0100389 int lines, cols;
390 fstring_find_expr_location(t, str, &lines, &cols);
391
392 // The parentheses are needed in order to allow for leading whitespace withing
393 // the f-string expression. This consequently gets parsed as a group (see the
394 // group rule in python.gram).
395 str[0] = '(';
396 str[len+1] = ')';
397
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100398 struct tok_state* tok = PyTokenizer_FromString(str, 1);
399 if (tok == NULL) {
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300400 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100401 return NULL;
402 }
Lysandros Nikolaou791a46e2020-05-26 04:24:31 +0300403 Py_INCREF(p->tok->filename);
404 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100405
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300406 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
407 NULL, p->arena);
Pablo Galindodab533d2020-06-28 01:15:28 +0100408 p2->starting_lineno = t->lineno + lines - 1;
409 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100410
411 expr = _PyPegen_run_parser(p2);
412
413 if (expr == NULL) {
414 goto exit;
415 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100416 result = expr;
417
418exit:
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300419 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100420 _PyPegen_Parser_Free(p2);
421 PyTokenizer_Free(tok);
422 return result;
423}
424
425/* Return -1 on error.
426
427 Return 0 if we reached the end of the literal.
428
429 Return 1 if we haven't reached the end of the literal, but we want
430 the caller to process the literal up to this point. Used for
431 doubled braces.
432*/
433static int
434fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300435 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100436{
437 /* Get any literal string. It ends when we hit an un-doubled left
438 brace (which isn't part of a unicode name escape such as
439 "\N{EULER CONSTANT}"), or the end of the string. */
440
441 const char *s = *str;
442 const char *literal_start = s;
443 int result = 0;
444
445 assert(*literal == NULL);
446 while (s < end) {
447 char ch = *s++;
448 if (!raw && ch == '\\' && s < end) {
449 ch = *s++;
450 if (ch == 'N') {
451 if (s < end && *s++ == '{') {
452 while (s < end && *s++ != '}') {
453 }
454 continue;
455 }
456 break;
457 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300458 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100459 return -1;
460 }
461 }
462 if (ch == '{' || ch == '}') {
463 /* Check for doubled braces, but only at the top level. If
464 we checked at every level, then f'{0:{3}}' would fail
465 with the two closing braces. */
466 if (recurse_lvl == 0) {
467 if (s < end && *s == ch) {
468 /* We're going to tell the caller that the literal ends
469 here, but that they should continue scanning. But also
470 skip over the second brace when we resume scanning. */
471 *str = s + 1;
472 result = 1;
473 goto done;
474 }
475
476 /* Where a single '{' is the start of a new expression, a
477 single '}' is not allowed. */
478 if (ch == '}') {
479 *str = s - 1;
480 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
481 return -1;
482 }
483 }
484 /* We're either at a '{', which means we're starting another
485 expression; or a '}', which means we're at the end of this
486 f-string (for a nested format_spec). */
487 s--;
488 break;
489 }
490 }
491 *str = s;
492 assert(s <= end);
493 assert(s == end || *s == '{' || *s == '}');
494done:
495 if (literal_start != s) {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100496 if (raw) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100497 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
498 s - literal_start,
499 NULL, NULL);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100500 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100501 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300502 s - literal_start, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100503 }
504 if (!*literal) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100505 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100506 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100507 }
508 return result;
509}
510
511/* Forward declaration because parsing is recursive. */
512static expr_ty
513fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
514 Token *first_token, Token* t, Token *last_token);
515
516/* Parse the f-string at *str, ending at end. We know *str starts an
517 expression (so it must be a '{'). Returns the FormattedValue node, which
518 includes the expression, conversion character, format_spec expression, and
519 optionally the text of the expression (if = is used).
520
521 Note that I don't do a perfect job here: I don't make sure that a
522 closing brace doesn't match an opening paren, for example. It
523 doesn't need to error on all invalid expressions, just correctly
524 find the end of all valid ones. Any errors inside the expression
525 will be caught when we parse it later.
526
527 *expression is set to the expression. For an '=' "debug" expression,
528 *expr_text is set to the debug text (the original text of the expression,
529 including the '=' and any whitespace around it, as a string object). If
530 not a debug expression, *expr_text set to NULL. */
531static int
532fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
533 PyObject **expr_text, expr_ty *expression, Token *first_token,
534 Token *t, Token *last_token)
535{
536 /* Return -1 on error, else 0. */
537
538 const char *expr_start;
539 const char *expr_end;
540 expr_ty simple_expression;
541 expr_ty format_spec = NULL; /* Optional format specifier. */
542 int conversion = -1; /* The conversion char. Use default if not
543 specified, or !r if using = and no format
544 spec. */
545
546 /* 0 if we're not in a string, else the quote char we're trying to
547 match (single or double quote). */
548 char quote_char = 0;
549
550 /* If we're inside a string, 1=normal, 3=triple-quoted. */
551 int string_type = 0;
552
553 /* Keep track of nesting level for braces/parens/brackets in
554 expressions. */
555 Py_ssize_t nested_depth = 0;
556 char parenstack[MAXLEVEL];
557
558 *expr_text = NULL;
559
560 /* Can only nest one level deep. */
561 if (recurse_lvl >= 2) {
562 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
563 goto error;
564 }
565
566 /* The first char must be a left brace, or we wouldn't have gotten
567 here. Skip over it. */
568 assert(**str == '{');
569 *str += 1;
570
571 expr_start = *str;
572 for (; *str < end; (*str)++) {
573 char ch;
574
575 /* Loop invariants. */
576 assert(nested_depth >= 0);
577 assert(*str >= expr_start && *str < end);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100578 if (quote_char) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100579 assert(string_type == 1 || string_type == 3);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100580 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100581 assert(string_type == 0);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100582 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100583
584 ch = **str;
585 /* Nowhere inside an expression is a backslash allowed. */
586 if (ch == '\\') {
587 /* Error: can't include a backslash character, inside
588 parens or strings or not. */
589 RAISE_SYNTAX_ERROR(
590 "f-string expression part "
591 "cannot include a backslash");
592 goto error;
593 }
594 if (quote_char) {
595 /* We're inside a string. See if we're at the end. */
596 /* This code needs to implement the same non-error logic
597 as tok_get from tokenizer.c, at the letter_quote
598 label. To actually share that code would be a
599 nightmare. But, it's unlikely to change and is small,
600 so duplicate it here. Note we don't need to catch all
601 of the errors, since they'll be caught when parsing the
602 expression. We just need to match the non-error
603 cases. Thus we can ignore \n in single-quoted strings,
604 for example. Or non-terminated strings. */
605 if (ch == quote_char) {
606 /* Does this match the string_type (single or triple
607 quoted)? */
608 if (string_type == 3) {
609 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
610 /* We're at the end of a triple quoted string. */
611 *str += 2;
612 string_type = 0;
613 quote_char = 0;
614 continue;
615 }
616 } else {
617 /* We're at the end of a normal string. */
618 quote_char = 0;
619 string_type = 0;
620 continue;
621 }
622 }
623 } else if (ch == '\'' || ch == '"') {
624 /* Is this a triple quoted string? */
625 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
626 string_type = 3;
627 *str += 2;
628 } else {
629 /* Start of a normal string. */
630 string_type = 1;
631 }
632 /* Start looking for the end of the string. */
633 quote_char = ch;
634 } else if (ch == '[' || ch == '{' || ch == '(') {
635 if (nested_depth >= MAXLEVEL) {
636 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
637 goto error;
638 }
639 parenstack[nested_depth] = ch;
640 nested_depth++;
641 } else if (ch == '#') {
642 /* Error: can't include a comment character, inside parens
643 or not. */
644 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
645 goto error;
646 } else if (nested_depth == 0 &&
647 (ch == '!' || ch == ':' || ch == '}' ||
648 ch == '=' || ch == '>' || ch == '<')) {
649 /* See if there's a next character. */
650 if (*str+1 < end) {
651 char next = *(*str+1);
652
653 /* For "!=". since '=' is not an allowed conversion character,
654 nothing is lost in this test. */
655 if ((ch == '!' && next == '=') || /* != */
656 (ch == '=' && next == '=') || /* == */
657 (ch == '<' && next == '=') || /* <= */
658 (ch == '>' && next == '=') /* >= */
659 ) {
660 *str += 1;
661 continue;
662 }
663 /* Don't get out of the loop for these, if they're single
664 chars (not part of 2-char tokens). If by themselves, they
665 don't end an expression (unlike say '!'). */
666 if (ch == '>' || ch == '<') {
667 continue;
668 }
669 }
670
671 /* Normal way out of this loop. */
672 break;
673 } else if (ch == ']' || ch == '}' || ch == ')') {
674 if (!nested_depth) {
675 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
676 goto error;
677 }
678 nested_depth--;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100679 int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100680 if (!((opening == '(' && ch == ')') ||
681 (opening == '[' && ch == ']') ||
682 (opening == '{' && ch == '}')))
683 {
684 RAISE_SYNTAX_ERROR(
685 "f-string: closing parenthesis '%c' "
686 "does not match opening parenthesis '%c'",
687 ch, opening);
688 goto error;
689 }
690 } else {
691 /* Just consume this char and loop around. */
692 }
693 }
694 expr_end = *str;
695 /* If we leave this loop in a string or with mismatched parens, we
696 don't care. We'll get a syntax error when compiling the
697 expression. But, we can produce a better error message, so
698 let's just do that.*/
699 if (quote_char) {
700 RAISE_SYNTAX_ERROR("f-string: unterminated string");
701 goto error;
702 }
703 if (nested_depth) {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100704 int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100705 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
706 goto error;
707 }
708
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100709 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100710 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100711 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100712
713 /* Compile the expression as soon as possible, so we show errors
714 related to the expression before errors related to the
715 conversion or format_spec. */
716 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100717 if (!simple_expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100718 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100719 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100720
721 /* Check for =, which puts the text value of the expression in
722 expr_text. */
723 if (**str == '=') {
Pablo Galindo9b838292020-05-27 22:01:11 +0100724 if (p->feature_version < 8) {
725 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
726 "only supported in Python 3.8 and greater");
727 goto error;
728 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100729 *str += 1;
730
731 /* Skip over ASCII whitespace. No need to test for end of string
732 here, since we know there's at least a trailing quote somewhere
733 ahead. */
734 while (Py_ISSPACE(**str)) {
735 *str += 1;
736 }
737
738 /* Set *expr_text to the text of the expression. */
739 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
740 if (!*expr_text) {
741 goto error;
742 }
743 }
744
745 /* Check for a conversion char, if present. */
746 if (**str == '!') {
747 *str += 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100748 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100749 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100750 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100751
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100752 conversion = (unsigned char)**str;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100753 *str += 1;
754
755 /* Validate the conversion. */
756 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
757 RAISE_SYNTAX_ERROR(
758 "f-string: invalid conversion character: "
759 "expected 's', 'r', or 'a'");
760 goto error;
761 }
762
763 }
764
765 /* Check for the format spec, if present. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100766 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100767 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100768 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100769 if (**str == ':') {
770 *str += 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100771 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100772 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100773 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100774
775 /* Parse the format spec. */
776 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
777 first_token, t, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100778 if (!format_spec) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100779 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100780 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100781 }
782
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100783 if (*str >= end || **str != '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100784 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100785 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100786
787 /* We're at a right brace. Consume it. */
788 assert(*str < end);
789 assert(**str == '}');
790 *str += 1;
791
792 /* If we're in = mode (detected by non-NULL expr_text), and have no format
793 spec and no explicit conversion, set the conversion to 'r'. */
794 if (*expr_text && format_spec == NULL && conversion == -1) {
795 conversion = 'r';
796 }
797
798 /* And now create the FormattedValue node that represents this
799 entire expression with the conversion and format spec. */
800 //TODO: Fix this
801 *expression = FormattedValue(simple_expression, conversion,
802 format_spec, first_token->lineno,
803 first_token->col_offset, last_token->end_lineno,
804 last_token->end_col_offset, p->arena);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100805 if (!*expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100806 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100807 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100808
809 return 0;
810
811unexpected_end_of_string:
812 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
813 /* Falls through to error. */
814
815error:
816 Py_XDECREF(*expr_text);
817 return -1;
818
819}
820
821/* Return -1 on error.
822
823 Return 0 if we have a literal (possible zero length) and an
824 expression (zero length if at the end of the string.
825
826 Return 1 if we have a literal, but no expression, and we want the
827 caller to call us again. This is used to deal with doubled
828 braces.
829
830 When called multiple times on the string 'a{{b{0}c', this function
831 will return:
832
833 1. the literal 'a{' with no expression, and a return value
834 of 1. Despite the fact that there's no expression, the return
835 value of 1 means we're not finished yet.
836
837 2. the literal 'b' and the expression '0', with a return value of
838 0. The fact that there's an expression means we're not finished.
839
840 3. literal 'c' with no expression and a return value of 0. The
841 combination of the return value of 0 with no expression means
842 we're finished.
843*/
844static int
845fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
846 int recurse_lvl, PyObject **literal,
847 PyObject **expr_text, expr_ty *expression,
848 Token *first_token, Token *t, Token *last_token)
849{
850 int result;
851
852 assert(*literal == NULL && *expression == NULL);
853
854 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300855 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100856 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100857 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100858 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100859
860 assert(result == 0 || result == 1);
861
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100862 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100863 /* We have a literal, but don't look at the expression. */
864 return 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100865 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100866
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100867 if (*str >= end || **str == '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100868 /* We're at the end of the string or the end of a nested
869 f-string: no expression. The top-level error case where we
870 expect to be at the end of the string but we're at a '}' is
871 handled later. */
872 return 0;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100873 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100874
875 /* We must now be the start of an expression, on a '{'. */
876 assert(**str == '{');
877
878 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100879 expression, first_token, t, last_token) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100880 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100881 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100882
883 return 0;
884
885error:
886 Py_CLEAR(*literal);
887 return -1;
888}
889
890#ifdef NDEBUG
891#define ExprList_check_invariants(l)
892#else
893static void
894ExprList_check_invariants(ExprList *l)
895{
896 /* Check our invariants. Make sure this object is "live", and
897 hasn't been deallocated. */
898 assert(l->size >= 0);
899 assert(l->p != NULL);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100900 if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100901 assert(l->data == l->p);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100902 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100903}
904#endif
905
906static void
907ExprList_Init(ExprList *l)
908{
909 l->allocated = EXPRLIST_N_CACHED;
910 l->size = 0;
911
912 /* Until we start allocating dynamically, p points to data. */
913 l->p = l->data;
914
915 ExprList_check_invariants(l);
916}
917
918static int
919ExprList_Append(ExprList *l, expr_ty exp)
920{
921 ExprList_check_invariants(l);
922 if (l->size >= l->allocated) {
923 /* We need to alloc (or realloc) the memory. */
924 Py_ssize_t new_size = l->allocated * 2;
925
926 /* See if we've ever allocated anything dynamically. */
927 if (l->p == l->data) {
928 Py_ssize_t i;
929 /* We're still using the cached data. Switch to
930 alloc-ing. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300931 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100932 if (!l->p) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100933 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100934 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100935 /* Copy the cached data into the new buffer. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100936 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100937 l->p[i] = l->data[i];
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100938 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100939 } else {
940 /* Just realloc. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300941 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100942 if (!tmp) {
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300943 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100944 l->p = NULL;
945 return -1;
946 }
947 l->p = tmp;
948 }
949
950 l->allocated = new_size;
951 assert(l->allocated == 2 * l->size);
952 }
953
954 l->p[l->size++] = exp;
955
956 ExprList_check_invariants(l);
957 return 0;
958}
959
960static void
961ExprList_Dealloc(ExprList *l)
962{
963 ExprList_check_invariants(l);
964
965 /* If there's been an error, or we've never dynamically allocated,
966 do nothing. */
967 if (!l->p || l->p == l->data) {
968 /* Do nothing. */
969 } else {
970 /* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300971 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100972 }
973 l->p = NULL;
974 l->size = -1;
975}
976
977static asdl_seq *
978ExprList_Finish(ExprList *l, PyArena *arena)
979{
980 asdl_seq *seq;
981
982 ExprList_check_invariants(l);
983
984 /* Allocate the asdl_seq and copy the expressions in to it. */
985 seq = _Py_asdl_seq_new(l->size, arena);
986 if (seq) {
987 Py_ssize_t i;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100988 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100989 asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100990 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100991 }
992 ExprList_Dealloc(l);
993 return seq;
994}
995
996#ifdef NDEBUG
997#define FstringParser_check_invariants(state)
998#else
999static void
1000FstringParser_check_invariants(FstringParser *state)
1001{
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001002 if (state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001003 assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001004 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001005 ExprList_check_invariants(&state->expr_list);
1006}
1007#endif
1008
1009void
1010_PyPegen_FstringParser_Init(FstringParser *state)
1011{
1012 state->last_str = NULL;
1013 state->fmode = 0;
1014 ExprList_Init(&state->expr_list);
1015 FstringParser_check_invariants(state);
1016}
1017
1018void
1019_PyPegen_FstringParser_Dealloc(FstringParser *state)
1020{
1021 FstringParser_check_invariants(state);
1022
1023 Py_XDECREF(state->last_str);
1024 ExprList_Dealloc(&state->expr_list);
1025}
1026
1027/* Make a Constant node, but decref the PyUnicode object being added. */
1028static expr_ty
1029make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1030{
1031 PyObject *s = *str;
1032 PyObject *kind = NULL;
1033 *str = NULL;
1034 assert(PyUnicode_CheckExact(s));
1035 if (PyArena_AddPyObject(p->arena, s) < 0) {
1036 Py_DECREF(s);
1037 return NULL;
1038 }
1039 const char* the_str = PyBytes_AsString(first_token->bytes);
1040 if (the_str && the_str[0] == 'u') {
1041 kind = _PyPegen_new_identifier(p, "u");
1042 }
1043
1044 if (kind == NULL && PyErr_Occurred()) {
1045 return NULL;
1046 }
1047
1048 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1049 last_token->end_lineno, last_token->end_col_offset, p->arena);
1050
1051}
1052
1053
1054/* Add a non-f-string (that is, a regular literal string). str is
1055 decref'd. */
1056int
1057_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1058{
1059 FstringParser_check_invariants(state);
1060
1061 assert(PyUnicode_CheckExact(str));
1062
1063 if (PyUnicode_GET_LENGTH(str) == 0) {
1064 Py_DECREF(str);
1065 return 0;
1066 }
1067
1068 if (!state->last_str) {
1069 /* We didn't have a string before, so just remember this one. */
1070 state->last_str = str;
1071 } else {
1072 /* Concatenate this with the previous string. */
1073 PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001074 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001075 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001076 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001077 }
1078 FstringParser_check_invariants(state);
1079 return 0;
1080}
1081
1082/* Parse an f-string. The f-string is in *str to end, with no
1083 'f' or quotes. */
1084int
1085_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1086 const char *end, int raw, int recurse_lvl,
1087 Token *first_token, Token* t, Token *last_token)
1088{
1089 FstringParser_check_invariants(state);
1090 state->fmode = 1;
1091
1092 /* Parse the f-string. */
1093 while (1) {
1094 PyObject *literal = NULL;
1095 PyObject *expr_text = NULL;
1096 expr_ty expression = NULL;
1097
1098 /* If there's a zero length literal in front of the
1099 expression, literal will be NULL. If we're at the end of
1100 the f-string, expression will be NULL (unless result == 1,
1101 see below). */
1102 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1103 &literal, &expr_text,
1104 &expression, first_token, t, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001105 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001106 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001107 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001108
1109 /* Add the literal, if any. */
1110 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1111 Py_XDECREF(expr_text);
1112 return -1;
1113 }
1114 /* Add the expr_text, if any. */
1115 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1116 return -1;
1117 }
1118
1119 /* We've dealt with the literal and expr_text, their ownership has
1120 been transferred to the state object. Don't look at them again. */
1121
1122 /* See if we should just loop around to get the next literal
1123 and expression, while ignoring the expression this
1124 time. This is used for un-doubling braces, as an
1125 optimization. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001126 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001127 continue;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001128 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001129
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001130 if (!expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001131 /* We're done with this f-string. */
1132 break;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001133 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001134
1135 /* We know we have an expression. Convert any existing string
1136 to a Constant node. */
1137 if (!state->last_str) {
1138 /* Do nothing. No previous literal. */
1139 } else {
1140 /* Convert the existing last_str literal to a Constant node. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001141 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1142 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001143 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001144 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001145 }
1146
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001147 if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001148 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001149 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001150 }
1151
1152 /* If recurse_lvl is zero, then we must be at the end of the
1153 string. Otherwise, we must be at a right brace. */
1154
1155 if (recurse_lvl == 0 && *str < end-1) {
1156 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1157 return -1;
1158 }
1159 if (recurse_lvl != 0 && **str != '}') {
1160 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1161 return -1;
1162 }
1163
1164 FstringParser_check_invariants(state);
1165 return 0;
1166}
1167
1168/* Convert the partial state reflected in last_str and expr_list to an
1169 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1170expr_ty
1171_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1172 Token *last_token)
1173{
1174 asdl_seq *seq;
1175
1176 FstringParser_check_invariants(state);
1177
1178 /* If we're just a constant string with no expressions, return
1179 that. */
1180 if (!state->fmode) {
1181 assert(!state->expr_list.size);
1182 if (!state->last_str) {
1183 /* Create a zero length string. */
1184 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001185 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001186 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001187 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001188 }
1189 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1190 }
1191
1192 /* Create a Constant node out of last_str, if needed. It will be the
1193 last node in our expression list. */
1194 if (state->last_str) {
1195 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001196 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001197 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001198 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001199 }
1200 /* This has already been freed. */
1201 assert(state->last_str == NULL);
1202
1203 seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001204 if (!seq) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001205 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001206 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001207
1208 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1209 last_token->end_lineno, last_token->end_col_offset, p->arena);
1210
1211error:
1212 _PyPegen_FstringParser_Dealloc(state);
1213 return NULL;
1214}
1215
1216/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1217 at end, parse it into an expr_ty. Return NULL on error. Adjust
1218 str to point past the parsed portion. */
1219static expr_ty
1220fstring_parse(Parser *p, const char **str, const char *end, int raw,
1221 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1222{
1223 FstringParser state;
1224
1225 _PyPegen_FstringParser_Init(&state);
1226 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1227 first_token, t, last_token) < 0) {
1228 _PyPegen_FstringParser_Dealloc(&state);
1229 return NULL;
1230 }
1231
1232 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1233}