blob: 2c35da590defbbc47df152be00b769999f32b00f [file] [log] [blame]
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -07001#include <stdbool.h>
2
Pablo Galindoc5fc1562020-04-22 23:29:27 +01003#include <Python.h>
4
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01005#include "tokenizer.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01006#include "pegen.h"
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01007#include "string_parser.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01008
9//// STRING HANDLING FUNCTIONS ////
10
Pablo Galindoc5fc1562020-04-22 23:29:27 +010011static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030012warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010013{
14 PyObject *msg =
15 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
16 if (msg == NULL) {
17 return -1;
18 }
19 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030020 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010021 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
22 /* Replace the DeprecationWarning exception with a SyntaxError
23 to get a more accurate error report */
24 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030025
26 /* This is needed, in order for the SyntaxError to point to the token t,
27 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
28 error location, if p->known_err_token is not set. */
29 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010030 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
31 }
32 Py_DECREF(msg);
33 return -1;
34 }
35 Py_DECREF(msg);
36 return 0;
37}
38
39static PyObject *
40decode_utf8(const char **sPtr, const char *end)
41{
Pablo Galindofb61c422020-06-15 14:23:43 +010042 const char *s;
43 const char *t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010044 t = s = *sPtr;
45 while (s < end && (*s & 0x80)) {
46 s++;
47 }
48 *sPtr = s;
49 return PyUnicode_DecodeUTF8(t, s - t, NULL);
50}
51
52static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030053decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010054{
Pablo Galindofb61c422020-06-15 14:23:43 +010055 PyObject *v;
56 PyObject *u;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010057 char *buf;
58 char *p;
59 const char *end;
60
61 /* check for integer overflow */
62 if (len > SIZE_MAX / 6) {
63 return NULL;
64 }
65 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
66 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
67 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
68 if (u == NULL) {
69 return NULL;
70 }
71 p = buf = PyBytes_AsString(u);
72 end = s + len;
73 while (s < end) {
74 if (*s == '\\') {
75 *p++ = *s++;
76 if (s >= end || *s & 0x80) {
77 strcpy(p, "u005c");
78 p += 5;
79 if (s >= end) {
80 break;
81 }
82 }
83 }
84 if (*s & 0x80) {
85 PyObject *w;
86 int kind;
87 void *data;
Pablo Galindofb61c422020-06-15 14:23:43 +010088 Py_ssize_t w_len;
89 Py_ssize_t i;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010090 w = decode_utf8(&s, end);
91 if (w == NULL) {
92 Py_DECREF(u);
93 return NULL;
94 }
95 kind = PyUnicode_KIND(w);
96 data = PyUnicode_DATA(w);
Pablo Galindofb61c422020-06-15 14:23:43 +010097 w_len = PyUnicode_GET_LENGTH(w);
98 for (i = 0; i < w_len; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010099 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
100 sprintf(p, "\\U%08x", chr);
101 p += 10;
102 }
103 /* Should be impossible to overflow */
104 assert(p - buf <= PyBytes_GET_SIZE(u));
105 Py_DECREF(w);
106 }
107 else {
108 *p++ = *s++;
109 }
110 }
111 len = p - buf;
112 s = buf;
113
114 const char *first_invalid_escape;
115 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
116
117 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300118 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100119 /* We have not decref u before because first_invalid_escape points
120 inside u. */
121 Py_XDECREF(u);
122 Py_DECREF(v);
123 return NULL;
124 }
125 }
126 Py_XDECREF(u);
127 return v;
128}
129
130static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300131decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100132{
133 const char *first_invalid_escape;
134 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
135 if (result == NULL) {
136 return NULL;
137 }
138
139 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300140 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100141 Py_DECREF(result);
142 return NULL;
143 }
144 }
145 return result;
146}
147
148/* s must include the bracketing quote characters, and r, b, u,
149 &/or f prefixes (if any), and embedded escape sequences (if any).
150 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
151 If the string is an f-string, set *fstr and *fstrlen to the unparsed
152 string object. Return 0 if no errors occurred. */
153int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300154_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
155 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100156{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157 const char *s = PyBytes_AsString(t->bytes);
158 if (s == NULL) {
159 return -1;
160 }
161
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100162 size_t len;
163 int quote = Py_CHARMASK(*s);
164 int fmode = 0;
165 *bytesmode = 0;
166 *rawmode = 0;
167 *result = NULL;
168 *fstr = NULL;
169 if (Py_ISALPHA(quote)) {
170 while (!*bytesmode || !*rawmode) {
171 if (quote == 'b' || quote == 'B') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100172 quote =(unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100173 *bytesmode = 1;
174 }
175 else if (quote == 'u' || quote == 'U') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100176 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100177 }
178 else if (quote == 'r' || quote == 'R') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100179 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100180 *rawmode = 1;
181 }
182 else if (quote == 'f' || quote == 'F') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100183 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100184 fmode = 1;
185 }
186 else {
187 break;
188 }
189 }
190 }
191
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300192 /* fstrings are only allowed in Python 3.6 and greater */
193 if (fmode && p->feature_version < 6) {
194 p->error_indicator = 1;
195 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
196 return -1;
197 }
198
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100199 if (fmode && *bytesmode) {
200 PyErr_BadInternalCall();
201 return -1;
202 }
203 if (quote != '\'' && quote != '\"') {
204 PyErr_BadInternalCall();
205 return -1;
206 }
207 /* Skip the leading quote char. */
208 s++;
209 len = strlen(s);
210 if (len > INT_MAX) {
211 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
212 return -1;
213 }
214 if (s[--len] != quote) {
215 /* Last quote char must match the first. */
216 PyErr_BadInternalCall();
217 return -1;
218 }
219 if (len >= 4 && s[0] == quote && s[1] == quote) {
220 /* A triple quoted string. We've already skipped one quote at
221 the start and one at the end of the string. Now skip the
222 two at the start. */
223 s += 2;
224 len -= 2;
225 /* And check that the last two match. */
226 if (s[--len] != quote || s[--len] != quote) {
227 PyErr_BadInternalCall();
228 return -1;
229 }
230 }
231
232 if (fmode) {
233 /* Just return the bytes. The caller will parse the resulting
234 string. */
235 *fstr = s;
236 *fstrlen = len;
237 return 0;
238 }
239
240 /* Not an f-string. */
241 /* Avoid invoking escape decoding routines if possible. */
242 *rawmode = *rawmode || strchr(s, '\\') == NULL;
243 if (*bytesmode) {
244 /* Disallow non-ASCII characters. */
245 const char *ch;
246 for (ch = s; *ch; ch++) {
247 if (Py_CHARMASK(*ch) >= 0x80) {
248 RAISE_SYNTAX_ERROR(
249 "bytes can only contain ASCII "
250 "literal characters.");
251 return -1;
252 }
253 }
254 if (*rawmode) {
255 *result = PyBytes_FromStringAndSize(s, len);
256 }
257 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300258 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100259 }
260 }
261 else {
262 if (*rawmode) {
263 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
264 }
265 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300266 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100267 }
268 }
269 return *result == NULL ? -1 : 0;
270}
271
272
273
274// FSTRING STUFF
275
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100276/* Fix locations for the given node and its children.
277
278 `parent` is the enclosing node.
279 `n` is the node which locations are going to be fixed relative to parent.
280 `expr_str` is the child node's string representation, including braces.
281*/
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700282static bool
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300283fstring_find_expr_location(Token *parent, char *expr_str, int *p_lines, int *p_cols)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100284{
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700285 *p_lines = 0;
286 *p_cols = 0;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100287 if (parent && parent->bytes) {
288 char *parent_str = PyBytes_AsString(parent->bytes);
289 if (!parent_str) {
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700290 return false;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100291 }
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700292 char *substr = strstr(parent_str, expr_str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100293 if (substr) {
294 // The following is needed, in order to correctly shift the column
295 // offset, in the case that (disregarding any whitespace) a newline
296 // immediately follows the opening curly brace of the fstring expression.
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700297 bool newline_after_brace = 1;
298 char *start = substr + 1;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100299 while (start && *start != '}' && *start != '\n') {
300 if (*start != ' ' && *start != '\t' && *start != '\f') {
301 newline_after_brace = 0;
302 break;
303 }
304 start++;
305 }
306
307 // Account for the characters from the last newline character to our
308 // left until the beginning of substr.
309 if (!newline_after_brace) {
310 start = substr;
311 while (start > parent_str && *start != '\n') {
312 start--;
313 }
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700314 *p_cols += (int)(substr - start);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100315 }
316 /* adjust the start based on the number of newlines encountered
317 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100318 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100319 if (*p == '\n') {
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700320 (*p_lines)++;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100321 }
322 }
323 }
324 }
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700325 return true;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100326}
327
328
329/* Compile this expression in to an expr_ty. Add parens around the
330 expression, in order to allow leading spaces in the expression. */
331static expr_ty
332fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
333 Token *t)
334{
335 expr_ty expr = NULL;
336 char *str;
337 Py_ssize_t len;
338 const char *s;
339 expr_ty result = NULL;
340
341 assert(expr_end >= expr_start);
342 assert(*(expr_start-1) == '{');
343 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
344 *expr_end == '=');
345
346 /* If the substring is all whitespace, it's an error. We need to catch this
347 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
348 because turning the expression '' in to '()' would go from being invalid
349 to valid. */
350 for (s = expr_start; s != expr_end; s++) {
351 char c = *s;
352 /* The Python parser ignores only the following whitespace
353 characters (\r already is converted to \n). */
354 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
355 break;
356 }
357 }
358 if (s == expr_end) {
359 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
360 return NULL;
361 }
362
363 len = expr_end - expr_start;
364 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300365 str = PyMem_Malloc(len + 3);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100366 if (str == NULL) {
367 PyErr_NoMemory();
368 return NULL;
369 }
370
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300371 // The call to fstring_find_expr_location is responsible for finding the column offset
372 // the generated AST nodes need to be shifted to the right, which is equal to the number
373 // of the f-string characters before the expression starts. In order to correctly compute
374 // this offset, strstr gets called in fstring_find_expr_location which only succeeds
375 // if curly braces appear before and after the f-string expression (exactly like they do
376 // in the f-string itself), hence the following lines.
377 str[0] = '{';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100378 memcpy(str+1, expr_start, len);
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300379 str[len+1] = '}';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100380 str[len+2] = 0;
381
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300382 int lines, cols;
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700383 if (!fstring_find_expr_location(t, str, &lines, &cols)) {
384 PyMem_FREE(str);
385 return NULL;
386 }
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300387
Eric V. Smith0275e042020-07-16 12:10:23 -0400388 // The parentheses are needed in order to allow for leading whitespace within
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300389 // the f-string expression. This consequently gets parsed as a group (see the
390 // group rule in python.gram).
391 str[0] = '(';
392 str[len+1] = ')';
393
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100394 struct tok_state* tok = PyTokenizer_FromString(str, 1);
395 if (tok == NULL) {
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300396 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100397 return NULL;
398 }
Lysandros Nikolaouf7b1e462020-05-26 03:32:18 +0300399 Py_INCREF(p->tok->filename);
400 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100401
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300402 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
403 NULL, p->arena);
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300404 p2->starting_lineno = t->lineno + lines - 1;
405 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100406
407 expr = _PyPegen_run_parser(p2);
408
409 if (expr == NULL) {
410 goto exit;
411 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100412 result = expr;
413
414exit:
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300415 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100416 _PyPegen_Parser_Free(p2);
417 PyTokenizer_Free(tok);
418 return result;
419}
420
421/* Return -1 on error.
422
423 Return 0 if we reached the end of the literal.
424
425 Return 1 if we haven't reached the end of the literal, but we want
426 the caller to process the literal up to this point. Used for
427 doubled braces.
428*/
429static int
430fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300431 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100432{
433 /* Get any literal string. It ends when we hit an un-doubled left
434 brace (which isn't part of a unicode name escape such as
435 "\N{EULER CONSTANT}"), or the end of the string. */
436
437 const char *s = *str;
438 const char *literal_start = s;
439 int result = 0;
440
441 assert(*literal == NULL);
442 while (s < end) {
443 char ch = *s++;
444 if (!raw && ch == '\\' && s < end) {
445 ch = *s++;
446 if (ch == 'N') {
447 if (s < end && *s++ == '{') {
448 while (s < end && *s++ != '}') {
449 }
450 continue;
451 }
452 break;
453 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300454 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100455 return -1;
456 }
457 }
458 if (ch == '{' || ch == '}') {
459 /* Check for doubled braces, but only at the top level. If
460 we checked at every level, then f'{0:{3}}' would fail
461 with the two closing braces. */
462 if (recurse_lvl == 0) {
463 if (s < end && *s == ch) {
464 /* We're going to tell the caller that the literal ends
465 here, but that they should continue scanning. But also
466 skip over the second brace when we resume scanning. */
467 *str = s + 1;
468 result = 1;
469 goto done;
470 }
471
472 /* Where a single '{' is the start of a new expression, a
473 single '}' is not allowed. */
474 if (ch == '}') {
475 *str = s - 1;
476 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
477 return -1;
478 }
479 }
480 /* We're either at a '{', which means we're starting another
481 expression; or a '}', which means we're at the end of this
482 f-string (for a nested format_spec). */
483 s--;
484 break;
485 }
486 }
487 *str = s;
488 assert(s <= end);
489 assert(s == end || *s == '{' || *s == '}');
490done:
491 if (literal_start != s) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100492 if (raw) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100493 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
494 s - literal_start,
495 NULL, NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100496 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100497 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300498 s - literal_start, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100499 }
500 if (!*literal) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100501 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100502 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100503 }
504 return result;
505}
506
507/* Forward declaration because parsing is recursive. */
508static expr_ty
509fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
510 Token *first_token, Token* t, Token *last_token);
511
512/* Parse the f-string at *str, ending at end. We know *str starts an
513 expression (so it must be a '{'). Returns the FormattedValue node, which
514 includes the expression, conversion character, format_spec expression, and
515 optionally the text of the expression (if = is used).
516
517 Note that I don't do a perfect job here: I don't make sure that a
518 closing brace doesn't match an opening paren, for example. It
519 doesn't need to error on all invalid expressions, just correctly
520 find the end of all valid ones. Any errors inside the expression
521 will be caught when we parse it later.
522
523 *expression is set to the expression. For an '=' "debug" expression,
524 *expr_text is set to the debug text (the original text of the expression,
525 including the '=' and any whitespace around it, as a string object). If
526 not a debug expression, *expr_text set to NULL. */
527static int
528fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
529 PyObject **expr_text, expr_ty *expression, Token *first_token,
530 Token *t, Token *last_token)
531{
532 /* Return -1 on error, else 0. */
533
534 const char *expr_start;
535 const char *expr_end;
536 expr_ty simple_expression;
537 expr_ty format_spec = NULL; /* Optional format specifier. */
538 int conversion = -1; /* The conversion char. Use default if not
539 specified, or !r if using = and no format
540 spec. */
541
542 /* 0 if we're not in a string, else the quote char we're trying to
543 match (single or double quote). */
544 char quote_char = 0;
545
546 /* If we're inside a string, 1=normal, 3=triple-quoted. */
547 int string_type = 0;
548
549 /* Keep track of nesting level for braces/parens/brackets in
550 expressions. */
551 Py_ssize_t nested_depth = 0;
552 char parenstack[MAXLEVEL];
553
554 *expr_text = NULL;
555
556 /* Can only nest one level deep. */
557 if (recurse_lvl >= 2) {
558 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
559 goto error;
560 }
561
562 /* The first char must be a left brace, or we wouldn't have gotten
563 here. Skip over it. */
564 assert(**str == '{');
565 *str += 1;
566
567 expr_start = *str;
568 for (; *str < end; (*str)++) {
569 char ch;
570
571 /* Loop invariants. */
572 assert(nested_depth >= 0);
573 assert(*str >= expr_start && *str < end);
Pablo Galindofb61c422020-06-15 14:23:43 +0100574 if (quote_char) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100575 assert(string_type == 1 || string_type == 3);
Pablo Galindofb61c422020-06-15 14:23:43 +0100576 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100577 assert(string_type == 0);
Pablo Galindofb61c422020-06-15 14:23:43 +0100578 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100579
580 ch = **str;
581 /* Nowhere inside an expression is a backslash allowed. */
582 if (ch == '\\') {
583 /* Error: can't include a backslash character, inside
584 parens or strings or not. */
585 RAISE_SYNTAX_ERROR(
586 "f-string expression part "
587 "cannot include a backslash");
588 goto error;
589 }
590 if (quote_char) {
591 /* We're inside a string. See if we're at the end. */
592 /* This code needs to implement the same non-error logic
593 as tok_get from tokenizer.c, at the letter_quote
594 label. To actually share that code would be a
595 nightmare. But, it's unlikely to change and is small,
596 so duplicate it here. Note we don't need to catch all
597 of the errors, since they'll be caught when parsing the
598 expression. We just need to match the non-error
599 cases. Thus we can ignore \n in single-quoted strings,
600 for example. Or non-terminated strings. */
601 if (ch == quote_char) {
602 /* Does this match the string_type (single or triple
603 quoted)? */
604 if (string_type == 3) {
605 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
606 /* We're at the end of a triple quoted string. */
607 *str += 2;
608 string_type = 0;
609 quote_char = 0;
610 continue;
611 }
612 } else {
613 /* We're at the end of a normal string. */
614 quote_char = 0;
615 string_type = 0;
616 continue;
617 }
618 }
619 } else if (ch == '\'' || ch == '"') {
620 /* Is this a triple quoted string? */
621 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
622 string_type = 3;
623 *str += 2;
624 } else {
625 /* Start of a normal string. */
626 string_type = 1;
627 }
628 /* Start looking for the end of the string. */
629 quote_char = ch;
630 } else if (ch == '[' || ch == '{' || ch == '(') {
631 if (nested_depth >= MAXLEVEL) {
632 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
633 goto error;
634 }
635 parenstack[nested_depth] = ch;
636 nested_depth++;
637 } else if (ch == '#') {
638 /* Error: can't include a comment character, inside parens
639 or not. */
640 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
641 goto error;
642 } else if (nested_depth == 0 &&
643 (ch == '!' || ch == ':' || ch == '}' ||
644 ch == '=' || ch == '>' || ch == '<')) {
645 /* See if there's a next character. */
646 if (*str+1 < end) {
647 char next = *(*str+1);
648
649 /* For "!=". since '=' is not an allowed conversion character,
650 nothing is lost in this test. */
651 if ((ch == '!' && next == '=') || /* != */
652 (ch == '=' && next == '=') || /* == */
653 (ch == '<' && next == '=') || /* <= */
654 (ch == '>' && next == '=') /* >= */
655 ) {
656 *str += 1;
657 continue;
658 }
659 /* Don't get out of the loop for these, if they're single
660 chars (not part of 2-char tokens). If by themselves, they
661 don't end an expression (unlike say '!'). */
662 if (ch == '>' || ch == '<') {
663 continue;
664 }
665 }
666
667 /* Normal way out of this loop. */
668 break;
669 } else if (ch == ']' || ch == '}' || ch == ')') {
670 if (!nested_depth) {
671 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
672 goto error;
673 }
674 nested_depth--;
Pablo Galindofb61c422020-06-15 14:23:43 +0100675 int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100676 if (!((opening == '(' && ch == ')') ||
677 (opening == '[' && ch == ']') ||
678 (opening == '{' && ch == '}')))
679 {
680 RAISE_SYNTAX_ERROR(
681 "f-string: closing parenthesis '%c' "
682 "does not match opening parenthesis '%c'",
683 ch, opening);
684 goto error;
685 }
686 } else {
687 /* Just consume this char and loop around. */
688 }
689 }
690 expr_end = *str;
691 /* If we leave this loop in a string or with mismatched parens, we
692 don't care. We'll get a syntax error when compiling the
693 expression. But, we can produce a better error message, so
694 let's just do that.*/
695 if (quote_char) {
696 RAISE_SYNTAX_ERROR("f-string: unterminated string");
697 goto error;
698 }
699 if (nested_depth) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100700 int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100701 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
702 goto error;
703 }
704
Pablo Galindofb61c422020-06-15 14:23:43 +0100705 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100706 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100707 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100708
709 /* Compile the expression as soon as possible, so we show errors
710 related to the expression before errors related to the
711 conversion or format_spec. */
712 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100713 if (!simple_expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100714 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100715 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100716
717 /* Check for =, which puts the text value of the expression in
718 expr_text. */
719 if (**str == '=') {
Shantanuc116c942020-05-27 13:30:38 -0700720 if (p->feature_version < 8) {
721 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
722 "only supported in Python 3.8 and greater");
723 goto error;
724 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100725 *str += 1;
726
727 /* Skip over ASCII whitespace. No need to test for end of string
728 here, since we know there's at least a trailing quote somewhere
729 ahead. */
730 while (Py_ISSPACE(**str)) {
731 *str += 1;
732 }
733
734 /* Set *expr_text to the text of the expression. */
735 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
736 if (!*expr_text) {
737 goto error;
738 }
739 }
740
741 /* Check for a conversion char, if present. */
742 if (**str == '!') {
743 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100744 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100745 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100746 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100747
Pablo Galindofb61c422020-06-15 14:23:43 +0100748 conversion = (unsigned char)**str;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100749 *str += 1;
750
751 /* Validate the conversion. */
752 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
753 RAISE_SYNTAX_ERROR(
754 "f-string: invalid conversion character: "
755 "expected 's', 'r', or 'a'");
756 goto error;
757 }
758
759 }
760
761 /* Check for the format spec, if present. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100762 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100763 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100764 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100765 if (**str == ':') {
766 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100767 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100768 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100769 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100770
771 /* Parse the format spec. */
772 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
773 first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +0100774 if (!format_spec) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100775 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100776 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100777 }
778
Pablo Galindofb61c422020-06-15 14:23:43 +0100779 if (*str >= end || **str != '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100780 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100781 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100782
783 /* We're at a right brace. Consume it. */
784 assert(*str < end);
785 assert(**str == '}');
786 *str += 1;
787
788 /* If we're in = mode (detected by non-NULL expr_text), and have no format
789 spec and no explicit conversion, set the conversion to 'r'. */
790 if (*expr_text && format_spec == NULL && conversion == -1) {
791 conversion = 'r';
792 }
793
794 /* And now create the FormattedValue node that represents this
795 entire expression with the conversion and format spec. */
796 //TODO: Fix this
797 *expression = FormattedValue(simple_expression, conversion,
798 format_spec, first_token->lineno,
799 first_token->col_offset, last_token->end_lineno,
800 last_token->end_col_offset, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +0100801 if (!*expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100802 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100803 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100804
805 return 0;
806
807unexpected_end_of_string:
808 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
809 /* Falls through to error. */
810
811error:
812 Py_XDECREF(*expr_text);
813 return -1;
814
815}
816
817/* Return -1 on error.
818
819 Return 0 if we have a literal (possible zero length) and an
820 expression (zero length if at the end of the string.
821
822 Return 1 if we have a literal, but no expression, and we want the
823 caller to call us again. This is used to deal with doubled
824 braces.
825
826 When called multiple times on the string 'a{{b{0}c', this function
827 will return:
828
829 1. the literal 'a{' with no expression, and a return value
830 of 1. Despite the fact that there's no expression, the return
831 value of 1 means we're not finished yet.
832
833 2. the literal 'b' and the expression '0', with a return value of
834 0. The fact that there's an expression means we're not finished.
835
836 3. literal 'c' with no expression and a return value of 0. The
837 combination of the return value of 0 with no expression means
838 we're finished.
839*/
840static int
841fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
842 int recurse_lvl, PyObject **literal,
843 PyObject **expr_text, expr_ty *expression,
844 Token *first_token, Token *t, Token *last_token)
845{
846 int result;
847
848 assert(*literal == NULL && *expression == NULL);
849
850 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300851 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100852 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100853 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100854 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100855
856 assert(result == 0 || result == 1);
857
Pablo Galindofb61c422020-06-15 14:23:43 +0100858 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100859 /* We have a literal, but don't look at the expression. */
860 return 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100861 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100862
Pablo Galindofb61c422020-06-15 14:23:43 +0100863 if (*str >= end || **str == '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100864 /* We're at the end of the string or the end of a nested
865 f-string: no expression. The top-level error case where we
866 expect to be at the end of the string but we're at a '}' is
867 handled later. */
868 return 0;
Pablo Galindofb61c422020-06-15 14:23:43 +0100869 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100870
871 /* We must now be the start of an expression, on a '{'. */
872 assert(**str == '{');
873
874 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindofb61c422020-06-15 14:23:43 +0100875 expression, first_token, t, last_token) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100876 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100877 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100878
879 return 0;
880
881error:
882 Py_CLEAR(*literal);
883 return -1;
884}
885
886#ifdef NDEBUG
887#define ExprList_check_invariants(l)
888#else
889static void
890ExprList_check_invariants(ExprList *l)
891{
892 /* Check our invariants. Make sure this object is "live", and
893 hasn't been deallocated. */
894 assert(l->size >= 0);
895 assert(l->p != NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100896 if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100897 assert(l->data == l->p);
Pablo Galindofb61c422020-06-15 14:23:43 +0100898 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100899}
900#endif
901
902static void
903ExprList_Init(ExprList *l)
904{
905 l->allocated = EXPRLIST_N_CACHED;
906 l->size = 0;
907
908 /* Until we start allocating dynamically, p points to data. */
909 l->p = l->data;
910
911 ExprList_check_invariants(l);
912}
913
914static int
915ExprList_Append(ExprList *l, expr_ty exp)
916{
917 ExprList_check_invariants(l);
918 if (l->size >= l->allocated) {
919 /* We need to alloc (or realloc) the memory. */
920 Py_ssize_t new_size = l->allocated * 2;
921
922 /* See if we've ever allocated anything dynamically. */
923 if (l->p == l->data) {
924 Py_ssize_t i;
925 /* We're still using the cached data. Switch to
926 alloc-ing. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300927 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindofb61c422020-06-15 14:23:43 +0100928 if (!l->p) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100929 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100930 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100931 /* Copy the cached data into the new buffer. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100932 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100933 l->p[i] = l->data[i];
Pablo Galindofb61c422020-06-15 14:23:43 +0100934 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100935 } else {
936 /* Just realloc. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300937 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100938 if (!tmp) {
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300939 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100940 l->p = NULL;
941 return -1;
942 }
943 l->p = tmp;
944 }
945
946 l->allocated = new_size;
947 assert(l->allocated == 2 * l->size);
948 }
949
950 l->p[l->size++] = exp;
951
952 ExprList_check_invariants(l);
953 return 0;
954}
955
956static void
957ExprList_Dealloc(ExprList *l)
958{
959 ExprList_check_invariants(l);
960
961 /* If there's been an error, or we've never dynamically allocated,
962 do nothing. */
963 if (!l->p || l->p == l->data) {
964 /* Do nothing. */
965 } else {
966 /* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300967 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100968 }
969 l->p = NULL;
970 l->size = -1;
971}
972
973static asdl_seq *
974ExprList_Finish(ExprList *l, PyArena *arena)
975{
976 asdl_seq *seq;
977
978 ExprList_check_invariants(l);
979
980 /* Allocate the asdl_seq and copy the expressions in to it. */
981 seq = _Py_asdl_seq_new(l->size, arena);
982 if (seq) {
983 Py_ssize_t i;
Pablo Galindofb61c422020-06-15 14:23:43 +0100984 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100985 asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindofb61c422020-06-15 14:23:43 +0100986 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100987 }
988 ExprList_Dealloc(l);
989 return seq;
990}
991
992#ifdef NDEBUG
993#define FstringParser_check_invariants(state)
994#else
995static void
996FstringParser_check_invariants(FstringParser *state)
997{
Pablo Galindofb61c422020-06-15 14:23:43 +0100998 if (state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100999 assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindofb61c422020-06-15 14:23:43 +01001000 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001001 ExprList_check_invariants(&state->expr_list);
1002}
1003#endif
1004
1005void
1006_PyPegen_FstringParser_Init(FstringParser *state)
1007{
1008 state->last_str = NULL;
1009 state->fmode = 0;
1010 ExprList_Init(&state->expr_list);
1011 FstringParser_check_invariants(state);
1012}
1013
1014void
1015_PyPegen_FstringParser_Dealloc(FstringParser *state)
1016{
1017 FstringParser_check_invariants(state);
1018
1019 Py_XDECREF(state->last_str);
1020 ExprList_Dealloc(&state->expr_list);
1021}
1022
1023/* Make a Constant node, but decref the PyUnicode object being added. */
1024static expr_ty
1025make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1026{
1027 PyObject *s = *str;
1028 PyObject *kind = NULL;
1029 *str = NULL;
1030 assert(PyUnicode_CheckExact(s));
1031 if (PyArena_AddPyObject(p->arena, s) < 0) {
1032 Py_DECREF(s);
1033 return NULL;
1034 }
1035 const char* the_str = PyBytes_AsString(first_token->bytes);
1036 if (the_str && the_str[0] == 'u') {
1037 kind = _PyPegen_new_identifier(p, "u");
1038 }
1039
1040 if (kind == NULL && PyErr_Occurred()) {
1041 return NULL;
1042 }
1043
1044 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1045 last_token->end_lineno, last_token->end_col_offset, p->arena);
1046
1047}
1048
1049
1050/* Add a non-f-string (that is, a regular literal string). str is
1051 decref'd. */
1052int
1053_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1054{
1055 FstringParser_check_invariants(state);
1056
1057 assert(PyUnicode_CheckExact(str));
1058
1059 if (PyUnicode_GET_LENGTH(str) == 0) {
1060 Py_DECREF(str);
1061 return 0;
1062 }
1063
1064 if (!state->last_str) {
1065 /* We didn't have a string before, so just remember this one. */
1066 state->last_str = str;
1067 } else {
1068 /* Concatenate this with the previous string. */
1069 PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindofb61c422020-06-15 14:23:43 +01001070 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001071 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001072 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001073 }
1074 FstringParser_check_invariants(state);
1075 return 0;
1076}
1077
1078/* Parse an f-string. The f-string is in *str to end, with no
1079 'f' or quotes. */
1080int
1081_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1082 const char *end, int raw, int recurse_lvl,
1083 Token *first_token, Token* t, Token *last_token)
1084{
1085 FstringParser_check_invariants(state);
1086 state->fmode = 1;
1087
1088 /* Parse the f-string. */
1089 while (1) {
1090 PyObject *literal = NULL;
1091 PyObject *expr_text = NULL;
1092 expr_ty expression = NULL;
1093
1094 /* If there's a zero length literal in front of the
1095 expression, literal will be NULL. If we're at the end of
1096 the f-string, expression will be NULL (unless result == 1,
1097 see below). */
1098 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1099 &literal, &expr_text,
1100 &expression, first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001101 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001102 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001103 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001104
1105 /* Add the literal, if any. */
1106 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1107 Py_XDECREF(expr_text);
1108 return -1;
1109 }
1110 /* Add the expr_text, if any. */
1111 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1112 return -1;
1113 }
1114
1115 /* We've dealt with the literal and expr_text, their ownership has
1116 been transferred to the state object. Don't look at them again. */
1117
1118 /* See if we should just loop around to get the next literal
1119 and expression, while ignoring the expression this
1120 time. This is used for un-doubling braces, as an
1121 optimization. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001122 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001123 continue;
Pablo Galindofb61c422020-06-15 14:23:43 +01001124 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001125
Pablo Galindofb61c422020-06-15 14:23:43 +01001126 if (!expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001127 /* We're done with this f-string. */
1128 break;
Pablo Galindofb61c422020-06-15 14:23:43 +01001129 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001130
1131 /* We know we have an expression. Convert any existing string
1132 to a Constant node. */
1133 if (!state->last_str) {
1134 /* Do nothing. No previous literal. */
1135 } else {
1136 /* Convert the existing last_str literal to a Constant node. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001137 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1138 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001139 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001140 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001141 }
1142
Pablo Galindofb61c422020-06-15 14:23:43 +01001143 if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001144 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001145 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001146 }
1147
1148 /* If recurse_lvl is zero, then we must be at the end of the
1149 string. Otherwise, we must be at a right brace. */
1150
1151 if (recurse_lvl == 0 && *str < end-1) {
1152 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1153 return -1;
1154 }
1155 if (recurse_lvl != 0 && **str != '}') {
1156 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1157 return -1;
1158 }
1159
1160 FstringParser_check_invariants(state);
1161 return 0;
1162}
1163
1164/* Convert the partial state reflected in last_str and expr_list to an
1165 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1166expr_ty
1167_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1168 Token *last_token)
1169{
1170 asdl_seq *seq;
1171
1172 FstringParser_check_invariants(state);
1173
1174 /* If we're just a constant string with no expressions, return
1175 that. */
1176 if (!state->fmode) {
1177 assert(!state->expr_list.size);
1178 if (!state->last_str) {
1179 /* Create a zero length string. */
1180 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindofb61c422020-06-15 14:23:43 +01001181 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001182 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001183 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001184 }
1185 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1186 }
1187
1188 /* Create a Constant node out of last_str, if needed. It will be the
1189 last node in our expression list. */
1190 if (state->last_str) {
1191 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001192 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001193 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001194 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001195 }
1196 /* This has already been freed. */
1197 assert(state->last_str == NULL);
1198
1199 seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +01001200 if (!seq) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001201 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001202 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001203
1204 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1205 last_token->end_lineno, last_token->end_col_offset, p->arena);
1206
1207error:
1208 _PyPegen_FstringParser_Dealloc(state);
1209 return NULL;
1210}
1211
1212/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1213 at end, parse it into an expr_ty. Return NULL on error. Adjust
1214 str to point past the parsed portion. */
1215static expr_ty
1216fstring_parse(Parser *p, const char **str, const char *end, int raw,
1217 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1218{
1219 FstringParser state;
1220
1221 _PyPegen_FstringParser_Init(&state);
1222 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1223 first_token, t, last_token) < 0) {
1224 _PyPegen_FstringParser_Dealloc(&state);
1225 return NULL;
1226 }
1227
1228 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1229}