blob: 8f6433dbcec1313745b6b0a52ee25d9dc5f499d3 [file] [log] [blame]
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -07001#include <stdbool.h>
2
Pablo Galindoc5fc1562020-04-22 23:29:27 +01003#include <Python.h>
4
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01005#include "tokenizer.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01006#include "pegen.h"
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01007#include "string_parser.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01008
//// STRING HANDLING FUNCTIONS ////
10
Pablo Galindoc5fc1562020-04-22 23:29:27 +010011static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030012warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010013{
14 PyObject *msg =
15 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
16 if (msg == NULL) {
17 return -1;
18 }
19 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030020 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010021 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
22 /* Replace the DeprecationWarning exception with a SyntaxError
23 to get a more accurate error report */
24 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030025
26 /* This is needed, in order for the SyntaxError to point to the token t,
27 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
28 error location, if p->known_err_token is not set. */
29 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010030 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
31 }
32 Py_DECREF(msg);
33 return -1;
34 }
35 Py_DECREF(msg);
36 return 0;
37}
38
39static PyObject *
40decode_utf8(const char **sPtr, const char *end)
41{
Pablo Galindofb61c422020-06-15 14:23:43 +010042 const char *s;
43 const char *t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010044 t = s = *sPtr;
45 while (s < end && (*s & 0x80)) {
46 s++;
47 }
48 *sPtr = s;
49 return PyUnicode_DecodeUTF8(t, s - t, NULL);
50}
51
52static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030053decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010054{
Pablo Galindofb61c422020-06-15 14:23:43 +010055 PyObject *v;
56 PyObject *u;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010057 char *buf;
58 char *p;
59 const char *end;
60
61 /* check for integer overflow */
62 if (len > SIZE_MAX / 6) {
63 return NULL;
64 }
65 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
66 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
67 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
68 if (u == NULL) {
69 return NULL;
70 }
71 p = buf = PyBytes_AsString(u);
Christian Heimes07f2ade2020-11-18 16:38:53 +010072 if (p == NULL) {
73 return NULL;
74 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +010075 end = s + len;
76 while (s < end) {
77 if (*s == '\\') {
78 *p++ = *s++;
79 if (s >= end || *s & 0x80) {
80 strcpy(p, "u005c");
81 p += 5;
82 if (s >= end) {
83 break;
84 }
85 }
86 }
87 if (*s & 0x80) {
88 PyObject *w;
89 int kind;
90 void *data;
Pablo Galindofb61c422020-06-15 14:23:43 +010091 Py_ssize_t w_len;
92 Py_ssize_t i;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010093 w = decode_utf8(&s, end);
94 if (w == NULL) {
95 Py_DECREF(u);
96 return NULL;
97 }
98 kind = PyUnicode_KIND(w);
99 data = PyUnicode_DATA(w);
Pablo Galindofb61c422020-06-15 14:23:43 +0100100 w_len = PyUnicode_GET_LENGTH(w);
101 for (i = 0; i < w_len; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100102 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
103 sprintf(p, "\\U%08x", chr);
104 p += 10;
105 }
106 /* Should be impossible to overflow */
107 assert(p - buf <= PyBytes_GET_SIZE(u));
108 Py_DECREF(w);
109 }
110 else {
111 *p++ = *s++;
112 }
113 }
114 len = p - buf;
115 s = buf;
116
117 const char *first_invalid_escape;
118 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
119
120 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300121 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100122 /* We have not decref u before because first_invalid_escape points
123 inside u. */
124 Py_XDECREF(u);
125 Py_DECREF(v);
126 return NULL;
127 }
128 }
129 Py_XDECREF(u);
130 return v;
131}
132
133static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300134decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100135{
136 const char *first_invalid_escape;
137 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
138 if (result == NULL) {
139 return NULL;
140 }
141
142 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300143 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100144 Py_DECREF(result);
145 return NULL;
146 }
147 }
148 return result;
149}
150
151/* s must include the bracketing quote characters, and r, b, u,
152 &/or f prefixes (if any), and embedded escape sequences (if any).
153 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
154 If the string is an f-string, set *fstr and *fstrlen to the unparsed
155 string object. Return 0 if no errors occurred. */
156int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
158 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100159{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300160 const char *s = PyBytes_AsString(t->bytes);
161 if (s == NULL) {
162 return -1;
163 }
164
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100165 size_t len;
166 int quote = Py_CHARMASK(*s);
167 int fmode = 0;
168 *bytesmode = 0;
169 *rawmode = 0;
170 *result = NULL;
171 *fstr = NULL;
172 if (Py_ISALPHA(quote)) {
173 while (!*bytesmode || !*rawmode) {
174 if (quote == 'b' || quote == 'B') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100175 quote =(unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100176 *bytesmode = 1;
177 }
178 else if (quote == 'u' || quote == 'U') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100179 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100180 }
181 else if (quote == 'r' || quote == 'R') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100182 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100183 *rawmode = 1;
184 }
185 else if (quote == 'f' || quote == 'F') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100186 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100187 fmode = 1;
188 }
189 else {
190 break;
191 }
192 }
193 }
194
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300195 /* fstrings are only allowed in Python 3.6 and greater */
196 if (fmode && p->feature_version < 6) {
197 p->error_indicator = 1;
198 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
199 return -1;
200 }
201
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100202 if (fmode && *bytesmode) {
203 PyErr_BadInternalCall();
204 return -1;
205 }
206 if (quote != '\'' && quote != '\"') {
207 PyErr_BadInternalCall();
208 return -1;
209 }
210 /* Skip the leading quote char. */
211 s++;
212 len = strlen(s);
213 if (len > INT_MAX) {
214 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
215 return -1;
216 }
217 if (s[--len] != quote) {
218 /* Last quote char must match the first. */
219 PyErr_BadInternalCall();
220 return -1;
221 }
222 if (len >= 4 && s[0] == quote && s[1] == quote) {
223 /* A triple quoted string. We've already skipped one quote at
224 the start and one at the end of the string. Now skip the
225 two at the start. */
226 s += 2;
227 len -= 2;
228 /* And check that the last two match. */
229 if (s[--len] != quote || s[--len] != quote) {
230 PyErr_BadInternalCall();
231 return -1;
232 }
233 }
234
235 if (fmode) {
236 /* Just return the bytes. The caller will parse the resulting
237 string. */
238 *fstr = s;
239 *fstrlen = len;
240 return 0;
241 }
242
243 /* Not an f-string. */
244 /* Avoid invoking escape decoding routines if possible. */
245 *rawmode = *rawmode || strchr(s, '\\') == NULL;
246 if (*bytesmode) {
247 /* Disallow non-ASCII characters. */
248 const char *ch;
249 for (ch = s; *ch; ch++) {
250 if (Py_CHARMASK(*ch) >= 0x80) {
251 RAISE_SYNTAX_ERROR(
252 "bytes can only contain ASCII "
253 "literal characters.");
254 return -1;
255 }
256 }
257 if (*rawmode) {
258 *result = PyBytes_FromStringAndSize(s, len);
259 }
260 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300261 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100262 }
263 }
264 else {
265 if (*rawmode) {
266 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
267 }
268 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300269 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100270 }
271 }
272 return *result == NULL ? -1 : 0;
273}
274
275
276
// FSTRING STUFF
278
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100279/* Fix locations for the given node and its children.
280
281 `parent` is the enclosing node.
282 `n` is the node which locations are going to be fixed relative to parent.
283 `expr_str` is the child node's string representation, including braces.
284*/
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700285static bool
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300286fstring_find_expr_location(Token *parent, char *expr_str, int *p_lines, int *p_cols)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100287{
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700288 *p_lines = 0;
289 *p_cols = 0;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100290 if (parent && parent->bytes) {
291 char *parent_str = PyBytes_AsString(parent->bytes);
292 if (!parent_str) {
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700293 return false;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100294 }
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700295 char *substr = strstr(parent_str, expr_str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100296 if (substr) {
297 // The following is needed, in order to correctly shift the column
298 // offset, in the case that (disregarding any whitespace) a newline
299 // immediately follows the opening curly brace of the fstring expression.
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700300 bool newline_after_brace = 1;
301 char *start = substr + 1;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100302 while (start && *start != '}' && *start != '\n') {
303 if (*start != ' ' && *start != '\t' && *start != '\f') {
304 newline_after_brace = 0;
305 break;
306 }
307 start++;
308 }
309
310 // Account for the characters from the last newline character to our
311 // left until the beginning of substr.
312 if (!newline_after_brace) {
313 start = substr;
314 while (start > parent_str && *start != '\n') {
315 start--;
316 }
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700317 *p_cols += (int)(substr - start);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100318 }
319 /* adjust the start based on the number of newlines encountered
320 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100321 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100322 if (*p == '\n') {
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700323 (*p_lines)++;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100324 }
325 }
326 }
327 }
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700328 return true;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100329}
330
331
332/* Compile this expression in to an expr_ty. Add parens around the
333 expression, in order to allow leading spaces in the expression. */
334static expr_ty
335fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
336 Token *t)
337{
338 expr_ty expr = NULL;
339 char *str;
340 Py_ssize_t len;
341 const char *s;
342 expr_ty result = NULL;
343
344 assert(expr_end >= expr_start);
345 assert(*(expr_start-1) == '{');
346 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
347 *expr_end == '=');
348
349 /* If the substring is all whitespace, it's an error. We need to catch this
350 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
351 because turning the expression '' in to '()' would go from being invalid
352 to valid. */
353 for (s = expr_start; s != expr_end; s++) {
354 char c = *s;
355 /* The Python parser ignores only the following whitespace
356 characters (\r already is converted to \n). */
357 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
358 break;
359 }
360 }
361 if (s == expr_end) {
362 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
363 return NULL;
364 }
365
366 len = expr_end - expr_start;
367 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300368 str = PyMem_Malloc(len + 3);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100369 if (str == NULL) {
370 PyErr_NoMemory();
371 return NULL;
372 }
373
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300374 // The call to fstring_find_expr_location is responsible for finding the column offset
375 // the generated AST nodes need to be shifted to the right, which is equal to the number
376 // of the f-string characters before the expression starts. In order to correctly compute
377 // this offset, strstr gets called in fstring_find_expr_location which only succeeds
378 // if curly braces appear before and after the f-string expression (exactly like they do
379 // in the f-string itself), hence the following lines.
380 str[0] = '{';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100381 memcpy(str+1, expr_start, len);
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300382 str[len+1] = '}';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100383 str[len+2] = 0;
384
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300385 int lines, cols;
Benjamin Peterson2ad7e9c2020-07-16 06:07:29 -0700386 if (!fstring_find_expr_location(t, str, &lines, &cols)) {
387 PyMem_FREE(str);
388 return NULL;
389 }
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300390
Eric V. Smith0275e042020-07-16 12:10:23 -0400391 // The parentheses are needed in order to allow for leading whitespace within
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300392 // the f-string expression. This consequently gets parsed as a group (see the
393 // group rule in python.gram).
394 str[0] = '(';
395 str[len+1] = ')';
396
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100397 struct tok_state* tok = PyTokenizer_FromString(str, 1);
398 if (tok == NULL) {
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300399 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100400 return NULL;
401 }
Lysandros Nikolaouf7b1e462020-05-26 03:32:18 +0300402 Py_INCREF(p->tok->filename);
403 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100404
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300405 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
406 NULL, p->arena);
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300407 p2->starting_lineno = t->lineno + lines - 1;
408 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100409
410 expr = _PyPegen_run_parser(p2);
411
412 if (expr == NULL) {
413 goto exit;
414 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100415 result = expr;
416
417exit:
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300418 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100419 _PyPegen_Parser_Free(p2);
420 PyTokenizer_Free(tok);
421 return result;
422}
423
424/* Return -1 on error.
425
426 Return 0 if we reached the end of the literal.
427
428 Return 1 if we haven't reached the end of the literal, but we want
429 the caller to process the literal up to this point. Used for
430 doubled braces.
431*/
432static int
433fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300434 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100435{
436 /* Get any literal string. It ends when we hit an un-doubled left
437 brace (which isn't part of a unicode name escape such as
438 "\N{EULER CONSTANT}"), or the end of the string. */
439
440 const char *s = *str;
441 const char *literal_start = s;
442 int result = 0;
443
444 assert(*literal == NULL);
445 while (s < end) {
446 char ch = *s++;
447 if (!raw && ch == '\\' && s < end) {
448 ch = *s++;
449 if (ch == 'N') {
450 if (s < end && *s++ == '{') {
451 while (s < end && *s++ != '}') {
452 }
453 continue;
454 }
455 break;
456 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300457 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100458 return -1;
459 }
460 }
461 if (ch == '{' || ch == '}') {
462 /* Check for doubled braces, but only at the top level. If
463 we checked at every level, then f'{0:{3}}' would fail
464 with the two closing braces. */
465 if (recurse_lvl == 0) {
466 if (s < end && *s == ch) {
467 /* We're going to tell the caller that the literal ends
468 here, but that they should continue scanning. But also
469 skip over the second brace when we resume scanning. */
470 *str = s + 1;
471 result = 1;
472 goto done;
473 }
474
475 /* Where a single '{' is the start of a new expression, a
476 single '}' is not allowed. */
477 if (ch == '}') {
478 *str = s - 1;
479 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
480 return -1;
481 }
482 }
483 /* We're either at a '{', which means we're starting another
484 expression; or a '}', which means we're at the end of this
485 f-string (for a nested format_spec). */
486 s--;
487 break;
488 }
489 }
490 *str = s;
491 assert(s <= end);
492 assert(s == end || *s == '{' || *s == '}');
493done:
494 if (literal_start != s) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100495 if (raw) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100496 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
497 s - literal_start,
498 NULL, NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100499 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100500 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300501 s - literal_start, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100502 }
503 if (!*literal) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100504 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100505 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100506 }
507 return result;
508}
509
510/* Forward declaration because parsing is recursive. */
511static expr_ty
512fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
513 Token *first_token, Token* t, Token *last_token);
514
515/* Parse the f-string at *str, ending at end. We know *str starts an
516 expression (so it must be a '{'). Returns the FormattedValue node, which
517 includes the expression, conversion character, format_spec expression, and
518 optionally the text of the expression (if = is used).
519
520 Note that I don't do a perfect job here: I don't make sure that a
521 closing brace doesn't match an opening paren, for example. It
522 doesn't need to error on all invalid expressions, just correctly
523 find the end of all valid ones. Any errors inside the expression
524 will be caught when we parse it later.
525
526 *expression is set to the expression. For an '=' "debug" expression,
527 *expr_text is set to the debug text (the original text of the expression,
528 including the '=' and any whitespace around it, as a string object). If
529 not a debug expression, *expr_text set to NULL. */
530static int
531fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
532 PyObject **expr_text, expr_ty *expression, Token *first_token,
533 Token *t, Token *last_token)
534{
535 /* Return -1 on error, else 0. */
536
537 const char *expr_start;
538 const char *expr_end;
539 expr_ty simple_expression;
540 expr_ty format_spec = NULL; /* Optional format specifier. */
541 int conversion = -1; /* The conversion char. Use default if not
542 specified, or !r if using = and no format
543 spec. */
544
545 /* 0 if we're not in a string, else the quote char we're trying to
546 match (single or double quote). */
547 char quote_char = 0;
548
549 /* If we're inside a string, 1=normal, 3=triple-quoted. */
550 int string_type = 0;
551
552 /* Keep track of nesting level for braces/parens/brackets in
553 expressions. */
554 Py_ssize_t nested_depth = 0;
555 char parenstack[MAXLEVEL];
556
557 *expr_text = NULL;
558
559 /* Can only nest one level deep. */
560 if (recurse_lvl >= 2) {
561 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
562 goto error;
563 }
564
565 /* The first char must be a left brace, or we wouldn't have gotten
566 here. Skip over it. */
567 assert(**str == '{');
568 *str += 1;
569
570 expr_start = *str;
571 for (; *str < end; (*str)++) {
572 char ch;
573
574 /* Loop invariants. */
575 assert(nested_depth >= 0);
576 assert(*str >= expr_start && *str < end);
Pablo Galindofb61c422020-06-15 14:23:43 +0100577 if (quote_char) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100578 assert(string_type == 1 || string_type == 3);
Pablo Galindofb61c422020-06-15 14:23:43 +0100579 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100580 assert(string_type == 0);
Pablo Galindofb61c422020-06-15 14:23:43 +0100581 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100582
583 ch = **str;
584 /* Nowhere inside an expression is a backslash allowed. */
585 if (ch == '\\') {
586 /* Error: can't include a backslash character, inside
587 parens or strings or not. */
588 RAISE_SYNTAX_ERROR(
589 "f-string expression part "
590 "cannot include a backslash");
591 goto error;
592 }
593 if (quote_char) {
594 /* We're inside a string. See if we're at the end. */
595 /* This code needs to implement the same non-error logic
596 as tok_get from tokenizer.c, at the letter_quote
597 label. To actually share that code would be a
598 nightmare. But, it's unlikely to change and is small,
599 so duplicate it here. Note we don't need to catch all
600 of the errors, since they'll be caught when parsing the
601 expression. We just need to match the non-error
602 cases. Thus we can ignore \n in single-quoted strings,
603 for example. Or non-terminated strings. */
604 if (ch == quote_char) {
605 /* Does this match the string_type (single or triple
606 quoted)? */
607 if (string_type == 3) {
608 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
609 /* We're at the end of a triple quoted string. */
610 *str += 2;
611 string_type = 0;
612 quote_char = 0;
613 continue;
614 }
615 } else {
616 /* We're at the end of a normal string. */
617 quote_char = 0;
618 string_type = 0;
619 continue;
620 }
621 }
622 } else if (ch == '\'' || ch == '"') {
623 /* Is this a triple quoted string? */
624 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
625 string_type = 3;
626 *str += 2;
627 } else {
628 /* Start of a normal string. */
629 string_type = 1;
630 }
631 /* Start looking for the end of the string. */
632 quote_char = ch;
633 } else if (ch == '[' || ch == '{' || ch == '(') {
634 if (nested_depth >= MAXLEVEL) {
635 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
636 goto error;
637 }
638 parenstack[nested_depth] = ch;
639 nested_depth++;
640 } else if (ch == '#') {
641 /* Error: can't include a comment character, inside parens
642 or not. */
643 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
644 goto error;
645 } else if (nested_depth == 0 &&
646 (ch == '!' || ch == ':' || ch == '}' ||
647 ch == '=' || ch == '>' || ch == '<')) {
648 /* See if there's a next character. */
649 if (*str+1 < end) {
650 char next = *(*str+1);
651
652 /* For "!=". since '=' is not an allowed conversion character,
653 nothing is lost in this test. */
654 if ((ch == '!' && next == '=') || /* != */
655 (ch == '=' && next == '=') || /* == */
656 (ch == '<' && next == '=') || /* <= */
657 (ch == '>' && next == '=') /* >= */
658 ) {
659 *str += 1;
660 continue;
661 }
662 /* Don't get out of the loop for these, if they're single
663 chars (not part of 2-char tokens). If by themselves, they
664 don't end an expression (unlike say '!'). */
665 if (ch == '>' || ch == '<') {
666 continue;
667 }
668 }
669
670 /* Normal way out of this loop. */
671 break;
672 } else if (ch == ']' || ch == '}' || ch == ')') {
673 if (!nested_depth) {
674 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
675 goto error;
676 }
677 nested_depth--;
Pablo Galindofb61c422020-06-15 14:23:43 +0100678 int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100679 if (!((opening == '(' && ch == ')') ||
680 (opening == '[' && ch == ']') ||
681 (opening == '{' && ch == '}')))
682 {
683 RAISE_SYNTAX_ERROR(
684 "f-string: closing parenthesis '%c' "
685 "does not match opening parenthesis '%c'",
686 ch, opening);
687 goto error;
688 }
689 } else {
690 /* Just consume this char and loop around. */
691 }
692 }
693 expr_end = *str;
694 /* If we leave this loop in a string or with mismatched parens, we
695 don't care. We'll get a syntax error when compiling the
696 expression. But, we can produce a better error message, so
697 let's just do that.*/
698 if (quote_char) {
699 RAISE_SYNTAX_ERROR("f-string: unterminated string");
700 goto error;
701 }
702 if (nested_depth) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100703 int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100704 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
705 goto error;
706 }
707
Pablo Galindofb61c422020-06-15 14:23:43 +0100708 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100709 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100710 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100711
712 /* Compile the expression as soon as possible, so we show errors
713 related to the expression before errors related to the
714 conversion or format_spec. */
715 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100716 if (!simple_expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100717 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100718 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100719
720 /* Check for =, which puts the text value of the expression in
721 expr_text. */
722 if (**str == '=') {
Shantanuc116c942020-05-27 13:30:38 -0700723 if (p->feature_version < 8) {
724 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
725 "only supported in Python 3.8 and greater");
726 goto error;
727 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100728 *str += 1;
729
730 /* Skip over ASCII whitespace. No need to test for end of string
731 here, since we know there's at least a trailing quote somewhere
732 ahead. */
733 while (Py_ISSPACE(**str)) {
734 *str += 1;
735 }
736
737 /* Set *expr_text to the text of the expression. */
738 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
739 if (!*expr_text) {
740 goto error;
741 }
742 }
743
744 /* Check for a conversion char, if present. */
745 if (**str == '!') {
746 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100747 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100748 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100749 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100750
Pablo Galindofb61c422020-06-15 14:23:43 +0100751 conversion = (unsigned char)**str;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100752 *str += 1;
753
754 /* Validate the conversion. */
755 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
756 RAISE_SYNTAX_ERROR(
757 "f-string: invalid conversion character: "
758 "expected 's', 'r', or 'a'");
759 goto error;
760 }
761
762 }
763
764 /* Check for the format spec, if present. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100765 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100766 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100767 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100768 if (**str == ':') {
769 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100770 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100771 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100772 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100773
774 /* Parse the format spec. */
775 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
776 first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +0100777 if (!format_spec) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100778 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100779 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100780 }
781
Pablo Galindofb61c422020-06-15 14:23:43 +0100782 if (*str >= end || **str != '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100783 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100784 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100785
786 /* We're at a right brace. Consume it. */
787 assert(*str < end);
788 assert(**str == '}');
789 *str += 1;
790
791 /* If we're in = mode (detected by non-NULL expr_text), and have no format
792 spec and no explicit conversion, set the conversion to 'r'. */
793 if (*expr_text && format_spec == NULL && conversion == -1) {
794 conversion = 'r';
795 }
796
797 /* And now create the FormattedValue node that represents this
798 entire expression with the conversion and format spec. */
799 //TODO: Fix this
800 *expression = FormattedValue(simple_expression, conversion,
801 format_spec, first_token->lineno,
802 first_token->col_offset, last_token->end_lineno,
803 last_token->end_col_offset, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +0100804 if (!*expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100805 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100806 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100807
808 return 0;
809
810unexpected_end_of_string:
811 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
812 /* Falls through to error. */
813
814error:
815 Py_XDECREF(*expr_text);
816 return -1;
817
818}
819
820/* Return -1 on error.
821
822 Return 0 if we have a literal (possible zero length) and an
823 expression (zero length if at the end of the string.
824
825 Return 1 if we have a literal, but no expression, and we want the
826 caller to call us again. This is used to deal with doubled
827 braces.
828
829 When called multiple times on the string 'a{{b{0}c', this function
830 will return:
831
832 1. the literal 'a{' with no expression, and a return value
833 of 1. Despite the fact that there's no expression, the return
834 value of 1 means we're not finished yet.
835
836 2. the literal 'b' and the expression '0', with a return value of
837 0. The fact that there's an expression means we're not finished.
838
839 3. literal 'c' with no expression and a return value of 0. The
840 combination of the return value of 0 with no expression means
841 we're finished.
842*/
843static int
844fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
845 int recurse_lvl, PyObject **literal,
846 PyObject **expr_text, expr_ty *expression,
847 Token *first_token, Token *t, Token *last_token)
848{
849 int result;
850
851 assert(*literal == NULL && *expression == NULL);
852
853 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300854 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100855 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100856 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100857 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100858
859 assert(result == 0 || result == 1);
860
Pablo Galindofb61c422020-06-15 14:23:43 +0100861 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100862 /* We have a literal, but don't look at the expression. */
863 return 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100864 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100865
Pablo Galindofb61c422020-06-15 14:23:43 +0100866 if (*str >= end || **str == '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100867 /* We're at the end of the string or the end of a nested
868 f-string: no expression. The top-level error case where we
869 expect to be at the end of the string but we're at a '}' is
870 handled later. */
871 return 0;
Pablo Galindofb61c422020-06-15 14:23:43 +0100872 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100873
874 /* We must now be the start of an expression, on a '{'. */
875 assert(**str == '{');
876
877 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindofb61c422020-06-15 14:23:43 +0100878 expression, first_token, t, last_token) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100879 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100880 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100881
882 return 0;
883
884error:
885 Py_CLEAR(*literal);
886 return -1;
887}
888
889#ifdef NDEBUG
890#define ExprList_check_invariants(l)
891#else
892static void
893ExprList_check_invariants(ExprList *l)
894{
895 /* Check our invariants. Make sure this object is "live", and
896 hasn't been deallocated. */
897 assert(l->size >= 0);
898 assert(l->p != NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100899 if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100900 assert(l->data == l->p);
Pablo Galindofb61c422020-06-15 14:23:43 +0100901 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100902}
903#endif
904
905static void
906ExprList_Init(ExprList *l)
907{
908 l->allocated = EXPRLIST_N_CACHED;
909 l->size = 0;
910
911 /* Until we start allocating dynamically, p points to data. */
912 l->p = l->data;
913
914 ExprList_check_invariants(l);
915}
916
917static int
918ExprList_Append(ExprList *l, expr_ty exp)
919{
920 ExprList_check_invariants(l);
921 if (l->size >= l->allocated) {
922 /* We need to alloc (or realloc) the memory. */
923 Py_ssize_t new_size = l->allocated * 2;
924
925 /* See if we've ever allocated anything dynamically. */
926 if (l->p == l->data) {
927 Py_ssize_t i;
928 /* We're still using the cached data. Switch to
929 alloc-ing. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300930 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindofb61c422020-06-15 14:23:43 +0100931 if (!l->p) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100932 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100933 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100934 /* Copy the cached data into the new buffer. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100935 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100936 l->p[i] = l->data[i];
Pablo Galindofb61c422020-06-15 14:23:43 +0100937 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100938 } else {
939 /* Just realloc. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300940 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100941 if (!tmp) {
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300942 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100943 l->p = NULL;
944 return -1;
945 }
946 l->p = tmp;
947 }
948
949 l->allocated = new_size;
950 assert(l->allocated == 2 * l->size);
951 }
952
953 l->p[l->size++] = exp;
954
955 ExprList_check_invariants(l);
956 return 0;
957}
958
959static void
960ExprList_Dealloc(ExprList *l)
961{
962 ExprList_check_invariants(l);
963
964 /* If there's been an error, or we've never dynamically allocated,
965 do nothing. */
966 if (!l->p || l->p == l->data) {
967 /* Do nothing. */
968 } else {
969 /* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300970 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100971 }
972 l->p = NULL;
973 l->size = -1;
974}
975
Pablo Galindoa5634c42020-09-16 19:42:00 +0100976static asdl_expr_seq *
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100977ExprList_Finish(ExprList *l, PyArena *arena)
978{
Pablo Galindoa5634c42020-09-16 19:42:00 +0100979 asdl_expr_seq *seq;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100980
981 ExprList_check_invariants(l);
982
983 /* Allocate the asdl_seq and copy the expressions in to it. */
Pablo Galindoa5634c42020-09-16 19:42:00 +0100984 seq = _Py_asdl_expr_seq_new(l->size, arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100985 if (seq) {
986 Py_ssize_t i;
Pablo Galindofb61c422020-06-15 14:23:43 +0100987 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100988 asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindofb61c422020-06-15 14:23:43 +0100989 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100990 }
991 ExprList_Dealloc(l);
992 return seq;
993}
994
995#ifdef NDEBUG
996#define FstringParser_check_invariants(state)
997#else
998static void
999FstringParser_check_invariants(FstringParser *state)
1000{
Pablo Galindofb61c422020-06-15 14:23:43 +01001001 if (state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001002 assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindofb61c422020-06-15 14:23:43 +01001003 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001004 ExprList_check_invariants(&state->expr_list);
1005}
1006#endif
1007
1008void
1009_PyPegen_FstringParser_Init(FstringParser *state)
1010{
1011 state->last_str = NULL;
1012 state->fmode = 0;
1013 ExprList_Init(&state->expr_list);
1014 FstringParser_check_invariants(state);
1015}
1016
1017void
1018_PyPegen_FstringParser_Dealloc(FstringParser *state)
1019{
1020 FstringParser_check_invariants(state);
1021
1022 Py_XDECREF(state->last_str);
1023 ExprList_Dealloc(&state->expr_list);
1024}
1025
1026/* Make a Constant node, but decref the PyUnicode object being added. */
1027static expr_ty
1028make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1029{
1030 PyObject *s = *str;
1031 PyObject *kind = NULL;
1032 *str = NULL;
1033 assert(PyUnicode_CheckExact(s));
1034 if (PyArena_AddPyObject(p->arena, s) < 0) {
1035 Py_DECREF(s);
1036 return NULL;
1037 }
1038 const char* the_str = PyBytes_AsString(first_token->bytes);
1039 if (the_str && the_str[0] == 'u') {
1040 kind = _PyPegen_new_identifier(p, "u");
1041 }
1042
1043 if (kind == NULL && PyErr_Occurred()) {
1044 return NULL;
1045 }
1046
1047 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1048 last_token->end_lineno, last_token->end_col_offset, p->arena);
1049
1050}
1051
1052
1053/* Add a non-f-string (that is, a regular literal string). str is
1054 decref'd. */
1055int
1056_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1057{
1058 FstringParser_check_invariants(state);
1059
1060 assert(PyUnicode_CheckExact(str));
1061
1062 if (PyUnicode_GET_LENGTH(str) == 0) {
1063 Py_DECREF(str);
1064 return 0;
1065 }
1066
1067 if (!state->last_str) {
1068 /* We didn't have a string before, so just remember this one. */
1069 state->last_str = str;
1070 } else {
1071 /* Concatenate this with the previous string. */
1072 PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindofb61c422020-06-15 14:23:43 +01001073 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001074 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001075 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001076 }
1077 FstringParser_check_invariants(state);
1078 return 0;
1079}
1080
1081/* Parse an f-string. The f-string is in *str to end, with no
1082 'f' or quotes. */
1083int
1084_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1085 const char *end, int raw, int recurse_lvl,
1086 Token *first_token, Token* t, Token *last_token)
1087{
1088 FstringParser_check_invariants(state);
1089 state->fmode = 1;
1090
1091 /* Parse the f-string. */
1092 while (1) {
1093 PyObject *literal = NULL;
1094 PyObject *expr_text = NULL;
1095 expr_ty expression = NULL;
1096
1097 /* If there's a zero length literal in front of the
1098 expression, literal will be NULL. If we're at the end of
1099 the f-string, expression will be NULL (unless result == 1,
1100 see below). */
1101 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1102 &literal, &expr_text,
1103 &expression, first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001104 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001105 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001106 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001107
1108 /* Add the literal, if any. */
1109 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1110 Py_XDECREF(expr_text);
1111 return -1;
1112 }
1113 /* Add the expr_text, if any. */
1114 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1115 return -1;
1116 }
1117
1118 /* We've dealt with the literal and expr_text, their ownership has
1119 been transferred to the state object. Don't look at them again. */
1120
1121 /* See if we should just loop around to get the next literal
1122 and expression, while ignoring the expression this
1123 time. This is used for un-doubling braces, as an
1124 optimization. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001125 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001126 continue;
Pablo Galindofb61c422020-06-15 14:23:43 +01001127 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001128
Pablo Galindofb61c422020-06-15 14:23:43 +01001129 if (!expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001130 /* We're done with this f-string. */
1131 break;
Pablo Galindofb61c422020-06-15 14:23:43 +01001132 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001133
1134 /* We know we have an expression. Convert any existing string
1135 to a Constant node. */
1136 if (!state->last_str) {
1137 /* Do nothing. No previous literal. */
1138 } else {
1139 /* Convert the existing last_str literal to a Constant node. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001140 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1141 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001142 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001143 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001144 }
1145
Pablo Galindofb61c422020-06-15 14:23:43 +01001146 if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001147 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001148 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001149 }
1150
1151 /* If recurse_lvl is zero, then we must be at the end of the
1152 string. Otherwise, we must be at a right brace. */
1153
1154 if (recurse_lvl == 0 && *str < end-1) {
1155 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1156 return -1;
1157 }
1158 if (recurse_lvl != 0 && **str != '}') {
1159 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1160 return -1;
1161 }
1162
1163 FstringParser_check_invariants(state);
1164 return 0;
1165}
1166
1167/* Convert the partial state reflected in last_str and expr_list to an
1168 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1169expr_ty
1170_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1171 Token *last_token)
1172{
Pablo Galindoa5634c42020-09-16 19:42:00 +01001173 asdl_expr_seq *seq;
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001174
1175 FstringParser_check_invariants(state);
1176
1177 /* If we're just a constant string with no expressions, return
1178 that. */
1179 if (!state->fmode) {
1180 assert(!state->expr_list.size);
1181 if (!state->last_str) {
1182 /* Create a zero length string. */
1183 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindofb61c422020-06-15 14:23:43 +01001184 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001185 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001186 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001187 }
1188 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1189 }
1190
1191 /* Create a Constant node out of last_str, if needed. It will be the
1192 last node in our expression list. */
1193 if (state->last_str) {
1194 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001195 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001196 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001197 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001198 }
1199 /* This has already been freed. */
1200 assert(state->last_str == NULL);
1201
1202 seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +01001203 if (!seq) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001204 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001205 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001206
1207 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1208 last_token->end_lineno, last_token->end_col_offset, p->arena);
1209
1210error:
1211 _PyPegen_FstringParser_Dealloc(state);
1212 return NULL;
1213}
1214
1215/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1216 at end, parse it into an expr_ty. Return NULL on error. Adjust
1217 str to point past the parsed portion. */
1218static expr_ty
1219fstring_parse(Parser *p, const char **str, const char *end, int raw,
1220 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1221{
1222 FstringParser state;
1223
1224 _PyPegen_FstringParser_Init(&state);
1225 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1226 first_token, t, last_token) < 0) {
1227 _PyPegen_FstringParser_Dealloc(&state);
1228 return NULL;
1229 }
1230
1231 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1232}