blob: efe82df47658bdb4ad1b8d1c5d476b6cf3fd65f8 [file] [log] [blame]
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001#include <Python.h>
2
3#include "../tokenizer.h"
4#include "pegen.h"
5#include "parse_string.h"
6
7//// STRING HANDLING FUNCTIONS ////
8
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser
// nodes to pass around, and the usage of some specialized APIs present only in
// this file (like "_PyPegen_raise_syntax_error").
13
14static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030015warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010016{
17 PyObject *msg =
18 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
19 if (msg == NULL) {
20 return -1;
21 }
22 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030023 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010024 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
25 /* Replace the DeprecationWarning exception with a SyntaxError
26 to get a more accurate error report */
27 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030028
29 /* This is needed, in order for the SyntaxError to point to the token t,
30 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
31 error location, if p->known_err_token is not set. */
32 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010033 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
34 }
35 Py_DECREF(msg);
36 return -1;
37 }
38 Py_DECREF(msg);
39 return 0;
40}
41
42static PyObject *
43decode_utf8(const char **sPtr, const char *end)
44{
45 const char *s, *t;
46 t = s = *sPtr;
47 while (s < end && (*s & 0x80)) {
48 s++;
49 }
50 *sPtr = s;
51 return PyUnicode_DecodeUTF8(t, s - t, NULL);
52}
53
54static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030055decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010056{
57 PyObject *v, *u;
58 char *buf;
59 char *p;
60 const char *end;
61
62 /* check for integer overflow */
63 if (len > SIZE_MAX / 6) {
64 return NULL;
65 }
66 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
67 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
68 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
69 if (u == NULL) {
70 return NULL;
71 }
72 p = buf = PyBytes_AsString(u);
73 end = s + len;
74 while (s < end) {
75 if (*s == '\\') {
76 *p++ = *s++;
77 if (s >= end || *s & 0x80) {
78 strcpy(p, "u005c");
79 p += 5;
80 if (s >= end) {
81 break;
82 }
83 }
84 }
85 if (*s & 0x80) {
86 PyObject *w;
87 int kind;
88 void *data;
89 Py_ssize_t len, i;
90 w = decode_utf8(&s, end);
91 if (w == NULL) {
92 Py_DECREF(u);
93 return NULL;
94 }
95 kind = PyUnicode_KIND(w);
96 data = PyUnicode_DATA(w);
97 len = PyUnicode_GET_LENGTH(w);
98 for (i = 0; i < len; i++) {
99 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
100 sprintf(p, "\\U%08x", chr);
101 p += 10;
102 }
103 /* Should be impossible to overflow */
104 assert(p - buf <= PyBytes_GET_SIZE(u));
105 Py_DECREF(w);
106 }
107 else {
108 *p++ = *s++;
109 }
110 }
111 len = p - buf;
112 s = buf;
113
114 const char *first_invalid_escape;
115 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
116
117 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300118 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100119 /* We have not decref u before because first_invalid_escape points
120 inside u. */
121 Py_XDECREF(u);
122 Py_DECREF(v);
123 return NULL;
124 }
125 }
126 Py_XDECREF(u);
127 return v;
128}
129
130static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300131decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100132{
133 const char *first_invalid_escape;
134 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
135 if (result == NULL) {
136 return NULL;
137 }
138
139 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300140 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100141 Py_DECREF(result);
142 return NULL;
143 }
144 }
145 return result;
146}
147
148/* s must include the bracketing quote characters, and r, b, u,
149 &/or f prefixes (if any), and embedded escape sequences (if any).
150 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
151 If the string is an f-string, set *fstr and *fstrlen to the unparsed
152 string object. Return 0 if no errors occurred. */
153int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300154_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
155 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100156{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157 const char *s = PyBytes_AsString(t->bytes);
158 if (s == NULL) {
159 return -1;
160 }
161
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100162 size_t len;
163 int quote = Py_CHARMASK(*s);
164 int fmode = 0;
165 *bytesmode = 0;
166 *rawmode = 0;
167 *result = NULL;
168 *fstr = NULL;
169 if (Py_ISALPHA(quote)) {
170 while (!*bytesmode || !*rawmode) {
171 if (quote == 'b' || quote == 'B') {
172 quote = *++s;
173 *bytesmode = 1;
174 }
175 else if (quote == 'u' || quote == 'U') {
176 quote = *++s;
177 }
178 else if (quote == 'r' || quote == 'R') {
179 quote = *++s;
180 *rawmode = 1;
181 }
182 else if (quote == 'f' || quote == 'F') {
183 quote = *++s;
184 fmode = 1;
185 }
186 else {
187 break;
188 }
189 }
190 }
191
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300192 /* fstrings are only allowed in Python 3.6 and greater */
193 if (fmode && p->feature_version < 6) {
194 p->error_indicator = 1;
195 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
196 return -1;
197 }
198
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100199 if (fmode && *bytesmode) {
200 PyErr_BadInternalCall();
201 return -1;
202 }
203 if (quote != '\'' && quote != '\"') {
204 PyErr_BadInternalCall();
205 return -1;
206 }
207 /* Skip the leading quote char. */
208 s++;
209 len = strlen(s);
210 if (len > INT_MAX) {
211 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
212 return -1;
213 }
214 if (s[--len] != quote) {
215 /* Last quote char must match the first. */
216 PyErr_BadInternalCall();
217 return -1;
218 }
219 if (len >= 4 && s[0] == quote && s[1] == quote) {
220 /* A triple quoted string. We've already skipped one quote at
221 the start and one at the end of the string. Now skip the
222 two at the start. */
223 s += 2;
224 len -= 2;
225 /* And check that the last two match. */
226 if (s[--len] != quote || s[--len] != quote) {
227 PyErr_BadInternalCall();
228 return -1;
229 }
230 }
231
232 if (fmode) {
233 /* Just return the bytes. The caller will parse the resulting
234 string. */
235 *fstr = s;
236 *fstrlen = len;
237 return 0;
238 }
239
240 /* Not an f-string. */
241 /* Avoid invoking escape decoding routines if possible. */
242 *rawmode = *rawmode || strchr(s, '\\') == NULL;
243 if (*bytesmode) {
244 /* Disallow non-ASCII characters. */
245 const char *ch;
246 for (ch = s; *ch; ch++) {
247 if (Py_CHARMASK(*ch) >= 0x80) {
248 RAISE_SYNTAX_ERROR(
249 "bytes can only contain ASCII "
250 "literal characters.");
251 return -1;
252 }
253 }
254 if (*rawmode) {
255 *result = PyBytes_FromStringAndSize(s, len);
256 }
257 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300258 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100259 }
260 }
261 else {
262 if (*rawmode) {
263 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
264 }
265 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300266 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100267 }
268 }
269 return *result == NULL ? -1 : 0;
270}
271
272
273
274// FSTRING STUFF
275
276static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset);
277static void fstring_shift_argument(expr_ty parent, arg_ty args, int lineno, int col_offset);
278
279
280static inline void shift_expr(expr_ty parent, expr_ty n, int line, int col) {
281 if (parent->lineno < n->lineno) {
282 col = 0;
283 }
284 fstring_shift_expr_locations(n, line, col);
285}
286
287static inline void shift_arg(expr_ty parent, arg_ty n, int line, int col) {
288 if (parent->lineno < n->lineno) {
289 col = 0;
290 }
291 fstring_shift_argument(parent, n, line, col);
292}
293
294static void fstring_shift_seq_locations(expr_ty parent, asdl_seq *seq, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100295 for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100296 expr_ty expr = asdl_seq_GET(seq, i);
297 if (expr == NULL){
298 continue;
299 }
300 shift_expr(parent, expr, lineno, col_offset);
301 }
302}
303
304static void fstring_shift_slice_locations(expr_ty parent, expr_ty slice, int lineno, int col_offset) {
305 switch (slice->kind) {
306 case Slice_kind:
307 if (slice->v.Slice.lower) {
308 shift_expr(parent, slice->v.Slice.lower, lineno, col_offset);
309 }
310 if (slice->v.Slice.upper) {
311 shift_expr(parent, slice->v.Slice.upper, lineno, col_offset);
312 }
313 if (slice->v.Slice.step) {
314 shift_expr(parent, slice->v.Slice.step, lineno, col_offset);
315 }
316 break;
317 case Tuple_kind:
318 fstring_shift_seq_locations(parent, slice->v.Tuple.elts, lineno, col_offset);
319 break;
320 default:
321 break;
322 }
323}
324
325static void fstring_shift_comprehension(expr_ty parent, comprehension_ty comp, int lineno, int col_offset) {
326 shift_expr(parent, comp->target, lineno, col_offset);
327 shift_expr(parent, comp->iter, lineno, col_offset);
328 fstring_shift_seq_locations(parent, comp->ifs, lineno, col_offset);
329}
330
331static void fstring_shift_argument(expr_ty parent, arg_ty arg, int lineno, int col_offset) {
332 if (arg->annotation != NULL){
333 shift_expr(parent, arg->annotation, lineno, col_offset);
334 }
335 arg->col_offset = arg->col_offset + col_offset;
336 arg->end_col_offset = arg->end_col_offset + col_offset;
337 arg->lineno = arg->lineno + lineno;
338 arg->end_lineno = arg->end_lineno + lineno;
339}
340
341static void fstring_shift_arguments(expr_ty parent, arguments_ty args, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100342 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->posonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100343 arg_ty arg = asdl_seq_GET(args->posonlyargs, i);
344 shift_arg(parent, arg, lineno, col_offset);
345 }
346
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100347 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->args); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100348 arg_ty arg = asdl_seq_GET(args->args, i);
349 shift_arg(parent, arg, lineno, col_offset);
350 }
351
352 if (args->vararg != NULL) {
353 shift_arg(parent, args->vararg, lineno, col_offset);
354 }
355
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100356 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->kwonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100357 arg_ty arg = asdl_seq_GET(args->kwonlyargs, i);
358 shift_arg(parent, arg, lineno, col_offset);
359 }
360
361 fstring_shift_seq_locations(parent, args->kw_defaults, lineno, col_offset);
362
363 if (args->kwarg != NULL) {
364 shift_arg(parent, args->kwarg, lineno, col_offset);
365 }
366
367 fstring_shift_seq_locations(parent, args->defaults, lineno, col_offset);
368}
369
370static void fstring_shift_children_locations(expr_ty n, int lineno, int col_offset) {
371 switch (n->kind) {
372 case BoolOp_kind:
373 fstring_shift_seq_locations(n, n->v.BoolOp.values, lineno, col_offset);
374 break;
375 case NamedExpr_kind:
376 shift_expr(n, n->v.NamedExpr.target, lineno, col_offset);
377 shift_expr(n, n->v.NamedExpr.value, lineno, col_offset);
378 break;
379 case BinOp_kind:
380 shift_expr(n, n->v.BinOp.left, lineno, col_offset);
381 shift_expr(n, n->v.BinOp.right, lineno, col_offset);
382 break;
383 case UnaryOp_kind:
384 shift_expr(n, n->v.UnaryOp.operand, lineno, col_offset);
385 break;
386 case Lambda_kind:
387 fstring_shift_arguments(n, n->v.Lambda.args, lineno, col_offset);
388 shift_expr(n, n->v.Lambda.body, lineno, col_offset);
389 break;
390 case IfExp_kind:
391 shift_expr(n, n->v.IfExp.test, lineno, col_offset);
392 shift_expr(n, n->v.IfExp.body, lineno, col_offset);
393 shift_expr(n, n->v.IfExp.orelse, lineno, col_offset);
394 break;
395 case Dict_kind:
396 fstring_shift_seq_locations(n, n->v.Dict.keys, lineno, col_offset);
397 fstring_shift_seq_locations(n, n->v.Dict.values, lineno, col_offset);
398 break;
399 case Set_kind:
400 fstring_shift_seq_locations(n, n->v.Set.elts, lineno, col_offset);
401 break;
402 case ListComp_kind:
403 shift_expr(n, n->v.ListComp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100404 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.ListComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100405 comprehension_ty comp = asdl_seq_GET(n->v.ListComp.generators, i);
406 fstring_shift_comprehension(n, comp, lineno, col_offset);
407 }
408 break;
409 case SetComp_kind:
410 shift_expr(n, n->v.SetComp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100411 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.SetComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100412 comprehension_ty comp = asdl_seq_GET(n->v.SetComp.generators, i);
413 fstring_shift_comprehension(n, comp, lineno, col_offset);
414 }
415 break;
416 case DictComp_kind:
417 shift_expr(n, n->v.DictComp.key, lineno, col_offset);
418 shift_expr(n, n->v.DictComp.value, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100419 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.DictComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100420 comprehension_ty comp = asdl_seq_GET(n->v.DictComp.generators, i);
421 fstring_shift_comprehension(n, comp, lineno, col_offset);
422 }
423 break;
424 case GeneratorExp_kind:
425 shift_expr(n, n->v.GeneratorExp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100426 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.GeneratorExp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100427 comprehension_ty comp = asdl_seq_GET(n->v.GeneratorExp.generators, i);
428 fstring_shift_comprehension(n, comp, lineno, col_offset);
429 }
430 break;
431 case Await_kind:
432 shift_expr(n, n->v.Await.value, lineno, col_offset);
433 break;
434 case Yield_kind:
435 shift_expr(n, n->v.Yield.value, lineno, col_offset);
436 break;
437 case YieldFrom_kind:
438 shift_expr(n, n->v.YieldFrom.value, lineno, col_offset);
439 break;
440 case Compare_kind:
441 shift_expr(n, n->v.Compare.left, lineno, col_offset);
442 fstring_shift_seq_locations(n, n->v.Compare.comparators, lineno, col_offset);
443 break;
444 case Call_kind:
445 shift_expr(n, n->v.Call.func, lineno, col_offset);
446 fstring_shift_seq_locations(n, n->v.Call.args, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100447 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.Call.keywords); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100448 keyword_ty keyword = asdl_seq_GET(n->v.Call.keywords, i);
449 shift_expr(n, keyword->value, lineno, col_offset);
450 }
451 break;
452 case Attribute_kind:
453 shift_expr(n, n->v.Attribute.value, lineno, col_offset);
454 break;
455 case Subscript_kind:
456 shift_expr(n, n->v.Subscript.value, lineno, col_offset);
457 fstring_shift_slice_locations(n, n->v.Subscript.slice, lineno, col_offset);
458 shift_expr(n, n->v.Subscript.slice, lineno, col_offset);
459 break;
460 case Starred_kind:
461 shift_expr(n, n->v.Starred.value, lineno, col_offset);
462 break;
463 case List_kind:
464 fstring_shift_seq_locations(n, n->v.List.elts, lineno, col_offset);
465 break;
466 case Tuple_kind:
467 fstring_shift_seq_locations(n, n->v.Tuple.elts, lineno, col_offset);
468 break;
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300469 case JoinedStr_kind:
470 fstring_shift_seq_locations(n, n->v.JoinedStr.values, lineno, col_offset);
471 break;
472 case FormattedValue_kind:
473 shift_expr(n, n->v.FormattedValue.value, lineno, col_offset);
474 if (n->v.FormattedValue.format_spec) {
475 shift_expr(n, n->v.FormattedValue.format_spec, lineno, col_offset);
476 }
477 break;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100478 default:
479 return;
480 }
481}
482
483/* Shift locations for the given node and all its children by adding `lineno`
484 and `col_offset` to existing locations. Note that n is the already parsed
485 expression. */
486static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset)
487{
488 n->col_offset = n->col_offset + col_offset;
489
490 // The following is needed, in order for nodes spanning across multiple lines
491 // to be shifted correctly. An example of such a node is a Call node, the closing
492 // parenthesis of which is not on the same line as its name.
493 if (n->lineno == n->end_lineno) {
494 n->end_col_offset = n->end_col_offset + col_offset;
495 }
496
497 fstring_shift_children_locations(n, lineno, col_offset);
498 n->lineno = n->lineno + lineno;
499 n->end_lineno = n->end_lineno + lineno;
500}
501
502/* Fix locations for the given node and its children.
503
504 `parent` is the enclosing node.
505 `n` is the node which locations are going to be fixed relative to parent.
506 `expr_str` is the child node's string representation, including braces.
507*/
508static void
509fstring_fix_expr_location(Token *parent, expr_ty n, char *expr_str)
510{
511 char *substr = NULL;
512 char *start;
513 int lines = 0;
514 int cols = 0;
515
516 if (parent && parent->bytes) {
517 char *parent_str = PyBytes_AsString(parent->bytes);
518 if (!parent_str) {
519 return;
520 }
521 substr = strstr(parent_str, expr_str);
522 if (substr) {
523 // The following is needed, in order to correctly shift the column
524 // offset, in the case that (disregarding any whitespace) a newline
525 // immediately follows the opening curly brace of the fstring expression.
526 int newline_after_brace = 1;
527 start = substr + 1;
528 while (start && *start != '}' && *start != '\n') {
529 if (*start != ' ' && *start != '\t' && *start != '\f') {
530 newline_after_brace = 0;
531 break;
532 }
533 start++;
534 }
535
536 // Account for the characters from the last newline character to our
537 // left until the beginning of substr.
538 if (!newline_after_brace) {
539 start = substr;
540 while (start > parent_str && *start != '\n') {
541 start--;
542 }
543 cols += (int)(substr - start);
544 }
545 /* adjust the start based on the number of newlines encountered
546 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100547 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100548 if (*p == '\n') {
549 lines++;
550 }
551 }
552 }
553 }
554 fstring_shift_expr_locations(n, lines, cols);
555}
556
557
558/* Compile this expression in to an expr_ty. Add parens around the
559 expression, in order to allow leading spaces in the expression. */
560static expr_ty
561fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
562 Token *t)
563{
564 expr_ty expr = NULL;
565 char *str;
566 Py_ssize_t len;
567 const char *s;
568 expr_ty result = NULL;
569
570 assert(expr_end >= expr_start);
571 assert(*(expr_start-1) == '{');
572 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
573 *expr_end == '=');
574
575 /* If the substring is all whitespace, it's an error. We need to catch this
576 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
577 because turning the expression '' in to '()' would go from being invalid
578 to valid. */
579 for (s = expr_start; s != expr_end; s++) {
580 char c = *s;
581 /* The Python parser ignores only the following whitespace
582 characters (\r already is converted to \n). */
583 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
584 break;
585 }
586 }
587 if (s == expr_end) {
588 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
589 return NULL;
590 }
591
592 len = expr_end - expr_start;
593 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
594 str = PyMem_RawMalloc(len + 3);
595 if (str == NULL) {
596 PyErr_NoMemory();
597 return NULL;
598 }
599
600 str[0] = '(';
601 memcpy(str+1, expr_start, len);
602 str[len+1] = ')';
603 str[len+2] = 0;
604
605 struct tok_state* tok = PyTokenizer_FromString(str, 1);
606 if (tok == NULL) {
Miss Islington (bot)79e6c152020-06-05 17:10:57 -0700607 PyMem_RawFree(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100608 return NULL;
609 }
Lysandros Nikolaou791a46e2020-05-26 04:24:31 +0300610 Py_INCREF(p->tok->filename);
611 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100612
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300613 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
614 NULL, p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100615 p2->starting_lineno = p->starting_lineno + p->tok->first_lineno - 1;
616 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno
617 ? p->starting_col_offset + t->col_offset : 0;
618
619 expr = _PyPegen_run_parser(p2);
620
621 if (expr == NULL) {
622 goto exit;
623 }
624
625 /* Reuse str to find the correct column offset. */
626 str[0] = '{';
627 str[len+1] = '}';
628 fstring_fix_expr_location(t, expr, str);
629
630 result = expr;
631
632exit:
Miss Islington (bot)79e6c152020-06-05 17:10:57 -0700633 PyMem_RawFree(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100634 _PyPegen_Parser_Free(p2);
635 PyTokenizer_Free(tok);
636 return result;
637}
638
639/* Return -1 on error.
640
641 Return 0 if we reached the end of the literal.
642
643 Return 1 if we haven't reached the end of the literal, but we want
644 the caller to process the literal up to this point. Used for
645 doubled braces.
646*/
647static int
648fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300649 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100650{
651 /* Get any literal string. It ends when we hit an un-doubled left
652 brace (which isn't part of a unicode name escape such as
653 "\N{EULER CONSTANT}"), or the end of the string. */
654
655 const char *s = *str;
656 const char *literal_start = s;
657 int result = 0;
658
659 assert(*literal == NULL);
660 while (s < end) {
661 char ch = *s++;
662 if (!raw && ch == '\\' && s < end) {
663 ch = *s++;
664 if (ch == 'N') {
665 if (s < end && *s++ == '{') {
666 while (s < end && *s++ != '}') {
667 }
668 continue;
669 }
670 break;
671 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300672 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100673 return -1;
674 }
675 }
676 if (ch == '{' || ch == '}') {
677 /* Check for doubled braces, but only at the top level. If
678 we checked at every level, then f'{0:{3}}' would fail
679 with the two closing braces. */
680 if (recurse_lvl == 0) {
681 if (s < end && *s == ch) {
682 /* We're going to tell the caller that the literal ends
683 here, but that they should continue scanning. But also
684 skip over the second brace when we resume scanning. */
685 *str = s + 1;
686 result = 1;
687 goto done;
688 }
689
690 /* Where a single '{' is the start of a new expression, a
691 single '}' is not allowed. */
692 if (ch == '}') {
693 *str = s - 1;
694 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
695 return -1;
696 }
697 }
698 /* We're either at a '{', which means we're starting another
699 expression; or a '}', which means we're at the end of this
700 f-string (for a nested format_spec). */
701 s--;
702 break;
703 }
704 }
705 *str = s;
706 assert(s <= end);
707 assert(s == end || *s == '{' || *s == '}');
708done:
709 if (literal_start != s) {
710 if (raw)
711 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
712 s - literal_start,
713 NULL, NULL);
714 else
715 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300716 s - literal_start, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100717 if (!*literal)
718 return -1;
719 }
720 return result;
721}
722
723/* Forward declaration because parsing is recursive. */
724static expr_ty
725fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
726 Token *first_token, Token* t, Token *last_token);
727
728/* Parse the f-string at *str, ending at end. We know *str starts an
729 expression (so it must be a '{'). Returns the FormattedValue node, which
730 includes the expression, conversion character, format_spec expression, and
731 optionally the text of the expression (if = is used).
732
733 Note that I don't do a perfect job here: I don't make sure that a
734 closing brace doesn't match an opening paren, for example. It
735 doesn't need to error on all invalid expressions, just correctly
736 find the end of all valid ones. Any errors inside the expression
737 will be caught when we parse it later.
738
739 *expression is set to the expression. For an '=' "debug" expression,
740 *expr_text is set to the debug text (the original text of the expression,
741 including the '=' and any whitespace around it, as a string object). If
742 not a debug expression, *expr_text set to NULL. */
743static int
744fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
745 PyObject **expr_text, expr_ty *expression, Token *first_token,
746 Token *t, Token *last_token)
747{
748 /* Return -1 on error, else 0. */
749
750 const char *expr_start;
751 const char *expr_end;
752 expr_ty simple_expression;
753 expr_ty format_spec = NULL; /* Optional format specifier. */
754 int conversion = -1; /* The conversion char. Use default if not
755 specified, or !r if using = and no format
756 spec. */
757
758 /* 0 if we're not in a string, else the quote char we're trying to
759 match (single or double quote). */
760 char quote_char = 0;
761
762 /* If we're inside a string, 1=normal, 3=triple-quoted. */
763 int string_type = 0;
764
765 /* Keep track of nesting level for braces/parens/brackets in
766 expressions. */
767 Py_ssize_t nested_depth = 0;
768 char parenstack[MAXLEVEL];
769
770 *expr_text = NULL;
771
772 /* Can only nest one level deep. */
773 if (recurse_lvl >= 2) {
774 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
775 goto error;
776 }
777
778 /* The first char must be a left brace, or we wouldn't have gotten
779 here. Skip over it. */
780 assert(**str == '{');
781 *str += 1;
782
783 expr_start = *str;
784 for (; *str < end; (*str)++) {
785 char ch;
786
787 /* Loop invariants. */
788 assert(nested_depth >= 0);
789 assert(*str >= expr_start && *str < end);
790 if (quote_char)
791 assert(string_type == 1 || string_type == 3);
792 else
793 assert(string_type == 0);
794
795 ch = **str;
796 /* Nowhere inside an expression is a backslash allowed. */
797 if (ch == '\\') {
798 /* Error: can't include a backslash character, inside
799 parens or strings or not. */
800 RAISE_SYNTAX_ERROR(
801 "f-string expression part "
802 "cannot include a backslash");
803 goto error;
804 }
805 if (quote_char) {
806 /* We're inside a string. See if we're at the end. */
807 /* This code needs to implement the same non-error logic
808 as tok_get from tokenizer.c, at the letter_quote
809 label. To actually share that code would be a
810 nightmare. But, it's unlikely to change and is small,
811 so duplicate it here. Note we don't need to catch all
812 of the errors, since they'll be caught when parsing the
813 expression. We just need to match the non-error
814 cases. Thus we can ignore \n in single-quoted strings,
815 for example. Or non-terminated strings. */
816 if (ch == quote_char) {
817 /* Does this match the string_type (single or triple
818 quoted)? */
819 if (string_type == 3) {
820 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
821 /* We're at the end of a triple quoted string. */
822 *str += 2;
823 string_type = 0;
824 quote_char = 0;
825 continue;
826 }
827 } else {
828 /* We're at the end of a normal string. */
829 quote_char = 0;
830 string_type = 0;
831 continue;
832 }
833 }
834 } else if (ch == '\'' || ch == '"') {
835 /* Is this a triple quoted string? */
836 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
837 string_type = 3;
838 *str += 2;
839 } else {
840 /* Start of a normal string. */
841 string_type = 1;
842 }
843 /* Start looking for the end of the string. */
844 quote_char = ch;
845 } else if (ch == '[' || ch == '{' || ch == '(') {
846 if (nested_depth >= MAXLEVEL) {
847 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
848 goto error;
849 }
850 parenstack[nested_depth] = ch;
851 nested_depth++;
852 } else if (ch == '#') {
853 /* Error: can't include a comment character, inside parens
854 or not. */
855 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
856 goto error;
857 } else if (nested_depth == 0 &&
858 (ch == '!' || ch == ':' || ch == '}' ||
859 ch == '=' || ch == '>' || ch == '<')) {
860 /* See if there's a next character. */
861 if (*str+1 < end) {
862 char next = *(*str+1);
863
864 /* For "!=". since '=' is not an allowed conversion character,
865 nothing is lost in this test. */
866 if ((ch == '!' && next == '=') || /* != */
867 (ch == '=' && next == '=') || /* == */
868 (ch == '<' && next == '=') || /* <= */
869 (ch == '>' && next == '=') /* >= */
870 ) {
871 *str += 1;
872 continue;
873 }
874 /* Don't get out of the loop for these, if they're single
875 chars (not part of 2-char tokens). If by themselves, they
876 don't end an expression (unlike say '!'). */
877 if (ch == '>' || ch == '<') {
878 continue;
879 }
880 }
881
882 /* Normal way out of this loop. */
883 break;
884 } else if (ch == ']' || ch == '}' || ch == ')') {
885 if (!nested_depth) {
886 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
887 goto error;
888 }
889 nested_depth--;
890 int opening = parenstack[nested_depth];
891 if (!((opening == '(' && ch == ')') ||
892 (opening == '[' && ch == ']') ||
893 (opening == '{' && ch == '}')))
894 {
895 RAISE_SYNTAX_ERROR(
896 "f-string: closing parenthesis '%c' "
897 "does not match opening parenthesis '%c'",
898 ch, opening);
899 goto error;
900 }
901 } else {
902 /* Just consume this char and loop around. */
903 }
904 }
905 expr_end = *str;
906 /* If we leave this loop in a string or with mismatched parens, we
907 don't care. We'll get a syntax error when compiling the
908 expression. But, we can produce a better error message, so
909 let's just do that.*/
910 if (quote_char) {
911 RAISE_SYNTAX_ERROR("f-string: unterminated string");
912 goto error;
913 }
914 if (nested_depth) {
915 int opening = parenstack[nested_depth - 1];
916 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
917 goto error;
918 }
919
920 if (*str >= end)
921 goto unexpected_end_of_string;
922
923 /* Compile the expression as soon as possible, so we show errors
924 related to the expression before errors related to the
925 conversion or format_spec. */
926 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
927 if (!simple_expression)
928 goto error;
929
930 /* Check for =, which puts the text value of the expression in
931 expr_text. */
932 if (**str == '=') {
Pablo Galindo9b838292020-05-27 22:01:11 +0100933 if (p->feature_version < 8) {
934 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
935 "only supported in Python 3.8 and greater");
936 goto error;
937 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100938 *str += 1;
939
940 /* Skip over ASCII whitespace. No need to test for end of string
941 here, since we know there's at least a trailing quote somewhere
942 ahead. */
943 while (Py_ISSPACE(**str)) {
944 *str += 1;
945 }
946
947 /* Set *expr_text to the text of the expression. */
948 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
949 if (!*expr_text) {
950 goto error;
951 }
952 }
953
954 /* Check for a conversion char, if present. */
955 if (**str == '!') {
956 *str += 1;
957 if (*str >= end)
958 goto unexpected_end_of_string;
959
960 conversion = **str;
961 *str += 1;
962
963 /* Validate the conversion. */
964 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
965 RAISE_SYNTAX_ERROR(
966 "f-string: invalid conversion character: "
967 "expected 's', 'r', or 'a'");
968 goto error;
969 }
970
971 }
972
973 /* Check for the format spec, if present. */
974 if (*str >= end)
975 goto unexpected_end_of_string;
976 if (**str == ':') {
977 *str += 1;
978 if (*str >= end)
979 goto unexpected_end_of_string;
980
981 /* Parse the format spec. */
982 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
983 first_token, t, last_token);
984 if (!format_spec)
985 goto error;
986 }
987
988 if (*str >= end || **str != '}')
989 goto unexpected_end_of_string;
990
991 /* We're at a right brace. Consume it. */
992 assert(*str < end);
993 assert(**str == '}');
994 *str += 1;
995
996 /* If we're in = mode (detected by non-NULL expr_text), and have no format
997 spec and no explicit conversion, set the conversion to 'r'. */
998 if (*expr_text && format_spec == NULL && conversion == -1) {
999 conversion = 'r';
1000 }
1001
1002 /* And now create the FormattedValue node that represents this
1003 entire expression with the conversion and format spec. */
1004 //TODO: Fix this
1005 *expression = FormattedValue(simple_expression, conversion,
1006 format_spec, first_token->lineno,
1007 first_token->col_offset, last_token->end_lineno,
1008 last_token->end_col_offset, p->arena);
1009 if (!*expression)
1010 goto error;
1011
1012 return 0;
1013
1014unexpected_end_of_string:
1015 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1016 /* Falls through to error. */
1017
1018error:
1019 Py_XDECREF(*expr_text);
1020 return -1;
1021
1022}
1023
1024/* Return -1 on error.
1025
1026 Return 0 if we have a literal (possible zero length) and an
1027 expression (zero length if at the end of the string.
1028
1029 Return 1 if we have a literal, but no expression, and we want the
1030 caller to call us again. This is used to deal with doubled
1031 braces.
1032
1033 When called multiple times on the string 'a{{b{0}c', this function
1034 will return:
1035
1036 1. the literal 'a{' with no expression, and a return value
1037 of 1. Despite the fact that there's no expression, the return
1038 value of 1 means we're not finished yet.
1039
1040 2. the literal 'b' and the expression '0', with a return value of
1041 0. The fact that there's an expression means we're not finished.
1042
1043 3. literal 'c' with no expression and a return value of 0. The
1044 combination of the return value of 0 with no expression means
1045 we're finished.
1046*/
1047static int
1048fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
1049 int recurse_lvl, PyObject **literal,
1050 PyObject **expr_text, expr_ty *expression,
1051 Token *first_token, Token *t, Token *last_token)
1052{
1053 int result;
1054
1055 assert(*literal == NULL && *expression == NULL);
1056
1057 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +03001058 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001059 if (result < 0)
1060 goto error;
1061
1062 assert(result == 0 || result == 1);
1063
1064 if (result == 1)
1065 /* We have a literal, but don't look at the expression. */
1066 return 1;
1067
1068 if (*str >= end || **str == '}')
1069 /* We're at the end of the string or the end of a nested
1070 f-string: no expression. The top-level error case where we
1071 expect to be at the end of the string but we're at a '}' is
1072 handled later. */
1073 return 0;
1074
1075 /* We must now be the start of an expression, on a '{'. */
1076 assert(**str == '{');
1077
1078 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
1079 expression, first_token, t, last_token) < 0)
1080 goto error;
1081
1082 return 0;
1083
1084error:
1085 Py_CLEAR(*literal);
1086 return -1;
1087}
1088
1089#ifdef NDEBUG
1090#define ExprList_check_invariants(l)
1091#else
1092static void
1093ExprList_check_invariants(ExprList *l)
1094{
1095 /* Check our invariants. Make sure this object is "live", and
1096 hasn't been deallocated. */
1097 assert(l->size >= 0);
1098 assert(l->p != NULL);
1099 if (l->size <= EXPRLIST_N_CACHED)
1100 assert(l->data == l->p);
1101}
1102#endif
1103
1104static void
1105ExprList_Init(ExprList *l)
1106{
1107 l->allocated = EXPRLIST_N_CACHED;
1108 l->size = 0;
1109
1110 /* Until we start allocating dynamically, p points to data. */
1111 l->p = l->data;
1112
1113 ExprList_check_invariants(l);
1114}
1115
1116static int
1117ExprList_Append(ExprList *l, expr_ty exp)
1118{
1119 ExprList_check_invariants(l);
1120 if (l->size >= l->allocated) {
1121 /* We need to alloc (or realloc) the memory. */
1122 Py_ssize_t new_size = l->allocated * 2;
1123
1124 /* See if we've ever allocated anything dynamically. */
1125 if (l->p == l->data) {
1126 Py_ssize_t i;
1127 /* We're still using the cached data. Switch to
1128 alloc-ing. */
1129 l->p = PyMem_RawMalloc(sizeof(expr_ty) * new_size);
1130 if (!l->p)
1131 return -1;
1132 /* Copy the cached data into the new buffer. */
1133 for (i = 0; i < l->size; i++)
1134 l->p[i] = l->data[i];
1135 } else {
1136 /* Just realloc. */
1137 expr_ty *tmp = PyMem_RawRealloc(l->p, sizeof(expr_ty) * new_size);
1138 if (!tmp) {
1139 PyMem_RawFree(l->p);
1140 l->p = NULL;
1141 return -1;
1142 }
1143 l->p = tmp;
1144 }
1145
1146 l->allocated = new_size;
1147 assert(l->allocated == 2 * l->size);
1148 }
1149
1150 l->p[l->size++] = exp;
1151
1152 ExprList_check_invariants(l);
1153 return 0;
1154}
1155
1156static void
1157ExprList_Dealloc(ExprList *l)
1158{
1159 ExprList_check_invariants(l);
1160
1161 /* If there's been an error, or we've never dynamically allocated,
1162 do nothing. */
1163 if (!l->p || l->p == l->data) {
1164 /* Do nothing. */
1165 } else {
1166 /* We have dynamically allocated. Free the memory. */
1167 PyMem_RawFree(l->p);
1168 }
1169 l->p = NULL;
1170 l->size = -1;
1171}
1172
1173static asdl_seq *
1174ExprList_Finish(ExprList *l, PyArena *arena)
1175{
1176 asdl_seq *seq;
1177
1178 ExprList_check_invariants(l);
1179
1180 /* Allocate the asdl_seq and copy the expressions in to it. */
1181 seq = _Py_asdl_seq_new(l->size, arena);
1182 if (seq) {
1183 Py_ssize_t i;
1184 for (i = 0; i < l->size; i++)
1185 asdl_seq_SET(seq, i, l->p[i]);
1186 }
1187 ExprList_Dealloc(l);
1188 return seq;
1189}
1190
1191#ifdef NDEBUG
1192#define FstringParser_check_invariants(state)
1193#else
1194static void
1195FstringParser_check_invariants(FstringParser *state)
1196{
1197 if (state->last_str)
1198 assert(PyUnicode_CheckExact(state->last_str));
1199 ExprList_check_invariants(&state->expr_list);
1200}
1201#endif
1202
1203void
1204_PyPegen_FstringParser_Init(FstringParser *state)
1205{
1206 state->last_str = NULL;
1207 state->fmode = 0;
1208 ExprList_Init(&state->expr_list);
1209 FstringParser_check_invariants(state);
1210}
1211
1212void
1213_PyPegen_FstringParser_Dealloc(FstringParser *state)
1214{
1215 FstringParser_check_invariants(state);
1216
1217 Py_XDECREF(state->last_str);
1218 ExprList_Dealloc(&state->expr_list);
1219}
1220
1221/* Make a Constant node, but decref the PyUnicode object being added. */
1222static expr_ty
1223make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1224{
1225 PyObject *s = *str;
1226 PyObject *kind = NULL;
1227 *str = NULL;
1228 assert(PyUnicode_CheckExact(s));
1229 if (PyArena_AddPyObject(p->arena, s) < 0) {
1230 Py_DECREF(s);
1231 return NULL;
1232 }
1233 const char* the_str = PyBytes_AsString(first_token->bytes);
1234 if (the_str && the_str[0] == 'u') {
1235 kind = _PyPegen_new_identifier(p, "u");
1236 }
1237
1238 if (kind == NULL && PyErr_Occurred()) {
1239 return NULL;
1240 }
1241
1242 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1243 last_token->end_lineno, last_token->end_col_offset, p->arena);
1244
1245}
1246
1247
1248/* Add a non-f-string (that is, a regular literal string). str is
1249 decref'd. */
1250int
1251_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1252{
1253 FstringParser_check_invariants(state);
1254
1255 assert(PyUnicode_CheckExact(str));
1256
1257 if (PyUnicode_GET_LENGTH(str) == 0) {
1258 Py_DECREF(str);
1259 return 0;
1260 }
1261
1262 if (!state->last_str) {
1263 /* We didn't have a string before, so just remember this one. */
1264 state->last_str = str;
1265 } else {
1266 /* Concatenate this with the previous string. */
1267 PyUnicode_AppendAndDel(&state->last_str, str);
1268 if (!state->last_str)
1269 return -1;
1270 }
1271 FstringParser_check_invariants(state);
1272 return 0;
1273}
1274
1275/* Parse an f-string. The f-string is in *str to end, with no
1276 'f' or quotes. */
1277int
1278_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1279 const char *end, int raw, int recurse_lvl,
1280 Token *first_token, Token* t, Token *last_token)
1281{
1282 FstringParser_check_invariants(state);
1283 state->fmode = 1;
1284
1285 /* Parse the f-string. */
1286 while (1) {
1287 PyObject *literal = NULL;
1288 PyObject *expr_text = NULL;
1289 expr_ty expression = NULL;
1290
1291 /* If there's a zero length literal in front of the
1292 expression, literal will be NULL. If we're at the end of
1293 the f-string, expression will be NULL (unless result == 1,
1294 see below). */
1295 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1296 &literal, &expr_text,
1297 &expression, first_token, t, last_token);
1298 if (result < 0)
1299 return -1;
1300
1301 /* Add the literal, if any. */
1302 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1303 Py_XDECREF(expr_text);
1304 return -1;
1305 }
1306 /* Add the expr_text, if any. */
1307 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1308 return -1;
1309 }
1310
1311 /* We've dealt with the literal and expr_text, their ownership has
1312 been transferred to the state object. Don't look at them again. */
1313
1314 /* See if we should just loop around to get the next literal
1315 and expression, while ignoring the expression this
1316 time. This is used for un-doubling braces, as an
1317 optimization. */
1318 if (result == 1)
1319 continue;
1320
1321 if (!expression)
1322 /* We're done with this f-string. */
1323 break;
1324
1325 /* We know we have an expression. Convert any existing string
1326 to a Constant node. */
1327 if (!state->last_str) {
1328 /* Do nothing. No previous literal. */
1329 } else {
1330 /* Convert the existing last_str literal to a Constant node. */
1331 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1332 if (!str || ExprList_Append(&state->expr_list, str) < 0)
1333 return -1;
1334 }
1335
1336 if (ExprList_Append(&state->expr_list, expression) < 0)
1337 return -1;
1338 }
1339
1340 /* If recurse_lvl is zero, then we must be at the end of the
1341 string. Otherwise, we must be at a right brace. */
1342
1343 if (recurse_lvl == 0 && *str < end-1) {
1344 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1345 return -1;
1346 }
1347 if (recurse_lvl != 0 && **str != '}') {
1348 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1349 return -1;
1350 }
1351
1352 FstringParser_check_invariants(state);
1353 return 0;
1354}
1355
1356/* Convert the partial state reflected in last_str and expr_list to an
1357 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1358expr_ty
1359_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1360 Token *last_token)
1361{
1362 asdl_seq *seq;
1363
1364 FstringParser_check_invariants(state);
1365
1366 /* If we're just a constant string with no expressions, return
1367 that. */
1368 if (!state->fmode) {
1369 assert(!state->expr_list.size);
1370 if (!state->last_str) {
1371 /* Create a zero length string. */
1372 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1373 if (!state->last_str)
1374 goto error;
1375 }
1376 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1377 }
1378
1379 /* Create a Constant node out of last_str, if needed. It will be the
1380 last node in our expression list. */
1381 if (state->last_str) {
1382 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1383 if (!str || ExprList_Append(&state->expr_list, str) < 0)
1384 goto error;
1385 }
1386 /* This has already been freed. */
1387 assert(state->last_str == NULL);
1388
1389 seq = ExprList_Finish(&state->expr_list, p->arena);
1390 if (!seq)
1391 goto error;
1392
1393 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1394 last_token->end_lineno, last_token->end_col_offset, p->arena);
1395
1396error:
1397 _PyPegen_FstringParser_Dealloc(state);
1398 return NULL;
1399}
1400
1401/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1402 at end, parse it into an expr_ty. Return NULL on error. Adjust
1403 str to point past the parsed portion. */
1404static expr_ty
1405fstring_parse(Parser *p, const char **str, const char *end, int raw,
1406 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1407{
1408 FstringParser state;
1409
1410 _PyPegen_FstringParser_Init(&state);
1411 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1412 first_token, t, last_token) < 0) {
1413 _PyPegen_FstringParser_Dealloc(&state);
1414 return NULL;
1415 }
1416
1417 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1418}