blob: e24ecc58d3aa1dc3acf7f05423ad3dd1720604ff [file] [log] [blame]
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001#include <Python.h>
2
3#include "../tokenizer.h"
4#include "pegen.h"
5#include "parse_string.h"
6
7//// STRING HANDLING FUNCTIONS ////
8
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser nodes
// to pass around, and the usage of some specialized APIs present only in this
// file (like "_PyPegen_raise_syntax_error").
13
14static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030015warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010016{
17 PyObject *msg =
18 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
19 if (msg == NULL) {
20 return -1;
21 }
22 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030023 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010024 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
25 /* Replace the DeprecationWarning exception with a SyntaxError
26 to get a more accurate error report */
27 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030028
29 /* This is needed, in order for the SyntaxError to point to the token t,
30 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
31 error location, if p->known_err_token is not set. */
32 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010033 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
34 }
35 Py_DECREF(msg);
36 return -1;
37 }
38 Py_DECREF(msg);
39 return 0;
40}
41
42static PyObject *
43decode_utf8(const char **sPtr, const char *end)
44{
45 const char *s, *t;
46 t = s = *sPtr;
47 while (s < end && (*s & 0x80)) {
48 s++;
49 }
50 *sPtr = s;
51 return PyUnicode_DecodeUTF8(t, s - t, NULL);
52}
53
54static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030055decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010056{
57 PyObject *v, *u;
58 char *buf;
59 char *p;
60 const char *end;
61
62 /* check for integer overflow */
63 if (len > SIZE_MAX / 6) {
64 return NULL;
65 }
66 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
67 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
68 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
69 if (u == NULL) {
70 return NULL;
71 }
72 p = buf = PyBytes_AsString(u);
73 end = s + len;
74 while (s < end) {
75 if (*s == '\\') {
76 *p++ = *s++;
77 if (s >= end || *s & 0x80) {
78 strcpy(p, "u005c");
79 p += 5;
80 if (s >= end) {
81 break;
82 }
83 }
84 }
85 if (*s & 0x80) {
86 PyObject *w;
87 int kind;
88 void *data;
89 Py_ssize_t len, i;
90 w = decode_utf8(&s, end);
91 if (w == NULL) {
92 Py_DECREF(u);
93 return NULL;
94 }
95 kind = PyUnicode_KIND(w);
96 data = PyUnicode_DATA(w);
97 len = PyUnicode_GET_LENGTH(w);
98 for (i = 0; i < len; i++) {
99 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
100 sprintf(p, "\\U%08x", chr);
101 p += 10;
102 }
103 /* Should be impossible to overflow */
104 assert(p - buf <= PyBytes_GET_SIZE(u));
105 Py_DECREF(w);
106 }
107 else {
108 *p++ = *s++;
109 }
110 }
111 len = p - buf;
112 s = buf;
113
114 const char *first_invalid_escape;
115 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
116
117 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300118 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100119 /* We have not decref u before because first_invalid_escape points
120 inside u. */
121 Py_XDECREF(u);
122 Py_DECREF(v);
123 return NULL;
124 }
125 }
126 Py_XDECREF(u);
127 return v;
128}
129
130static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300131decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100132{
133 const char *first_invalid_escape;
134 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
135 if (result == NULL) {
136 return NULL;
137 }
138
139 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300140 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100141 Py_DECREF(result);
142 return NULL;
143 }
144 }
145 return result;
146}
147
148/* s must include the bracketing quote characters, and r, b, u,
149 &/or f prefixes (if any), and embedded escape sequences (if any).
150 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
151 If the string is an f-string, set *fstr and *fstrlen to the unparsed
152 string object. Return 0 if no errors occurred. */
153int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300154_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
155 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100156{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157 const char *s = PyBytes_AsString(t->bytes);
158 if (s == NULL) {
159 return -1;
160 }
161
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100162 size_t len;
163 int quote = Py_CHARMASK(*s);
164 int fmode = 0;
165 *bytesmode = 0;
166 *rawmode = 0;
167 *result = NULL;
168 *fstr = NULL;
169 if (Py_ISALPHA(quote)) {
170 while (!*bytesmode || !*rawmode) {
171 if (quote == 'b' || quote == 'B') {
172 quote = *++s;
173 *bytesmode = 1;
174 }
175 else if (quote == 'u' || quote == 'U') {
176 quote = *++s;
177 }
178 else if (quote == 'r' || quote == 'R') {
179 quote = *++s;
180 *rawmode = 1;
181 }
182 else if (quote == 'f' || quote == 'F') {
183 quote = *++s;
184 fmode = 1;
185 }
186 else {
187 break;
188 }
189 }
190 }
191
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300192 /* fstrings are only allowed in Python 3.6 and greater */
193 if (fmode && p->feature_version < 6) {
194 p->error_indicator = 1;
195 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
196 return -1;
197 }
198
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100199 if (fmode && *bytesmode) {
200 PyErr_BadInternalCall();
201 return -1;
202 }
203 if (quote != '\'' && quote != '\"') {
204 PyErr_BadInternalCall();
205 return -1;
206 }
207 /* Skip the leading quote char. */
208 s++;
209 len = strlen(s);
210 if (len > INT_MAX) {
211 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
212 return -1;
213 }
214 if (s[--len] != quote) {
215 /* Last quote char must match the first. */
216 PyErr_BadInternalCall();
217 return -1;
218 }
219 if (len >= 4 && s[0] == quote && s[1] == quote) {
220 /* A triple quoted string. We've already skipped one quote at
221 the start and one at the end of the string. Now skip the
222 two at the start. */
223 s += 2;
224 len -= 2;
225 /* And check that the last two match. */
226 if (s[--len] != quote || s[--len] != quote) {
227 PyErr_BadInternalCall();
228 return -1;
229 }
230 }
231
232 if (fmode) {
233 /* Just return the bytes. The caller will parse the resulting
234 string. */
235 *fstr = s;
236 *fstrlen = len;
237 return 0;
238 }
239
240 /* Not an f-string. */
241 /* Avoid invoking escape decoding routines if possible. */
242 *rawmode = *rawmode || strchr(s, '\\') == NULL;
243 if (*bytesmode) {
244 /* Disallow non-ASCII characters. */
245 const char *ch;
246 for (ch = s; *ch; ch++) {
247 if (Py_CHARMASK(*ch) >= 0x80) {
248 RAISE_SYNTAX_ERROR(
249 "bytes can only contain ASCII "
250 "literal characters.");
251 return -1;
252 }
253 }
254 if (*rawmode) {
255 *result = PyBytes_FromStringAndSize(s, len);
256 }
257 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300258 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100259 }
260 }
261 else {
262 if (*rawmode) {
263 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
264 }
265 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300266 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100267 }
268 }
269 return *result == NULL ? -1 : 0;
270}
271
272
273
274// FSTRING STUFF
275
276static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset);
277static void fstring_shift_argument(expr_ty parent, arg_ty args, int lineno, int col_offset);
278
279
280static inline void shift_expr(expr_ty parent, expr_ty n, int line, int col) {
281 if (parent->lineno < n->lineno) {
282 col = 0;
283 }
284 fstring_shift_expr_locations(n, line, col);
285}
286
287static inline void shift_arg(expr_ty parent, arg_ty n, int line, int col) {
288 if (parent->lineno < n->lineno) {
289 col = 0;
290 }
291 fstring_shift_argument(parent, n, line, col);
292}
293
294static void fstring_shift_seq_locations(expr_ty parent, asdl_seq *seq, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100295 for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100296 expr_ty expr = asdl_seq_GET(seq, i);
297 if (expr == NULL){
298 continue;
299 }
300 shift_expr(parent, expr, lineno, col_offset);
301 }
302}
303
304static void fstring_shift_slice_locations(expr_ty parent, expr_ty slice, int lineno, int col_offset) {
305 switch (slice->kind) {
306 case Slice_kind:
307 if (slice->v.Slice.lower) {
308 shift_expr(parent, slice->v.Slice.lower, lineno, col_offset);
309 }
310 if (slice->v.Slice.upper) {
311 shift_expr(parent, slice->v.Slice.upper, lineno, col_offset);
312 }
313 if (slice->v.Slice.step) {
314 shift_expr(parent, slice->v.Slice.step, lineno, col_offset);
315 }
316 break;
317 case Tuple_kind:
318 fstring_shift_seq_locations(parent, slice->v.Tuple.elts, lineno, col_offset);
319 break;
320 default:
321 break;
322 }
323}
324
325static void fstring_shift_comprehension(expr_ty parent, comprehension_ty comp, int lineno, int col_offset) {
326 shift_expr(parent, comp->target, lineno, col_offset);
327 shift_expr(parent, comp->iter, lineno, col_offset);
328 fstring_shift_seq_locations(parent, comp->ifs, lineno, col_offset);
329}
330
331static void fstring_shift_argument(expr_ty parent, arg_ty arg, int lineno, int col_offset) {
332 if (arg->annotation != NULL){
333 shift_expr(parent, arg->annotation, lineno, col_offset);
334 }
335 arg->col_offset = arg->col_offset + col_offset;
336 arg->end_col_offset = arg->end_col_offset + col_offset;
337 arg->lineno = arg->lineno + lineno;
338 arg->end_lineno = arg->end_lineno + lineno;
339}
340
341static void fstring_shift_arguments(expr_ty parent, arguments_ty args, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100342 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->posonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100343 arg_ty arg = asdl_seq_GET(args->posonlyargs, i);
344 shift_arg(parent, arg, lineno, col_offset);
345 }
346
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100347 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->args); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100348 arg_ty arg = asdl_seq_GET(args->args, i);
349 shift_arg(parent, arg, lineno, col_offset);
350 }
351
352 if (args->vararg != NULL) {
353 shift_arg(parent, args->vararg, lineno, col_offset);
354 }
355
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100356 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->kwonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100357 arg_ty arg = asdl_seq_GET(args->kwonlyargs, i);
358 shift_arg(parent, arg, lineno, col_offset);
359 }
360
361 fstring_shift_seq_locations(parent, args->kw_defaults, lineno, col_offset);
362
363 if (args->kwarg != NULL) {
364 shift_arg(parent, args->kwarg, lineno, col_offset);
365 }
366
367 fstring_shift_seq_locations(parent, args->defaults, lineno, col_offset);
368}
369
370static void fstring_shift_children_locations(expr_ty n, int lineno, int col_offset) {
371 switch (n->kind) {
372 case BoolOp_kind:
373 fstring_shift_seq_locations(n, n->v.BoolOp.values, lineno, col_offset);
374 break;
375 case NamedExpr_kind:
376 shift_expr(n, n->v.NamedExpr.target, lineno, col_offset);
377 shift_expr(n, n->v.NamedExpr.value, lineno, col_offset);
378 break;
379 case BinOp_kind:
380 shift_expr(n, n->v.BinOp.left, lineno, col_offset);
381 shift_expr(n, n->v.BinOp.right, lineno, col_offset);
382 break;
383 case UnaryOp_kind:
384 shift_expr(n, n->v.UnaryOp.operand, lineno, col_offset);
385 break;
386 case Lambda_kind:
387 fstring_shift_arguments(n, n->v.Lambda.args, lineno, col_offset);
388 shift_expr(n, n->v.Lambda.body, lineno, col_offset);
389 break;
390 case IfExp_kind:
391 shift_expr(n, n->v.IfExp.test, lineno, col_offset);
392 shift_expr(n, n->v.IfExp.body, lineno, col_offset);
393 shift_expr(n, n->v.IfExp.orelse, lineno, col_offset);
394 break;
395 case Dict_kind:
396 fstring_shift_seq_locations(n, n->v.Dict.keys, lineno, col_offset);
397 fstring_shift_seq_locations(n, n->v.Dict.values, lineno, col_offset);
398 break;
399 case Set_kind:
400 fstring_shift_seq_locations(n, n->v.Set.elts, lineno, col_offset);
401 break;
402 case ListComp_kind:
403 shift_expr(n, n->v.ListComp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100404 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.ListComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100405 comprehension_ty comp = asdl_seq_GET(n->v.ListComp.generators, i);
406 fstring_shift_comprehension(n, comp, lineno, col_offset);
407 }
408 break;
409 case SetComp_kind:
410 shift_expr(n, n->v.SetComp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100411 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.SetComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100412 comprehension_ty comp = asdl_seq_GET(n->v.SetComp.generators, i);
413 fstring_shift_comprehension(n, comp, lineno, col_offset);
414 }
415 break;
416 case DictComp_kind:
417 shift_expr(n, n->v.DictComp.key, lineno, col_offset);
418 shift_expr(n, n->v.DictComp.value, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100419 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.DictComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100420 comprehension_ty comp = asdl_seq_GET(n->v.DictComp.generators, i);
421 fstring_shift_comprehension(n, comp, lineno, col_offset);
422 }
423 break;
424 case GeneratorExp_kind:
425 shift_expr(n, n->v.GeneratorExp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100426 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.GeneratorExp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100427 comprehension_ty comp = asdl_seq_GET(n->v.GeneratorExp.generators, i);
428 fstring_shift_comprehension(n, comp, lineno, col_offset);
429 }
430 break;
431 case Await_kind:
432 shift_expr(n, n->v.Await.value, lineno, col_offset);
433 break;
434 case Yield_kind:
435 shift_expr(n, n->v.Yield.value, lineno, col_offset);
436 break;
437 case YieldFrom_kind:
438 shift_expr(n, n->v.YieldFrom.value, lineno, col_offset);
439 break;
440 case Compare_kind:
441 shift_expr(n, n->v.Compare.left, lineno, col_offset);
442 fstring_shift_seq_locations(n, n->v.Compare.comparators, lineno, col_offset);
443 break;
444 case Call_kind:
445 shift_expr(n, n->v.Call.func, lineno, col_offset);
446 fstring_shift_seq_locations(n, n->v.Call.args, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100447 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.Call.keywords); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100448 keyword_ty keyword = asdl_seq_GET(n->v.Call.keywords, i);
449 shift_expr(n, keyword->value, lineno, col_offset);
450 }
451 break;
452 case Attribute_kind:
453 shift_expr(n, n->v.Attribute.value, lineno, col_offset);
454 break;
455 case Subscript_kind:
456 shift_expr(n, n->v.Subscript.value, lineno, col_offset);
457 fstring_shift_slice_locations(n, n->v.Subscript.slice, lineno, col_offset);
458 shift_expr(n, n->v.Subscript.slice, lineno, col_offset);
459 break;
460 case Starred_kind:
461 shift_expr(n, n->v.Starred.value, lineno, col_offset);
462 break;
463 case List_kind:
464 fstring_shift_seq_locations(n, n->v.List.elts, lineno, col_offset);
465 break;
466 case Tuple_kind:
467 fstring_shift_seq_locations(n, n->v.Tuple.elts, lineno, col_offset);
468 break;
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300469 case JoinedStr_kind:
470 fstring_shift_seq_locations(n, n->v.JoinedStr.values, lineno, col_offset);
471 break;
472 case FormattedValue_kind:
473 shift_expr(n, n->v.FormattedValue.value, lineno, col_offset);
474 if (n->v.FormattedValue.format_spec) {
475 shift_expr(n, n->v.FormattedValue.format_spec, lineno, col_offset);
476 }
477 break;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100478 default:
479 return;
480 }
481}
482
483/* Shift locations for the given node and all its children by adding `lineno`
484 and `col_offset` to existing locations. Note that n is the already parsed
485 expression. */
486static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset)
487{
488 n->col_offset = n->col_offset + col_offset;
489
490 // The following is needed, in order for nodes spanning across multiple lines
491 // to be shifted correctly. An example of such a node is a Call node, the closing
492 // parenthesis of which is not on the same line as its name.
493 if (n->lineno == n->end_lineno) {
494 n->end_col_offset = n->end_col_offset + col_offset;
495 }
496
497 fstring_shift_children_locations(n, lineno, col_offset);
498 n->lineno = n->lineno + lineno;
499 n->end_lineno = n->end_lineno + lineno;
500}
501
502/* Fix locations for the given node and its children.
503
504 `parent` is the enclosing node.
505 `n` is the node which locations are going to be fixed relative to parent.
506 `expr_str` is the child node's string representation, including braces.
507*/
508static void
509fstring_fix_expr_location(Token *parent, expr_ty n, char *expr_str)
510{
511 char *substr = NULL;
512 char *start;
513 int lines = 0;
514 int cols = 0;
515
516 if (parent && parent->bytes) {
517 char *parent_str = PyBytes_AsString(parent->bytes);
518 if (!parent_str) {
519 return;
520 }
521 substr = strstr(parent_str, expr_str);
522 if (substr) {
523 // The following is needed, in order to correctly shift the column
524 // offset, in the case that (disregarding any whitespace) a newline
525 // immediately follows the opening curly brace of the fstring expression.
526 int newline_after_brace = 1;
527 start = substr + 1;
528 while (start && *start != '}' && *start != '\n') {
529 if (*start != ' ' && *start != '\t' && *start != '\f') {
530 newline_after_brace = 0;
531 break;
532 }
533 start++;
534 }
535
536 // Account for the characters from the last newline character to our
537 // left until the beginning of substr.
538 if (!newline_after_brace) {
539 start = substr;
540 while (start > parent_str && *start != '\n') {
541 start--;
542 }
543 cols += (int)(substr - start);
544 }
545 /* adjust the start based on the number of newlines encountered
546 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100547 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100548 if (*p == '\n') {
549 lines++;
550 }
551 }
552 }
553 }
554 fstring_shift_expr_locations(n, lines, cols);
555}
556
557
558/* Compile this expression in to an expr_ty. Add parens around the
559 expression, in order to allow leading spaces in the expression. */
560static expr_ty
561fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
562 Token *t)
563{
564 expr_ty expr = NULL;
565 char *str;
566 Py_ssize_t len;
567 const char *s;
568 expr_ty result = NULL;
569
570 assert(expr_end >= expr_start);
571 assert(*(expr_start-1) == '{');
572 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
573 *expr_end == '=');
574
575 /* If the substring is all whitespace, it's an error. We need to catch this
576 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
577 because turning the expression '' in to '()' would go from being invalid
578 to valid. */
579 for (s = expr_start; s != expr_end; s++) {
580 char c = *s;
581 /* The Python parser ignores only the following whitespace
582 characters (\r already is converted to \n). */
583 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
584 break;
585 }
586 }
587 if (s == expr_end) {
588 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
589 return NULL;
590 }
591
592 len = expr_end - expr_start;
593 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
594 str = PyMem_RawMalloc(len + 3);
595 if (str == NULL) {
596 PyErr_NoMemory();
597 return NULL;
598 }
599
600 str[0] = '(';
601 memcpy(str+1, expr_start, len);
602 str[len+1] = ')';
603 str[len+2] = 0;
604
605 struct tok_state* tok = PyTokenizer_FromString(str, 1);
606 if (tok == NULL) {
607 return NULL;
608 }
Lysandros Nikolaou791a46e2020-05-26 04:24:31 +0300609 Py_INCREF(p->tok->filename);
610 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100611
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300612 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
613 NULL, p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100614 p2->starting_lineno = p->starting_lineno + p->tok->first_lineno - 1;
615 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno
616 ? p->starting_col_offset + t->col_offset : 0;
617
618 expr = _PyPegen_run_parser(p2);
619
620 if (expr == NULL) {
621 goto exit;
622 }
623
624 /* Reuse str to find the correct column offset. */
625 str[0] = '{';
626 str[len+1] = '}';
627 fstring_fix_expr_location(t, expr, str);
628
629 result = expr;
630
631exit:
632 _PyPegen_Parser_Free(p2);
633 PyTokenizer_Free(tok);
634 return result;
635}
636
637/* Return -1 on error.
638
639 Return 0 if we reached the end of the literal.
640
641 Return 1 if we haven't reached the end of the literal, but we want
642 the caller to process the literal up to this point. Used for
643 doubled braces.
644*/
645static int
646fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300647 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100648{
649 /* Get any literal string. It ends when we hit an un-doubled left
650 brace (which isn't part of a unicode name escape such as
651 "\N{EULER CONSTANT}"), or the end of the string. */
652
653 const char *s = *str;
654 const char *literal_start = s;
655 int result = 0;
656
657 assert(*literal == NULL);
658 while (s < end) {
659 char ch = *s++;
660 if (!raw && ch == '\\' && s < end) {
661 ch = *s++;
662 if (ch == 'N') {
663 if (s < end && *s++ == '{') {
664 while (s < end && *s++ != '}') {
665 }
666 continue;
667 }
668 break;
669 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300670 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100671 return -1;
672 }
673 }
674 if (ch == '{' || ch == '}') {
675 /* Check for doubled braces, but only at the top level. If
676 we checked at every level, then f'{0:{3}}' would fail
677 with the two closing braces. */
678 if (recurse_lvl == 0) {
679 if (s < end && *s == ch) {
680 /* We're going to tell the caller that the literal ends
681 here, but that they should continue scanning. But also
682 skip over the second brace when we resume scanning. */
683 *str = s + 1;
684 result = 1;
685 goto done;
686 }
687
688 /* Where a single '{' is the start of a new expression, a
689 single '}' is not allowed. */
690 if (ch == '}') {
691 *str = s - 1;
692 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
693 return -1;
694 }
695 }
696 /* We're either at a '{', which means we're starting another
697 expression; or a '}', which means we're at the end of this
698 f-string (for a nested format_spec). */
699 s--;
700 break;
701 }
702 }
703 *str = s;
704 assert(s <= end);
705 assert(s == end || *s == '{' || *s == '}');
706done:
707 if (literal_start != s) {
708 if (raw)
709 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
710 s - literal_start,
711 NULL, NULL);
712 else
713 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300714 s - literal_start, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100715 if (!*literal)
716 return -1;
717 }
718 return result;
719}
720
721/* Forward declaration because parsing is recursive. */
722static expr_ty
723fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
724 Token *first_token, Token* t, Token *last_token);
725
726/* Parse the f-string at *str, ending at end. We know *str starts an
727 expression (so it must be a '{'). Returns the FormattedValue node, which
728 includes the expression, conversion character, format_spec expression, and
729 optionally the text of the expression (if = is used).
730
731 Note that I don't do a perfect job here: I don't make sure that a
732 closing brace doesn't match an opening paren, for example. It
733 doesn't need to error on all invalid expressions, just correctly
734 find the end of all valid ones. Any errors inside the expression
735 will be caught when we parse it later.
736
737 *expression is set to the expression. For an '=' "debug" expression,
738 *expr_text is set to the debug text (the original text of the expression,
739 including the '=' and any whitespace around it, as a string object). If
740 not a debug expression, *expr_text set to NULL. */
741static int
742fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
743 PyObject **expr_text, expr_ty *expression, Token *first_token,
744 Token *t, Token *last_token)
745{
746 /* Return -1 on error, else 0. */
747
748 const char *expr_start;
749 const char *expr_end;
750 expr_ty simple_expression;
751 expr_ty format_spec = NULL; /* Optional format specifier. */
752 int conversion = -1; /* The conversion char. Use default if not
753 specified, or !r if using = and no format
754 spec. */
755
756 /* 0 if we're not in a string, else the quote char we're trying to
757 match (single or double quote). */
758 char quote_char = 0;
759
760 /* If we're inside a string, 1=normal, 3=triple-quoted. */
761 int string_type = 0;
762
763 /* Keep track of nesting level for braces/parens/brackets in
764 expressions. */
765 Py_ssize_t nested_depth = 0;
766 char parenstack[MAXLEVEL];
767
768 *expr_text = NULL;
769
770 /* Can only nest one level deep. */
771 if (recurse_lvl >= 2) {
772 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
773 goto error;
774 }
775
776 /* The first char must be a left brace, or we wouldn't have gotten
777 here. Skip over it. */
778 assert(**str == '{');
779 *str += 1;
780
781 expr_start = *str;
782 for (; *str < end; (*str)++) {
783 char ch;
784
785 /* Loop invariants. */
786 assert(nested_depth >= 0);
787 assert(*str >= expr_start && *str < end);
788 if (quote_char)
789 assert(string_type == 1 || string_type == 3);
790 else
791 assert(string_type == 0);
792
793 ch = **str;
794 /* Nowhere inside an expression is a backslash allowed. */
795 if (ch == '\\') {
796 /* Error: can't include a backslash character, inside
797 parens or strings or not. */
798 RAISE_SYNTAX_ERROR(
799 "f-string expression part "
800 "cannot include a backslash");
801 goto error;
802 }
803 if (quote_char) {
804 /* We're inside a string. See if we're at the end. */
805 /* This code needs to implement the same non-error logic
806 as tok_get from tokenizer.c, at the letter_quote
807 label. To actually share that code would be a
808 nightmare. But, it's unlikely to change and is small,
809 so duplicate it here. Note we don't need to catch all
810 of the errors, since they'll be caught when parsing the
811 expression. We just need to match the non-error
812 cases. Thus we can ignore \n in single-quoted strings,
813 for example. Or non-terminated strings. */
814 if (ch == quote_char) {
815 /* Does this match the string_type (single or triple
816 quoted)? */
817 if (string_type == 3) {
818 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
819 /* We're at the end of a triple quoted string. */
820 *str += 2;
821 string_type = 0;
822 quote_char = 0;
823 continue;
824 }
825 } else {
826 /* We're at the end of a normal string. */
827 quote_char = 0;
828 string_type = 0;
829 continue;
830 }
831 }
832 } else if (ch == '\'' || ch == '"') {
833 /* Is this a triple quoted string? */
834 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
835 string_type = 3;
836 *str += 2;
837 } else {
838 /* Start of a normal string. */
839 string_type = 1;
840 }
841 /* Start looking for the end of the string. */
842 quote_char = ch;
843 } else if (ch == '[' || ch == '{' || ch == '(') {
844 if (nested_depth >= MAXLEVEL) {
845 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
846 goto error;
847 }
848 parenstack[nested_depth] = ch;
849 nested_depth++;
850 } else if (ch == '#') {
851 /* Error: can't include a comment character, inside parens
852 or not. */
853 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
854 goto error;
855 } else if (nested_depth == 0 &&
856 (ch == '!' || ch == ':' || ch == '}' ||
857 ch == '=' || ch == '>' || ch == '<')) {
858 /* See if there's a next character. */
859 if (*str+1 < end) {
860 char next = *(*str+1);
861
862 /* For "!=". since '=' is not an allowed conversion character,
863 nothing is lost in this test. */
864 if ((ch == '!' && next == '=') || /* != */
865 (ch == '=' && next == '=') || /* == */
866 (ch == '<' && next == '=') || /* <= */
867 (ch == '>' && next == '=') /* >= */
868 ) {
869 *str += 1;
870 continue;
871 }
872 /* Don't get out of the loop for these, if they're single
873 chars (not part of 2-char tokens). If by themselves, they
874 don't end an expression (unlike say '!'). */
875 if (ch == '>' || ch == '<') {
876 continue;
877 }
878 }
879
880 /* Normal way out of this loop. */
881 break;
882 } else if (ch == ']' || ch == '}' || ch == ')') {
883 if (!nested_depth) {
884 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
885 goto error;
886 }
887 nested_depth--;
888 int opening = parenstack[nested_depth];
889 if (!((opening == '(' && ch == ')') ||
890 (opening == '[' && ch == ']') ||
891 (opening == '{' && ch == '}')))
892 {
893 RAISE_SYNTAX_ERROR(
894 "f-string: closing parenthesis '%c' "
895 "does not match opening parenthesis '%c'",
896 ch, opening);
897 goto error;
898 }
899 } else {
900 /* Just consume this char and loop around. */
901 }
902 }
903 expr_end = *str;
904 /* If we leave this loop in a string or with mismatched parens, we
905 don't care. We'll get a syntax error when compiling the
906 expression. But, we can produce a better error message, so
907 let's just do that.*/
908 if (quote_char) {
909 RAISE_SYNTAX_ERROR("f-string: unterminated string");
910 goto error;
911 }
912 if (nested_depth) {
913 int opening = parenstack[nested_depth - 1];
914 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
915 goto error;
916 }
917
918 if (*str >= end)
919 goto unexpected_end_of_string;
920
921 /* Compile the expression as soon as possible, so we show errors
922 related to the expression before errors related to the
923 conversion or format_spec. */
924 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
925 if (!simple_expression)
926 goto error;
927
928 /* Check for =, which puts the text value of the expression in
929 expr_text. */
930 if (**str == '=') {
Pablo Galindo9b838292020-05-27 22:01:11 +0100931 if (p->feature_version < 8) {
932 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
933 "only supported in Python 3.8 and greater");
934 goto error;
935 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100936 *str += 1;
937
938 /* Skip over ASCII whitespace. No need to test for end of string
939 here, since we know there's at least a trailing quote somewhere
940 ahead. */
941 while (Py_ISSPACE(**str)) {
942 *str += 1;
943 }
944
945 /* Set *expr_text to the text of the expression. */
946 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
947 if (!*expr_text) {
948 goto error;
949 }
950 }
951
952 /* Check for a conversion char, if present. */
953 if (**str == '!') {
954 *str += 1;
955 if (*str >= end)
956 goto unexpected_end_of_string;
957
958 conversion = **str;
959 *str += 1;
960
961 /* Validate the conversion. */
962 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
963 RAISE_SYNTAX_ERROR(
964 "f-string: invalid conversion character: "
965 "expected 's', 'r', or 'a'");
966 goto error;
967 }
968
969 }
970
971 /* Check for the format spec, if present. */
972 if (*str >= end)
973 goto unexpected_end_of_string;
974 if (**str == ':') {
975 *str += 1;
976 if (*str >= end)
977 goto unexpected_end_of_string;
978
979 /* Parse the format spec. */
980 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
981 first_token, t, last_token);
982 if (!format_spec)
983 goto error;
984 }
985
986 if (*str >= end || **str != '}')
987 goto unexpected_end_of_string;
988
989 /* We're at a right brace. Consume it. */
990 assert(*str < end);
991 assert(**str == '}');
992 *str += 1;
993
994 /* If we're in = mode (detected by non-NULL expr_text), and have no format
995 spec and no explicit conversion, set the conversion to 'r'. */
996 if (*expr_text && format_spec == NULL && conversion == -1) {
997 conversion = 'r';
998 }
999
1000 /* And now create the FormattedValue node that represents this
1001 entire expression with the conversion and format spec. */
1002 //TODO: Fix this
1003 *expression = FormattedValue(simple_expression, conversion,
1004 format_spec, first_token->lineno,
1005 first_token->col_offset, last_token->end_lineno,
1006 last_token->end_col_offset, p->arena);
1007 if (!*expression)
1008 goto error;
1009
1010 return 0;
1011
1012unexpected_end_of_string:
1013 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1014 /* Falls through to error. */
1015
1016error:
1017 Py_XDECREF(*expr_text);
1018 return -1;
1019
1020}
1021
1022/* Return -1 on error.
1023
1024 Return 0 if we have a literal (possible zero length) and an
1025 expression (zero length if at the end of the string.
1026
1027 Return 1 if we have a literal, but no expression, and we want the
1028 caller to call us again. This is used to deal with doubled
1029 braces.
1030
1031 When called multiple times on the string 'a{{b{0}c', this function
1032 will return:
1033
1034 1. the literal 'a{' with no expression, and a return value
1035 of 1. Despite the fact that there's no expression, the return
1036 value of 1 means we're not finished yet.
1037
1038 2. the literal 'b' and the expression '0', with a return value of
1039 0. The fact that there's an expression means we're not finished.
1040
1041 3. literal 'c' with no expression and a return value of 0. The
1042 combination of the return value of 0 with no expression means
1043 we're finished.
1044*/
1045static int
1046fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
1047 int recurse_lvl, PyObject **literal,
1048 PyObject **expr_text, expr_ty *expression,
1049 Token *first_token, Token *t, Token *last_token)
1050{
1051 int result;
1052
1053 assert(*literal == NULL && *expression == NULL);
1054
1055 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +03001056 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001057 if (result < 0)
1058 goto error;
1059
1060 assert(result == 0 || result == 1);
1061
1062 if (result == 1)
1063 /* We have a literal, but don't look at the expression. */
1064 return 1;
1065
1066 if (*str >= end || **str == '}')
1067 /* We're at the end of the string or the end of a nested
1068 f-string: no expression. The top-level error case where we
1069 expect to be at the end of the string but we're at a '}' is
1070 handled later. */
1071 return 0;
1072
1073 /* We must now be the start of an expression, on a '{'. */
1074 assert(**str == '{');
1075
1076 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
1077 expression, first_token, t, last_token) < 0)
1078 goto error;
1079
1080 return 0;
1081
1082error:
1083 Py_CLEAR(*literal);
1084 return -1;
1085}
1086
1087#ifdef NDEBUG
1088#define ExprList_check_invariants(l)
1089#else
1090static void
1091ExprList_check_invariants(ExprList *l)
1092{
1093 /* Check our invariants. Make sure this object is "live", and
1094 hasn't been deallocated. */
1095 assert(l->size >= 0);
1096 assert(l->p != NULL);
1097 if (l->size <= EXPRLIST_N_CACHED)
1098 assert(l->data == l->p);
1099}
1100#endif
1101
1102static void
1103ExprList_Init(ExprList *l)
1104{
1105 l->allocated = EXPRLIST_N_CACHED;
1106 l->size = 0;
1107
1108 /* Until we start allocating dynamically, p points to data. */
1109 l->p = l->data;
1110
1111 ExprList_check_invariants(l);
1112}
1113
1114static int
1115ExprList_Append(ExprList *l, expr_ty exp)
1116{
1117 ExprList_check_invariants(l);
1118 if (l->size >= l->allocated) {
1119 /* We need to alloc (or realloc) the memory. */
1120 Py_ssize_t new_size = l->allocated * 2;
1121
1122 /* See if we've ever allocated anything dynamically. */
1123 if (l->p == l->data) {
1124 Py_ssize_t i;
1125 /* We're still using the cached data. Switch to
1126 alloc-ing. */
1127 l->p = PyMem_RawMalloc(sizeof(expr_ty) * new_size);
1128 if (!l->p)
1129 return -1;
1130 /* Copy the cached data into the new buffer. */
1131 for (i = 0; i < l->size; i++)
1132 l->p[i] = l->data[i];
1133 } else {
1134 /* Just realloc. */
1135 expr_ty *tmp = PyMem_RawRealloc(l->p, sizeof(expr_ty) * new_size);
1136 if (!tmp) {
1137 PyMem_RawFree(l->p);
1138 l->p = NULL;
1139 return -1;
1140 }
1141 l->p = tmp;
1142 }
1143
1144 l->allocated = new_size;
1145 assert(l->allocated == 2 * l->size);
1146 }
1147
1148 l->p[l->size++] = exp;
1149
1150 ExprList_check_invariants(l);
1151 return 0;
1152}
1153
1154static void
1155ExprList_Dealloc(ExprList *l)
1156{
1157 ExprList_check_invariants(l);
1158
1159 /* If there's been an error, or we've never dynamically allocated,
1160 do nothing. */
1161 if (!l->p || l->p == l->data) {
1162 /* Do nothing. */
1163 } else {
1164 /* We have dynamically allocated. Free the memory. */
1165 PyMem_RawFree(l->p);
1166 }
1167 l->p = NULL;
1168 l->size = -1;
1169}
1170
1171static asdl_seq *
1172ExprList_Finish(ExprList *l, PyArena *arena)
1173{
1174 asdl_seq *seq;
1175
1176 ExprList_check_invariants(l);
1177
1178 /* Allocate the asdl_seq and copy the expressions in to it. */
1179 seq = _Py_asdl_seq_new(l->size, arena);
1180 if (seq) {
1181 Py_ssize_t i;
1182 for (i = 0; i < l->size; i++)
1183 asdl_seq_SET(seq, i, l->p[i]);
1184 }
1185 ExprList_Dealloc(l);
1186 return seq;
1187}
1188
1189#ifdef NDEBUG
1190#define FstringParser_check_invariants(state)
1191#else
1192static void
1193FstringParser_check_invariants(FstringParser *state)
1194{
1195 if (state->last_str)
1196 assert(PyUnicode_CheckExact(state->last_str));
1197 ExprList_check_invariants(&state->expr_list);
1198}
1199#endif
1200
1201void
1202_PyPegen_FstringParser_Init(FstringParser *state)
1203{
1204 state->last_str = NULL;
1205 state->fmode = 0;
1206 ExprList_Init(&state->expr_list);
1207 FstringParser_check_invariants(state);
1208}
1209
1210void
1211_PyPegen_FstringParser_Dealloc(FstringParser *state)
1212{
1213 FstringParser_check_invariants(state);
1214
1215 Py_XDECREF(state->last_str);
1216 ExprList_Dealloc(&state->expr_list);
1217}
1218
1219/* Make a Constant node, but decref the PyUnicode object being added. */
1220static expr_ty
1221make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1222{
1223 PyObject *s = *str;
1224 PyObject *kind = NULL;
1225 *str = NULL;
1226 assert(PyUnicode_CheckExact(s));
1227 if (PyArena_AddPyObject(p->arena, s) < 0) {
1228 Py_DECREF(s);
1229 return NULL;
1230 }
1231 const char* the_str = PyBytes_AsString(first_token->bytes);
1232 if (the_str && the_str[0] == 'u') {
1233 kind = _PyPegen_new_identifier(p, "u");
1234 }
1235
1236 if (kind == NULL && PyErr_Occurred()) {
1237 return NULL;
1238 }
1239
1240 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1241 last_token->end_lineno, last_token->end_col_offset, p->arena);
1242
1243}
1244
1245
1246/* Add a non-f-string (that is, a regular literal string). str is
1247 decref'd. */
1248int
1249_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1250{
1251 FstringParser_check_invariants(state);
1252
1253 assert(PyUnicode_CheckExact(str));
1254
1255 if (PyUnicode_GET_LENGTH(str) == 0) {
1256 Py_DECREF(str);
1257 return 0;
1258 }
1259
1260 if (!state->last_str) {
1261 /* We didn't have a string before, so just remember this one. */
1262 state->last_str = str;
1263 } else {
1264 /* Concatenate this with the previous string. */
1265 PyUnicode_AppendAndDel(&state->last_str, str);
1266 if (!state->last_str)
1267 return -1;
1268 }
1269 FstringParser_check_invariants(state);
1270 return 0;
1271}
1272
1273/* Parse an f-string. The f-string is in *str to end, with no
1274 'f' or quotes. */
1275int
1276_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1277 const char *end, int raw, int recurse_lvl,
1278 Token *first_token, Token* t, Token *last_token)
1279{
1280 FstringParser_check_invariants(state);
1281 state->fmode = 1;
1282
1283 /* Parse the f-string. */
1284 while (1) {
1285 PyObject *literal = NULL;
1286 PyObject *expr_text = NULL;
1287 expr_ty expression = NULL;
1288
1289 /* If there's a zero length literal in front of the
1290 expression, literal will be NULL. If we're at the end of
1291 the f-string, expression will be NULL (unless result == 1,
1292 see below). */
1293 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1294 &literal, &expr_text,
1295 &expression, first_token, t, last_token);
1296 if (result < 0)
1297 return -1;
1298
1299 /* Add the literal, if any. */
1300 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1301 Py_XDECREF(expr_text);
1302 return -1;
1303 }
1304 /* Add the expr_text, if any. */
1305 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1306 return -1;
1307 }
1308
1309 /* We've dealt with the literal and expr_text, their ownership has
1310 been transferred to the state object. Don't look at them again. */
1311
1312 /* See if we should just loop around to get the next literal
1313 and expression, while ignoring the expression this
1314 time. This is used for un-doubling braces, as an
1315 optimization. */
1316 if (result == 1)
1317 continue;
1318
1319 if (!expression)
1320 /* We're done with this f-string. */
1321 break;
1322
1323 /* We know we have an expression. Convert any existing string
1324 to a Constant node. */
1325 if (!state->last_str) {
1326 /* Do nothing. No previous literal. */
1327 } else {
1328 /* Convert the existing last_str literal to a Constant node. */
1329 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1330 if (!str || ExprList_Append(&state->expr_list, str) < 0)
1331 return -1;
1332 }
1333
1334 if (ExprList_Append(&state->expr_list, expression) < 0)
1335 return -1;
1336 }
1337
1338 /* If recurse_lvl is zero, then we must be at the end of the
1339 string. Otherwise, we must be at a right brace. */
1340
1341 if (recurse_lvl == 0 && *str < end-1) {
1342 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1343 return -1;
1344 }
1345 if (recurse_lvl != 0 && **str != '}') {
1346 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1347 return -1;
1348 }
1349
1350 FstringParser_check_invariants(state);
1351 return 0;
1352}
1353
1354/* Convert the partial state reflected in last_str and expr_list to an
1355 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1356expr_ty
1357_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1358 Token *last_token)
1359{
1360 asdl_seq *seq;
1361
1362 FstringParser_check_invariants(state);
1363
1364 /* If we're just a constant string with no expressions, return
1365 that. */
1366 if (!state->fmode) {
1367 assert(!state->expr_list.size);
1368 if (!state->last_str) {
1369 /* Create a zero length string. */
1370 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1371 if (!state->last_str)
1372 goto error;
1373 }
1374 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1375 }
1376
1377 /* Create a Constant node out of last_str, if needed. It will be the
1378 last node in our expression list. */
1379 if (state->last_str) {
1380 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1381 if (!str || ExprList_Append(&state->expr_list, str) < 0)
1382 goto error;
1383 }
1384 /* This has already been freed. */
1385 assert(state->last_str == NULL);
1386
1387 seq = ExprList_Finish(&state->expr_list, p->arena);
1388 if (!seq)
1389 goto error;
1390
1391 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1392 last_token->end_lineno, last_token->end_col_offset, p->arena);
1393
1394error:
1395 _PyPegen_FstringParser_Dealloc(state);
1396 return NULL;
1397}
1398
1399/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1400 at end, parse it into an expr_ty. Return NULL on error. Adjust
1401 str to point past the parsed portion. */
1402static expr_ty
1403fstring_parse(Parser *p, const char **str, const char *end, int raw,
1404 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1405{
1406 FstringParser state;
1407
1408 _PyPegen_FstringParser_Init(&state);
1409 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1410 first_token, t, last_token) < 0) {
1411 _PyPegen_FstringParser_Dealloc(&state);
1412 return NULL;
1413 }
1414
1415 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1416}