blob: ca4b733c153b57b427cb73b2d3756725942d2dde [file] [log] [blame]
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001#include <Python.h>
2
3#include "../tokenizer.h"
4#include "pegen.h"
5#include "parse_string.h"
6
7//// STRING HANDLING FUNCTIONS ////
8
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser nodes
// to pass around and the usage of some specialized APIs present only in this
// file (like "_PyPegen_raise_syntax_error").
13
14static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030015warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010016{
17 PyObject *msg =
18 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
19 if (msg == NULL) {
20 return -1;
21 }
22 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030023 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010024 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
25 /* Replace the DeprecationWarning exception with a SyntaxError
26 to get a more accurate error report */
27 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030028
29 /* This is needed, in order for the SyntaxError to point to the token t,
30 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
31 error location, if p->known_err_token is not set. */
32 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010033 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
34 }
35 Py_DECREF(msg);
36 return -1;
37 }
38 Py_DECREF(msg);
39 return 0;
40}
41
42static PyObject *
43decode_utf8(const char **sPtr, const char *end)
44{
45 const char *s, *t;
46 t = s = *sPtr;
47 while (s < end && (*s & 0x80)) {
48 s++;
49 }
50 *sPtr = s;
51 return PyUnicode_DecodeUTF8(t, s - t, NULL);
52}
53
54static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030055decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010056{
57 PyObject *v, *u;
58 char *buf;
59 char *p;
60 const char *end;
61
62 /* check for integer overflow */
63 if (len > SIZE_MAX / 6) {
64 return NULL;
65 }
66 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
67 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
68 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
69 if (u == NULL) {
70 return NULL;
71 }
72 p = buf = PyBytes_AsString(u);
73 end = s + len;
74 while (s < end) {
75 if (*s == '\\') {
76 *p++ = *s++;
77 if (s >= end || *s & 0x80) {
78 strcpy(p, "u005c");
79 p += 5;
80 if (s >= end) {
81 break;
82 }
83 }
84 }
85 if (*s & 0x80) {
86 PyObject *w;
87 int kind;
88 void *data;
89 Py_ssize_t len, i;
90 w = decode_utf8(&s, end);
91 if (w == NULL) {
92 Py_DECREF(u);
93 return NULL;
94 }
95 kind = PyUnicode_KIND(w);
96 data = PyUnicode_DATA(w);
97 len = PyUnicode_GET_LENGTH(w);
98 for (i = 0; i < len; i++) {
99 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
100 sprintf(p, "\\U%08x", chr);
101 p += 10;
102 }
103 /* Should be impossible to overflow */
104 assert(p - buf <= PyBytes_GET_SIZE(u));
105 Py_DECREF(w);
106 }
107 else {
108 *p++ = *s++;
109 }
110 }
111 len = p - buf;
112 s = buf;
113
114 const char *first_invalid_escape;
115 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
116
117 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300118 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100119 /* We have not decref u before because first_invalid_escape points
120 inside u. */
121 Py_XDECREF(u);
122 Py_DECREF(v);
123 return NULL;
124 }
125 }
126 Py_XDECREF(u);
127 return v;
128}
129
130static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300131decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100132{
133 const char *first_invalid_escape;
134 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
135 if (result == NULL) {
136 return NULL;
137 }
138
139 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300140 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100141 Py_DECREF(result);
142 return NULL;
143 }
144 }
145 return result;
146}
147
148/* s must include the bracketing quote characters, and r, b, u,
149 &/or f prefixes (if any), and embedded escape sequences (if any).
150 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
151 If the string is an f-string, set *fstr and *fstrlen to the unparsed
152 string object. Return 0 if no errors occurred. */
153int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300154_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
155 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100156{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157 const char *s = PyBytes_AsString(t->bytes);
158 if (s == NULL) {
159 return -1;
160 }
161
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100162 size_t len;
163 int quote = Py_CHARMASK(*s);
164 int fmode = 0;
165 *bytesmode = 0;
166 *rawmode = 0;
167 *result = NULL;
168 *fstr = NULL;
169 if (Py_ISALPHA(quote)) {
170 while (!*bytesmode || !*rawmode) {
171 if (quote == 'b' || quote == 'B') {
172 quote = *++s;
173 *bytesmode = 1;
174 }
175 else if (quote == 'u' || quote == 'U') {
176 quote = *++s;
177 }
178 else if (quote == 'r' || quote == 'R') {
179 quote = *++s;
180 *rawmode = 1;
181 }
182 else if (quote == 'f' || quote == 'F') {
183 quote = *++s;
184 fmode = 1;
185 }
186 else {
187 break;
188 }
189 }
190 }
191
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300192 /* fstrings are only allowed in Python 3.6 and greater */
193 if (fmode && p->feature_version < 6) {
194 p->error_indicator = 1;
195 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
196 return -1;
197 }
198
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100199 if (fmode && *bytesmode) {
200 PyErr_BadInternalCall();
201 return -1;
202 }
203 if (quote != '\'' && quote != '\"') {
204 PyErr_BadInternalCall();
205 return -1;
206 }
207 /* Skip the leading quote char. */
208 s++;
209 len = strlen(s);
210 if (len > INT_MAX) {
211 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
212 return -1;
213 }
214 if (s[--len] != quote) {
215 /* Last quote char must match the first. */
216 PyErr_BadInternalCall();
217 return -1;
218 }
219 if (len >= 4 && s[0] == quote && s[1] == quote) {
220 /* A triple quoted string. We've already skipped one quote at
221 the start and one at the end of the string. Now skip the
222 two at the start. */
223 s += 2;
224 len -= 2;
225 /* And check that the last two match. */
226 if (s[--len] != quote || s[--len] != quote) {
227 PyErr_BadInternalCall();
228 return -1;
229 }
230 }
231
232 if (fmode) {
233 /* Just return the bytes. The caller will parse the resulting
234 string. */
235 *fstr = s;
236 *fstrlen = len;
237 return 0;
238 }
239
240 /* Not an f-string. */
241 /* Avoid invoking escape decoding routines if possible. */
242 *rawmode = *rawmode || strchr(s, '\\') == NULL;
243 if (*bytesmode) {
244 /* Disallow non-ASCII characters. */
245 const char *ch;
246 for (ch = s; *ch; ch++) {
247 if (Py_CHARMASK(*ch) >= 0x80) {
248 RAISE_SYNTAX_ERROR(
249 "bytes can only contain ASCII "
250 "literal characters.");
251 return -1;
252 }
253 }
254 if (*rawmode) {
255 *result = PyBytes_FromStringAndSize(s, len);
256 }
257 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300258 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100259 }
260 }
261 else {
262 if (*rawmode) {
263 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
264 }
265 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300266 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100267 }
268 }
269 return *result == NULL ? -1 : 0;
270}
271
272
273
274// FSTRING STUFF
275
276static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset);
277static void fstring_shift_argument(expr_ty parent, arg_ty args, int lineno, int col_offset);
278
279
280static inline void shift_expr(expr_ty parent, expr_ty n, int line, int col) {
281 if (parent->lineno < n->lineno) {
282 col = 0;
283 }
284 fstring_shift_expr_locations(n, line, col);
285}
286
287static inline void shift_arg(expr_ty parent, arg_ty n, int line, int col) {
288 if (parent->lineno < n->lineno) {
289 col = 0;
290 }
291 fstring_shift_argument(parent, n, line, col);
292}
293
294static void fstring_shift_seq_locations(expr_ty parent, asdl_seq *seq, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100295 for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100296 expr_ty expr = asdl_seq_GET(seq, i);
297 if (expr == NULL){
298 continue;
299 }
300 shift_expr(parent, expr, lineno, col_offset);
301 }
302}
303
304static void fstring_shift_slice_locations(expr_ty parent, expr_ty slice, int lineno, int col_offset) {
305 switch (slice->kind) {
306 case Slice_kind:
307 if (slice->v.Slice.lower) {
308 shift_expr(parent, slice->v.Slice.lower, lineno, col_offset);
309 }
310 if (slice->v.Slice.upper) {
311 shift_expr(parent, slice->v.Slice.upper, lineno, col_offset);
312 }
313 if (slice->v.Slice.step) {
314 shift_expr(parent, slice->v.Slice.step, lineno, col_offset);
315 }
316 break;
317 case Tuple_kind:
318 fstring_shift_seq_locations(parent, slice->v.Tuple.elts, lineno, col_offset);
319 break;
320 default:
321 break;
322 }
323}
324
325static void fstring_shift_comprehension(expr_ty parent, comprehension_ty comp, int lineno, int col_offset) {
326 shift_expr(parent, comp->target, lineno, col_offset);
327 shift_expr(parent, comp->iter, lineno, col_offset);
328 fstring_shift_seq_locations(parent, comp->ifs, lineno, col_offset);
329}
330
331static void fstring_shift_argument(expr_ty parent, arg_ty arg, int lineno, int col_offset) {
332 if (arg->annotation != NULL){
333 shift_expr(parent, arg->annotation, lineno, col_offset);
334 }
335 arg->col_offset = arg->col_offset + col_offset;
336 arg->end_col_offset = arg->end_col_offset + col_offset;
337 arg->lineno = arg->lineno + lineno;
338 arg->end_lineno = arg->end_lineno + lineno;
339}
340
341static void fstring_shift_arguments(expr_ty parent, arguments_ty args, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100342 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->posonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100343 arg_ty arg = asdl_seq_GET(args->posonlyargs, i);
344 shift_arg(parent, arg, lineno, col_offset);
345 }
346
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100347 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->args); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100348 arg_ty arg = asdl_seq_GET(args->args, i);
349 shift_arg(parent, arg, lineno, col_offset);
350 }
351
352 if (args->vararg != NULL) {
353 shift_arg(parent, args->vararg, lineno, col_offset);
354 }
355
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100356 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->kwonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100357 arg_ty arg = asdl_seq_GET(args->kwonlyargs, i);
358 shift_arg(parent, arg, lineno, col_offset);
359 }
360
361 fstring_shift_seq_locations(parent, args->kw_defaults, lineno, col_offset);
362
363 if (args->kwarg != NULL) {
364 shift_arg(parent, args->kwarg, lineno, col_offset);
365 }
366
367 fstring_shift_seq_locations(parent, args->defaults, lineno, col_offset);
368}
369
370static void fstring_shift_children_locations(expr_ty n, int lineno, int col_offset) {
371 switch (n->kind) {
372 case BoolOp_kind:
373 fstring_shift_seq_locations(n, n->v.BoolOp.values, lineno, col_offset);
374 break;
375 case NamedExpr_kind:
376 shift_expr(n, n->v.NamedExpr.target, lineno, col_offset);
377 shift_expr(n, n->v.NamedExpr.value, lineno, col_offset);
378 break;
379 case BinOp_kind:
380 shift_expr(n, n->v.BinOp.left, lineno, col_offset);
381 shift_expr(n, n->v.BinOp.right, lineno, col_offset);
382 break;
383 case UnaryOp_kind:
384 shift_expr(n, n->v.UnaryOp.operand, lineno, col_offset);
385 break;
386 case Lambda_kind:
387 fstring_shift_arguments(n, n->v.Lambda.args, lineno, col_offset);
388 shift_expr(n, n->v.Lambda.body, lineno, col_offset);
389 break;
390 case IfExp_kind:
391 shift_expr(n, n->v.IfExp.test, lineno, col_offset);
392 shift_expr(n, n->v.IfExp.body, lineno, col_offset);
393 shift_expr(n, n->v.IfExp.orelse, lineno, col_offset);
394 break;
395 case Dict_kind:
396 fstring_shift_seq_locations(n, n->v.Dict.keys, lineno, col_offset);
397 fstring_shift_seq_locations(n, n->v.Dict.values, lineno, col_offset);
398 break;
399 case Set_kind:
400 fstring_shift_seq_locations(n, n->v.Set.elts, lineno, col_offset);
401 break;
402 case ListComp_kind:
403 shift_expr(n, n->v.ListComp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100404 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.ListComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100405 comprehension_ty comp = asdl_seq_GET(n->v.ListComp.generators, i);
406 fstring_shift_comprehension(n, comp, lineno, col_offset);
407 }
408 break;
409 case SetComp_kind:
410 shift_expr(n, n->v.SetComp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100411 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.SetComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100412 comprehension_ty comp = asdl_seq_GET(n->v.SetComp.generators, i);
413 fstring_shift_comprehension(n, comp, lineno, col_offset);
414 }
415 break;
416 case DictComp_kind:
417 shift_expr(n, n->v.DictComp.key, lineno, col_offset);
418 shift_expr(n, n->v.DictComp.value, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100419 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.DictComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100420 comprehension_ty comp = asdl_seq_GET(n->v.DictComp.generators, i);
421 fstring_shift_comprehension(n, comp, lineno, col_offset);
422 }
423 break;
424 case GeneratorExp_kind:
425 shift_expr(n, n->v.GeneratorExp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100426 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.GeneratorExp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100427 comprehension_ty comp = asdl_seq_GET(n->v.GeneratorExp.generators, i);
428 fstring_shift_comprehension(n, comp, lineno, col_offset);
429 }
430 break;
431 case Await_kind:
432 shift_expr(n, n->v.Await.value, lineno, col_offset);
433 break;
434 case Yield_kind:
435 shift_expr(n, n->v.Yield.value, lineno, col_offset);
436 break;
437 case YieldFrom_kind:
438 shift_expr(n, n->v.YieldFrom.value, lineno, col_offset);
439 break;
440 case Compare_kind:
441 shift_expr(n, n->v.Compare.left, lineno, col_offset);
442 fstring_shift_seq_locations(n, n->v.Compare.comparators, lineno, col_offset);
443 break;
444 case Call_kind:
445 shift_expr(n, n->v.Call.func, lineno, col_offset);
446 fstring_shift_seq_locations(n, n->v.Call.args, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100447 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.Call.keywords); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100448 keyword_ty keyword = asdl_seq_GET(n->v.Call.keywords, i);
449 shift_expr(n, keyword->value, lineno, col_offset);
450 }
451 break;
452 case Attribute_kind:
453 shift_expr(n, n->v.Attribute.value, lineno, col_offset);
454 break;
455 case Subscript_kind:
456 shift_expr(n, n->v.Subscript.value, lineno, col_offset);
457 fstring_shift_slice_locations(n, n->v.Subscript.slice, lineno, col_offset);
458 shift_expr(n, n->v.Subscript.slice, lineno, col_offset);
459 break;
460 case Starred_kind:
461 shift_expr(n, n->v.Starred.value, lineno, col_offset);
462 break;
463 case List_kind:
464 fstring_shift_seq_locations(n, n->v.List.elts, lineno, col_offset);
465 break;
466 case Tuple_kind:
467 fstring_shift_seq_locations(n, n->v.Tuple.elts, lineno, col_offset);
468 break;
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300469 case JoinedStr_kind:
470 fstring_shift_seq_locations(n, n->v.JoinedStr.values, lineno, col_offset);
471 break;
472 case FormattedValue_kind:
473 shift_expr(n, n->v.FormattedValue.value, lineno, col_offset);
474 if (n->v.FormattedValue.format_spec) {
475 shift_expr(n, n->v.FormattedValue.format_spec, lineno, col_offset);
476 }
477 break;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100478 default:
479 return;
480 }
481}
482
483/* Shift locations for the given node and all its children by adding `lineno`
484 and `col_offset` to existing locations. Note that n is the already parsed
485 expression. */
486static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset)
487{
488 n->col_offset = n->col_offset + col_offset;
489
490 // The following is needed, in order for nodes spanning across multiple lines
491 // to be shifted correctly. An example of such a node is a Call node, the closing
492 // parenthesis of which is not on the same line as its name.
493 if (n->lineno == n->end_lineno) {
494 n->end_col_offset = n->end_col_offset + col_offset;
495 }
496
497 fstring_shift_children_locations(n, lineno, col_offset);
498 n->lineno = n->lineno + lineno;
499 n->end_lineno = n->end_lineno + lineno;
500}
501
502/* Fix locations for the given node and its children.
503
504 `parent` is the enclosing node.
505 `n` is the node which locations are going to be fixed relative to parent.
506 `expr_str` is the child node's string representation, including braces.
507*/
508static void
509fstring_fix_expr_location(Token *parent, expr_ty n, char *expr_str)
510{
511 char *substr = NULL;
512 char *start;
513 int lines = 0;
514 int cols = 0;
515
516 if (parent && parent->bytes) {
517 char *parent_str = PyBytes_AsString(parent->bytes);
518 if (!parent_str) {
519 return;
520 }
521 substr = strstr(parent_str, expr_str);
522 if (substr) {
523 // The following is needed, in order to correctly shift the column
524 // offset, in the case that (disregarding any whitespace) a newline
525 // immediately follows the opening curly brace of the fstring expression.
526 int newline_after_brace = 1;
527 start = substr + 1;
528 while (start && *start != '}' && *start != '\n') {
529 if (*start != ' ' && *start != '\t' && *start != '\f') {
530 newline_after_brace = 0;
531 break;
532 }
533 start++;
534 }
535
536 // Account for the characters from the last newline character to our
537 // left until the beginning of substr.
538 if (!newline_after_brace) {
539 start = substr;
540 while (start > parent_str && *start != '\n') {
541 start--;
542 }
543 cols += (int)(substr - start);
544 }
545 /* adjust the start based on the number of newlines encountered
546 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100547 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100548 if (*p == '\n') {
549 lines++;
550 }
551 }
552 }
553 }
554 fstring_shift_expr_locations(n, lines, cols);
555}
556
557
558/* Compile this expression in to an expr_ty. Add parens around the
559 expression, in order to allow leading spaces in the expression. */
560static expr_ty
561fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
562 Token *t)
563{
564 expr_ty expr = NULL;
565 char *str;
566 Py_ssize_t len;
567 const char *s;
568 expr_ty result = NULL;
569
570 assert(expr_end >= expr_start);
571 assert(*(expr_start-1) == '{');
572 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
573 *expr_end == '=');
574
575 /* If the substring is all whitespace, it's an error. We need to catch this
576 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
577 because turning the expression '' in to '()' would go from being invalid
578 to valid. */
579 for (s = expr_start; s != expr_end; s++) {
580 char c = *s;
581 /* The Python parser ignores only the following whitespace
582 characters (\r already is converted to \n). */
583 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
584 break;
585 }
586 }
587 if (s == expr_end) {
588 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
589 return NULL;
590 }
591
592 len = expr_end - expr_start;
593 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
594 str = PyMem_RawMalloc(len + 3);
595 if (str == NULL) {
596 PyErr_NoMemory();
597 return NULL;
598 }
599
600 str[0] = '(';
601 memcpy(str+1, expr_start, len);
602 str[len+1] = ')';
603 str[len+2] = 0;
604
605 struct tok_state* tok = PyTokenizer_FromString(str, 1);
606 if (tok == NULL) {
607 return NULL;
608 }
609 tok->filename = PyUnicode_FromString("<fstring>");
610 if (!tok->filename) {
611 PyTokenizer_Free(tok);
612 return NULL;
613 }
614
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300615 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
616 NULL, p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100617 p2->starting_lineno = p->starting_lineno + p->tok->first_lineno - 1;
618 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno
619 ? p->starting_col_offset + t->col_offset : 0;
620
621 expr = _PyPegen_run_parser(p2);
622
623 if (expr == NULL) {
624 goto exit;
625 }
626
627 /* Reuse str to find the correct column offset. */
628 str[0] = '{';
629 str[len+1] = '}';
630 fstring_fix_expr_location(t, expr, str);
631
632 result = expr;
633
634exit:
635 _PyPegen_Parser_Free(p2);
636 PyTokenizer_Free(tok);
637 return result;
638}
639
640/* Return -1 on error.
641
642 Return 0 if we reached the end of the literal.
643
644 Return 1 if we haven't reached the end of the literal, but we want
645 the caller to process the literal up to this point. Used for
646 doubled braces.
647*/
648static int
649fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300650 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100651{
652 /* Get any literal string. It ends when we hit an un-doubled left
653 brace (which isn't part of a unicode name escape such as
654 "\N{EULER CONSTANT}"), or the end of the string. */
655
656 const char *s = *str;
657 const char *literal_start = s;
658 int result = 0;
659
660 assert(*literal == NULL);
661 while (s < end) {
662 char ch = *s++;
663 if (!raw && ch == '\\' && s < end) {
664 ch = *s++;
665 if (ch == 'N') {
666 if (s < end && *s++ == '{') {
667 while (s < end && *s++ != '}') {
668 }
669 continue;
670 }
671 break;
672 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300673 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100674 return -1;
675 }
676 }
677 if (ch == '{' || ch == '}') {
678 /* Check for doubled braces, but only at the top level. If
679 we checked at every level, then f'{0:{3}}' would fail
680 with the two closing braces. */
681 if (recurse_lvl == 0) {
682 if (s < end && *s == ch) {
683 /* We're going to tell the caller that the literal ends
684 here, but that they should continue scanning. But also
685 skip over the second brace when we resume scanning. */
686 *str = s + 1;
687 result = 1;
688 goto done;
689 }
690
691 /* Where a single '{' is the start of a new expression, a
692 single '}' is not allowed. */
693 if (ch == '}') {
694 *str = s - 1;
695 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
696 return -1;
697 }
698 }
699 /* We're either at a '{', which means we're starting another
700 expression; or a '}', which means we're at the end of this
701 f-string (for a nested format_spec). */
702 s--;
703 break;
704 }
705 }
706 *str = s;
707 assert(s <= end);
708 assert(s == end || *s == '{' || *s == '}');
709done:
710 if (literal_start != s) {
711 if (raw)
712 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
713 s - literal_start,
714 NULL, NULL);
715 else
716 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300717 s - literal_start, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100718 if (!*literal)
719 return -1;
720 }
721 return result;
722}
723
724/* Forward declaration because parsing is recursive. */
725static expr_ty
726fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
727 Token *first_token, Token* t, Token *last_token);
728
729/* Parse the f-string at *str, ending at end. We know *str starts an
730 expression (so it must be a '{'). Returns the FormattedValue node, which
731 includes the expression, conversion character, format_spec expression, and
732 optionally the text of the expression (if = is used).
733
734 Note that I don't do a perfect job here: I don't make sure that a
735 closing brace doesn't match an opening paren, for example. It
736 doesn't need to error on all invalid expressions, just correctly
737 find the end of all valid ones. Any errors inside the expression
738 will be caught when we parse it later.
739
740 *expression is set to the expression. For an '=' "debug" expression,
741 *expr_text is set to the debug text (the original text of the expression,
742 including the '=' and any whitespace around it, as a string object). If
743 not a debug expression, *expr_text set to NULL. */
744static int
745fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
746 PyObject **expr_text, expr_ty *expression, Token *first_token,
747 Token *t, Token *last_token)
748{
749 /* Return -1 on error, else 0. */
750
751 const char *expr_start;
752 const char *expr_end;
753 expr_ty simple_expression;
754 expr_ty format_spec = NULL; /* Optional format specifier. */
755 int conversion = -1; /* The conversion char. Use default if not
756 specified, or !r if using = and no format
757 spec. */
758
759 /* 0 if we're not in a string, else the quote char we're trying to
760 match (single or double quote). */
761 char quote_char = 0;
762
763 /* If we're inside a string, 1=normal, 3=triple-quoted. */
764 int string_type = 0;
765
766 /* Keep track of nesting level for braces/parens/brackets in
767 expressions. */
768 Py_ssize_t nested_depth = 0;
769 char parenstack[MAXLEVEL];
770
771 *expr_text = NULL;
772
773 /* Can only nest one level deep. */
774 if (recurse_lvl >= 2) {
775 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
776 goto error;
777 }
778
779 /* The first char must be a left brace, or we wouldn't have gotten
780 here. Skip over it. */
781 assert(**str == '{');
782 *str += 1;
783
784 expr_start = *str;
785 for (; *str < end; (*str)++) {
786 char ch;
787
788 /* Loop invariants. */
789 assert(nested_depth >= 0);
790 assert(*str >= expr_start && *str < end);
791 if (quote_char)
792 assert(string_type == 1 || string_type == 3);
793 else
794 assert(string_type == 0);
795
796 ch = **str;
797 /* Nowhere inside an expression is a backslash allowed. */
798 if (ch == '\\') {
799 /* Error: can't include a backslash character, inside
800 parens or strings or not. */
801 RAISE_SYNTAX_ERROR(
802 "f-string expression part "
803 "cannot include a backslash");
804 goto error;
805 }
806 if (quote_char) {
807 /* We're inside a string. See if we're at the end. */
808 /* This code needs to implement the same non-error logic
809 as tok_get from tokenizer.c, at the letter_quote
810 label. To actually share that code would be a
811 nightmare. But, it's unlikely to change and is small,
812 so duplicate it here. Note we don't need to catch all
813 of the errors, since they'll be caught when parsing the
814 expression. We just need to match the non-error
815 cases. Thus we can ignore \n in single-quoted strings,
816 for example. Or non-terminated strings. */
817 if (ch == quote_char) {
818 /* Does this match the string_type (single or triple
819 quoted)? */
820 if (string_type == 3) {
821 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
822 /* We're at the end of a triple quoted string. */
823 *str += 2;
824 string_type = 0;
825 quote_char = 0;
826 continue;
827 }
828 } else {
829 /* We're at the end of a normal string. */
830 quote_char = 0;
831 string_type = 0;
832 continue;
833 }
834 }
835 } else if (ch == '\'' || ch == '"') {
836 /* Is this a triple quoted string? */
837 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
838 string_type = 3;
839 *str += 2;
840 } else {
841 /* Start of a normal string. */
842 string_type = 1;
843 }
844 /* Start looking for the end of the string. */
845 quote_char = ch;
846 } else if (ch == '[' || ch == '{' || ch == '(') {
847 if (nested_depth >= MAXLEVEL) {
848 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
849 goto error;
850 }
851 parenstack[nested_depth] = ch;
852 nested_depth++;
853 } else if (ch == '#') {
854 /* Error: can't include a comment character, inside parens
855 or not. */
856 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
857 goto error;
858 } else if (nested_depth == 0 &&
859 (ch == '!' || ch == ':' || ch == '}' ||
860 ch == '=' || ch == '>' || ch == '<')) {
861 /* See if there's a next character. */
862 if (*str+1 < end) {
863 char next = *(*str+1);
864
865 /* For "!=". since '=' is not an allowed conversion character,
866 nothing is lost in this test. */
867 if ((ch == '!' && next == '=') || /* != */
868 (ch == '=' && next == '=') || /* == */
869 (ch == '<' && next == '=') || /* <= */
870 (ch == '>' && next == '=') /* >= */
871 ) {
872 *str += 1;
873 continue;
874 }
875 /* Don't get out of the loop for these, if they're single
876 chars (not part of 2-char tokens). If by themselves, they
877 don't end an expression (unlike say '!'). */
878 if (ch == '>' || ch == '<') {
879 continue;
880 }
881 }
882
883 /* Normal way out of this loop. */
884 break;
885 } else if (ch == ']' || ch == '}' || ch == ')') {
886 if (!nested_depth) {
887 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
888 goto error;
889 }
890 nested_depth--;
891 int opening = parenstack[nested_depth];
892 if (!((opening == '(' && ch == ')') ||
893 (opening == '[' && ch == ']') ||
894 (opening == '{' && ch == '}')))
895 {
896 RAISE_SYNTAX_ERROR(
897 "f-string: closing parenthesis '%c' "
898 "does not match opening parenthesis '%c'",
899 ch, opening);
900 goto error;
901 }
902 } else {
903 /* Just consume this char and loop around. */
904 }
905 }
906 expr_end = *str;
907 /* If we leave this loop in a string or with mismatched parens, we
908 don't care. We'll get a syntax error when compiling the
909 expression. But, we can produce a better error message, so
910 let's just do that.*/
911 if (quote_char) {
912 RAISE_SYNTAX_ERROR("f-string: unterminated string");
913 goto error;
914 }
915 if (nested_depth) {
916 int opening = parenstack[nested_depth - 1];
917 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
918 goto error;
919 }
920
921 if (*str >= end)
922 goto unexpected_end_of_string;
923
924 /* Compile the expression as soon as possible, so we show errors
925 related to the expression before errors related to the
926 conversion or format_spec. */
927 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
928 if (!simple_expression)
929 goto error;
930
931 /* Check for =, which puts the text value of the expression in
932 expr_text. */
933 if (**str == '=') {
934 *str += 1;
935
936 /* Skip over ASCII whitespace. No need to test for end of string
937 here, since we know there's at least a trailing quote somewhere
938 ahead. */
939 while (Py_ISSPACE(**str)) {
940 *str += 1;
941 }
942
943 /* Set *expr_text to the text of the expression. */
944 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
945 if (!*expr_text) {
946 goto error;
947 }
948 }
949
950 /* Check for a conversion char, if present. */
951 if (**str == '!') {
952 *str += 1;
953 if (*str >= end)
954 goto unexpected_end_of_string;
955
956 conversion = **str;
957 *str += 1;
958
959 /* Validate the conversion. */
960 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
961 RAISE_SYNTAX_ERROR(
962 "f-string: invalid conversion character: "
963 "expected 's', 'r', or 'a'");
964 goto error;
965 }
966
967 }
968
969 /* Check for the format spec, if present. */
970 if (*str >= end)
971 goto unexpected_end_of_string;
972 if (**str == ':') {
973 *str += 1;
974 if (*str >= end)
975 goto unexpected_end_of_string;
976
977 /* Parse the format spec. */
978 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
979 first_token, t, last_token);
980 if (!format_spec)
981 goto error;
982 }
983
984 if (*str >= end || **str != '}')
985 goto unexpected_end_of_string;
986
987 /* We're at a right brace. Consume it. */
988 assert(*str < end);
989 assert(**str == '}');
990 *str += 1;
991
992 /* If we're in = mode (detected by non-NULL expr_text), and have no format
993 spec and no explicit conversion, set the conversion to 'r'. */
994 if (*expr_text && format_spec == NULL && conversion == -1) {
995 conversion = 'r';
996 }
997
998 /* And now create the FormattedValue node that represents this
999 entire expression with the conversion and format spec. */
1000 //TODO: Fix this
1001 *expression = FormattedValue(simple_expression, conversion,
1002 format_spec, first_token->lineno,
1003 first_token->col_offset, last_token->end_lineno,
1004 last_token->end_col_offset, p->arena);
1005 if (!*expression)
1006 goto error;
1007
1008 return 0;
1009
1010unexpected_end_of_string:
1011 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1012 /* Falls through to error. */
1013
1014error:
1015 Py_XDECREF(*expr_text);
1016 return -1;
1017
1018}
1019
1020/* Return -1 on error.
1021
1022 Return 0 if we have a literal (possible zero length) and an
1023 expression (zero length if at the end of the string.
1024
1025 Return 1 if we have a literal, but no expression, and we want the
1026 caller to call us again. This is used to deal with doubled
1027 braces.
1028
1029 When called multiple times on the string 'a{{b{0}c', this function
1030 will return:
1031
1032 1. the literal 'a{' with no expression, and a return value
1033 of 1. Despite the fact that there's no expression, the return
1034 value of 1 means we're not finished yet.
1035
1036 2. the literal 'b' and the expression '0', with a return value of
1037 0. The fact that there's an expression means we're not finished.
1038
1039 3. literal 'c' with no expression and a return value of 0. The
1040 combination of the return value of 0 with no expression means
1041 we're finished.
1042*/
1043static int
1044fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
1045 int recurse_lvl, PyObject **literal,
1046 PyObject **expr_text, expr_ty *expression,
1047 Token *first_token, Token *t, Token *last_token)
1048{
1049 int result;
1050
1051 assert(*literal == NULL && *expression == NULL);
1052
1053 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +03001054 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001055 if (result < 0)
1056 goto error;
1057
1058 assert(result == 0 || result == 1);
1059
1060 if (result == 1)
1061 /* We have a literal, but don't look at the expression. */
1062 return 1;
1063
1064 if (*str >= end || **str == '}')
1065 /* We're at the end of the string or the end of a nested
1066 f-string: no expression. The top-level error case where we
1067 expect to be at the end of the string but we're at a '}' is
1068 handled later. */
1069 return 0;
1070
1071 /* We must now be the start of an expression, on a '{'. */
1072 assert(**str == '{');
1073
1074 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
1075 expression, first_token, t, last_token) < 0)
1076 goto error;
1077
1078 return 0;
1079
1080error:
1081 Py_CLEAR(*literal);
1082 return -1;
1083}
1084
1085#ifdef NDEBUG
1086#define ExprList_check_invariants(l)
1087#else
1088static void
1089ExprList_check_invariants(ExprList *l)
1090{
1091 /* Check our invariants. Make sure this object is "live", and
1092 hasn't been deallocated. */
1093 assert(l->size >= 0);
1094 assert(l->p != NULL);
1095 if (l->size <= EXPRLIST_N_CACHED)
1096 assert(l->data == l->p);
1097}
1098#endif
1099
1100static void
1101ExprList_Init(ExprList *l)
1102{
1103 l->allocated = EXPRLIST_N_CACHED;
1104 l->size = 0;
1105
1106 /* Until we start allocating dynamically, p points to data. */
1107 l->p = l->data;
1108
1109 ExprList_check_invariants(l);
1110}
1111
1112static int
1113ExprList_Append(ExprList *l, expr_ty exp)
1114{
1115 ExprList_check_invariants(l);
1116 if (l->size >= l->allocated) {
1117 /* We need to alloc (or realloc) the memory. */
1118 Py_ssize_t new_size = l->allocated * 2;
1119
1120 /* See if we've ever allocated anything dynamically. */
1121 if (l->p == l->data) {
1122 Py_ssize_t i;
1123 /* We're still using the cached data. Switch to
1124 alloc-ing. */
1125 l->p = PyMem_RawMalloc(sizeof(expr_ty) * new_size);
1126 if (!l->p)
1127 return -1;
1128 /* Copy the cached data into the new buffer. */
1129 for (i = 0; i < l->size; i++)
1130 l->p[i] = l->data[i];
1131 } else {
1132 /* Just realloc. */
1133 expr_ty *tmp = PyMem_RawRealloc(l->p, sizeof(expr_ty) * new_size);
1134 if (!tmp) {
1135 PyMem_RawFree(l->p);
1136 l->p = NULL;
1137 return -1;
1138 }
1139 l->p = tmp;
1140 }
1141
1142 l->allocated = new_size;
1143 assert(l->allocated == 2 * l->size);
1144 }
1145
1146 l->p[l->size++] = exp;
1147
1148 ExprList_check_invariants(l);
1149 return 0;
1150}
1151
1152static void
1153ExprList_Dealloc(ExprList *l)
1154{
1155 ExprList_check_invariants(l);
1156
1157 /* If there's been an error, or we've never dynamically allocated,
1158 do nothing. */
1159 if (!l->p || l->p == l->data) {
1160 /* Do nothing. */
1161 } else {
1162 /* We have dynamically allocated. Free the memory. */
1163 PyMem_RawFree(l->p);
1164 }
1165 l->p = NULL;
1166 l->size = -1;
1167}
1168
1169static asdl_seq *
1170ExprList_Finish(ExprList *l, PyArena *arena)
1171{
1172 asdl_seq *seq;
1173
1174 ExprList_check_invariants(l);
1175
1176 /* Allocate the asdl_seq and copy the expressions in to it. */
1177 seq = _Py_asdl_seq_new(l->size, arena);
1178 if (seq) {
1179 Py_ssize_t i;
1180 for (i = 0; i < l->size; i++)
1181 asdl_seq_SET(seq, i, l->p[i]);
1182 }
1183 ExprList_Dealloc(l);
1184 return seq;
1185}
1186
1187#ifdef NDEBUG
1188#define FstringParser_check_invariants(state)
1189#else
1190static void
1191FstringParser_check_invariants(FstringParser *state)
1192{
1193 if (state->last_str)
1194 assert(PyUnicode_CheckExact(state->last_str));
1195 ExprList_check_invariants(&state->expr_list);
1196}
1197#endif
1198
1199void
1200_PyPegen_FstringParser_Init(FstringParser *state)
1201{
1202 state->last_str = NULL;
1203 state->fmode = 0;
1204 ExprList_Init(&state->expr_list);
1205 FstringParser_check_invariants(state);
1206}
1207
1208void
1209_PyPegen_FstringParser_Dealloc(FstringParser *state)
1210{
1211 FstringParser_check_invariants(state);
1212
1213 Py_XDECREF(state->last_str);
1214 ExprList_Dealloc(&state->expr_list);
1215}
1216
1217/* Make a Constant node, but decref the PyUnicode object being added. */
1218static expr_ty
1219make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1220{
1221 PyObject *s = *str;
1222 PyObject *kind = NULL;
1223 *str = NULL;
1224 assert(PyUnicode_CheckExact(s));
1225 if (PyArena_AddPyObject(p->arena, s) < 0) {
1226 Py_DECREF(s);
1227 return NULL;
1228 }
1229 const char* the_str = PyBytes_AsString(first_token->bytes);
1230 if (the_str && the_str[0] == 'u') {
1231 kind = _PyPegen_new_identifier(p, "u");
1232 }
1233
1234 if (kind == NULL && PyErr_Occurred()) {
1235 return NULL;
1236 }
1237
1238 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1239 last_token->end_lineno, last_token->end_col_offset, p->arena);
1240
1241}
1242
1243
1244/* Add a non-f-string (that is, a regular literal string). str is
1245 decref'd. */
1246int
1247_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1248{
1249 FstringParser_check_invariants(state);
1250
1251 assert(PyUnicode_CheckExact(str));
1252
1253 if (PyUnicode_GET_LENGTH(str) == 0) {
1254 Py_DECREF(str);
1255 return 0;
1256 }
1257
1258 if (!state->last_str) {
1259 /* We didn't have a string before, so just remember this one. */
1260 state->last_str = str;
1261 } else {
1262 /* Concatenate this with the previous string. */
1263 PyUnicode_AppendAndDel(&state->last_str, str);
1264 if (!state->last_str)
1265 return -1;
1266 }
1267 FstringParser_check_invariants(state);
1268 return 0;
1269}
1270
1271/* Parse an f-string. The f-string is in *str to end, with no
1272 'f' or quotes. */
1273int
1274_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1275 const char *end, int raw, int recurse_lvl,
1276 Token *first_token, Token* t, Token *last_token)
1277{
1278 FstringParser_check_invariants(state);
1279 state->fmode = 1;
1280
1281 /* Parse the f-string. */
1282 while (1) {
1283 PyObject *literal = NULL;
1284 PyObject *expr_text = NULL;
1285 expr_ty expression = NULL;
1286
1287 /* If there's a zero length literal in front of the
1288 expression, literal will be NULL. If we're at the end of
1289 the f-string, expression will be NULL (unless result == 1,
1290 see below). */
1291 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1292 &literal, &expr_text,
1293 &expression, first_token, t, last_token);
1294 if (result < 0)
1295 return -1;
1296
1297 /* Add the literal, if any. */
1298 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1299 Py_XDECREF(expr_text);
1300 return -1;
1301 }
1302 /* Add the expr_text, if any. */
1303 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1304 return -1;
1305 }
1306
1307 /* We've dealt with the literal and expr_text, their ownership has
1308 been transferred to the state object. Don't look at them again. */
1309
1310 /* See if we should just loop around to get the next literal
1311 and expression, while ignoring the expression this
1312 time. This is used for un-doubling braces, as an
1313 optimization. */
1314 if (result == 1)
1315 continue;
1316
1317 if (!expression)
1318 /* We're done with this f-string. */
1319 break;
1320
1321 /* We know we have an expression. Convert any existing string
1322 to a Constant node. */
1323 if (!state->last_str) {
1324 /* Do nothing. No previous literal. */
1325 } else {
1326 /* Convert the existing last_str literal to a Constant node. */
1327 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1328 if (!str || ExprList_Append(&state->expr_list, str) < 0)
1329 return -1;
1330 }
1331
1332 if (ExprList_Append(&state->expr_list, expression) < 0)
1333 return -1;
1334 }
1335
1336 /* If recurse_lvl is zero, then we must be at the end of the
1337 string. Otherwise, we must be at a right brace. */
1338
1339 if (recurse_lvl == 0 && *str < end-1) {
1340 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1341 return -1;
1342 }
1343 if (recurse_lvl != 0 && **str != '}') {
1344 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1345 return -1;
1346 }
1347
1348 FstringParser_check_invariants(state);
1349 return 0;
1350}
1351
1352/* Convert the partial state reflected in last_str and expr_list to an
1353 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1354expr_ty
1355_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1356 Token *last_token)
1357{
1358 asdl_seq *seq;
1359
1360 FstringParser_check_invariants(state);
1361
1362 /* If we're just a constant string with no expressions, return
1363 that. */
1364 if (!state->fmode) {
1365 assert(!state->expr_list.size);
1366 if (!state->last_str) {
1367 /* Create a zero length string. */
1368 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1369 if (!state->last_str)
1370 goto error;
1371 }
1372 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1373 }
1374
1375 /* Create a Constant node out of last_str, if needed. It will be the
1376 last node in our expression list. */
1377 if (state->last_str) {
1378 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1379 if (!str || ExprList_Append(&state->expr_list, str) < 0)
1380 goto error;
1381 }
1382 /* This has already been freed. */
1383 assert(state->last_str == NULL);
1384
1385 seq = ExprList_Finish(&state->expr_list, p->arena);
1386 if (!seq)
1387 goto error;
1388
1389 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1390 last_token->end_lineno, last_token->end_col_offset, p->arena);
1391
1392error:
1393 _PyPegen_FstringParser_Dealloc(state);
1394 return NULL;
1395}
1396
1397/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1398 at end, parse it into an expr_ty. Return NULL on error. Adjust
1399 str to point past the parsed portion. */
1400static expr_ty
1401fstring_parse(Parser *p, const char **str, const char *end, int raw,
1402 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1403{
1404 FstringParser state;
1405
1406 _PyPegen_FstringParser_Init(&state);
1407 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1408 first_token, t, last_token) < 0) {
1409 _PyPegen_FstringParser_Dealloc(&state);
1410 return NULL;
1411 }
1412
1413 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1414}