blob: a0ec698fa56a24d5d24f7fb40b321f6108f83ada [file] [log] [blame]
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001#include <Python.h>
2
3#include "../tokenizer.h"
4#include "pegen.h"
5#include "parse_string.h"
6
7//// STRING HANDLING FUNCTIONS ////
8
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser nodes
// to pass around, and the usage of some specialized APIs present only in this
// file (like "_PyPegen_raise_syntax_error").
13
14static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030015warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010016{
17 PyObject *msg =
18 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
19 if (msg == NULL) {
20 return -1;
21 }
22 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030023 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010024 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
25 /* Replace the DeprecationWarning exception with a SyntaxError
26 to get a more accurate error report */
27 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030028
29 /* This is needed, in order for the SyntaxError to point to the token t,
30 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
31 error location, if p->known_err_token is not set. */
32 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010033 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
34 }
35 Py_DECREF(msg);
36 return -1;
37 }
38 Py_DECREF(msg);
39 return 0;
40}
41
42static PyObject *
43decode_utf8(const char **sPtr, const char *end)
44{
45 const char *s, *t;
46 t = s = *sPtr;
47 while (s < end && (*s & 0x80)) {
48 s++;
49 }
50 *sPtr = s;
51 return PyUnicode_DecodeUTF8(t, s - t, NULL);
52}
53
54static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030055decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010056{
57 PyObject *v, *u;
58 char *buf;
59 char *p;
60 const char *end;
61
62 /* check for integer overflow */
63 if (len > SIZE_MAX / 6) {
64 return NULL;
65 }
66 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
67 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
68 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
69 if (u == NULL) {
70 return NULL;
71 }
72 p = buf = PyBytes_AsString(u);
73 end = s + len;
74 while (s < end) {
75 if (*s == '\\') {
76 *p++ = *s++;
77 if (s >= end || *s & 0x80) {
78 strcpy(p, "u005c");
79 p += 5;
80 if (s >= end) {
81 break;
82 }
83 }
84 }
85 if (*s & 0x80) {
86 PyObject *w;
87 int kind;
88 void *data;
89 Py_ssize_t len, i;
90 w = decode_utf8(&s, end);
91 if (w == NULL) {
92 Py_DECREF(u);
93 return NULL;
94 }
95 kind = PyUnicode_KIND(w);
96 data = PyUnicode_DATA(w);
97 len = PyUnicode_GET_LENGTH(w);
98 for (i = 0; i < len; i++) {
99 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
100 sprintf(p, "\\U%08x", chr);
101 p += 10;
102 }
103 /* Should be impossible to overflow */
104 assert(p - buf <= PyBytes_GET_SIZE(u));
105 Py_DECREF(w);
106 }
107 else {
108 *p++ = *s++;
109 }
110 }
111 len = p - buf;
112 s = buf;
113
114 const char *first_invalid_escape;
115 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
116
117 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300118 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100119 /* We have not decref u before because first_invalid_escape points
120 inside u. */
121 Py_XDECREF(u);
122 Py_DECREF(v);
123 return NULL;
124 }
125 }
126 Py_XDECREF(u);
127 return v;
128}
129
130static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300131decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100132{
133 const char *first_invalid_escape;
134 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
135 if (result == NULL) {
136 return NULL;
137 }
138
139 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300140 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100141 Py_DECREF(result);
142 return NULL;
143 }
144 }
145 return result;
146}
147
148/* s must include the bracketing quote characters, and r, b, u,
149 &/or f prefixes (if any), and embedded escape sequences (if any).
150 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
151 If the string is an f-string, set *fstr and *fstrlen to the unparsed
152 string object. Return 0 if no errors occurred. */
153int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300154_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
155 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100156{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157 const char *s = PyBytes_AsString(t->bytes);
158 if (s == NULL) {
159 return -1;
160 }
161
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100162 size_t len;
163 int quote = Py_CHARMASK(*s);
164 int fmode = 0;
165 *bytesmode = 0;
166 *rawmode = 0;
167 *result = NULL;
168 *fstr = NULL;
169 if (Py_ISALPHA(quote)) {
170 while (!*bytesmode || !*rawmode) {
171 if (quote == 'b' || quote == 'B') {
172 quote = *++s;
173 *bytesmode = 1;
174 }
175 else if (quote == 'u' || quote == 'U') {
176 quote = *++s;
177 }
178 else if (quote == 'r' || quote == 'R') {
179 quote = *++s;
180 *rawmode = 1;
181 }
182 else if (quote == 'f' || quote == 'F') {
183 quote = *++s;
184 fmode = 1;
185 }
186 else {
187 break;
188 }
189 }
190 }
191
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300192 /* fstrings are only allowed in Python 3.6 and greater */
193 if (fmode && p->feature_version < 6) {
194 p->error_indicator = 1;
195 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
196 return -1;
197 }
198
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100199 if (fmode && *bytesmode) {
200 PyErr_BadInternalCall();
201 return -1;
202 }
203 if (quote != '\'' && quote != '\"') {
204 PyErr_BadInternalCall();
205 return -1;
206 }
207 /* Skip the leading quote char. */
208 s++;
209 len = strlen(s);
210 if (len > INT_MAX) {
211 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
212 return -1;
213 }
214 if (s[--len] != quote) {
215 /* Last quote char must match the first. */
216 PyErr_BadInternalCall();
217 return -1;
218 }
219 if (len >= 4 && s[0] == quote && s[1] == quote) {
220 /* A triple quoted string. We've already skipped one quote at
221 the start and one at the end of the string. Now skip the
222 two at the start. */
223 s += 2;
224 len -= 2;
225 /* And check that the last two match. */
226 if (s[--len] != quote || s[--len] != quote) {
227 PyErr_BadInternalCall();
228 return -1;
229 }
230 }
231
232 if (fmode) {
233 /* Just return the bytes. The caller will parse the resulting
234 string. */
235 *fstr = s;
236 *fstrlen = len;
237 return 0;
238 }
239
240 /* Not an f-string. */
241 /* Avoid invoking escape decoding routines if possible. */
242 *rawmode = *rawmode || strchr(s, '\\') == NULL;
243 if (*bytesmode) {
244 /* Disallow non-ASCII characters. */
245 const char *ch;
246 for (ch = s; *ch; ch++) {
247 if (Py_CHARMASK(*ch) >= 0x80) {
248 RAISE_SYNTAX_ERROR(
249 "bytes can only contain ASCII "
250 "literal characters.");
251 return -1;
252 }
253 }
254 if (*rawmode) {
255 *result = PyBytes_FromStringAndSize(s, len);
256 }
257 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300258 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100259 }
260 }
261 else {
262 if (*rawmode) {
263 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
264 }
265 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300266 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100267 }
268 }
269 return *result == NULL ? -1 : 0;
270}
271
272
273
274// FSTRING STUFF
275
276static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset);
277static void fstring_shift_argument(expr_ty parent, arg_ty args, int lineno, int col_offset);
278
279
280static inline void shift_expr(expr_ty parent, expr_ty n, int line, int col) {
281 if (parent->lineno < n->lineno) {
282 col = 0;
283 }
284 fstring_shift_expr_locations(n, line, col);
285}
286
287static inline void shift_arg(expr_ty parent, arg_ty n, int line, int col) {
288 if (parent->lineno < n->lineno) {
289 col = 0;
290 }
291 fstring_shift_argument(parent, n, line, col);
292}
293
294static void fstring_shift_seq_locations(expr_ty parent, asdl_seq *seq, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100295 for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100296 expr_ty expr = asdl_seq_GET(seq, i);
297 if (expr == NULL){
298 continue;
299 }
300 shift_expr(parent, expr, lineno, col_offset);
301 }
302}
303
304static void fstring_shift_slice_locations(expr_ty parent, expr_ty slice, int lineno, int col_offset) {
305 switch (slice->kind) {
306 case Slice_kind:
307 if (slice->v.Slice.lower) {
308 shift_expr(parent, slice->v.Slice.lower, lineno, col_offset);
309 }
310 if (slice->v.Slice.upper) {
311 shift_expr(parent, slice->v.Slice.upper, lineno, col_offset);
312 }
313 if (slice->v.Slice.step) {
314 shift_expr(parent, slice->v.Slice.step, lineno, col_offset);
315 }
316 break;
317 case Tuple_kind:
318 fstring_shift_seq_locations(parent, slice->v.Tuple.elts, lineno, col_offset);
319 break;
320 default:
321 break;
322 }
323}
324
325static void fstring_shift_comprehension(expr_ty parent, comprehension_ty comp, int lineno, int col_offset) {
326 shift_expr(parent, comp->target, lineno, col_offset);
327 shift_expr(parent, comp->iter, lineno, col_offset);
328 fstring_shift_seq_locations(parent, comp->ifs, lineno, col_offset);
329}
330
331static void fstring_shift_argument(expr_ty parent, arg_ty arg, int lineno, int col_offset) {
332 if (arg->annotation != NULL){
333 shift_expr(parent, arg->annotation, lineno, col_offset);
334 }
335 arg->col_offset = arg->col_offset + col_offset;
336 arg->end_col_offset = arg->end_col_offset + col_offset;
337 arg->lineno = arg->lineno + lineno;
338 arg->end_lineno = arg->end_lineno + lineno;
339}
340
341static void fstring_shift_arguments(expr_ty parent, arguments_ty args, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100342 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->posonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100343 arg_ty arg = asdl_seq_GET(args->posonlyargs, i);
344 shift_arg(parent, arg, lineno, col_offset);
345 }
346
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100347 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->args); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100348 arg_ty arg = asdl_seq_GET(args->args, i);
349 shift_arg(parent, arg, lineno, col_offset);
350 }
351
352 if (args->vararg != NULL) {
353 shift_arg(parent, args->vararg, lineno, col_offset);
354 }
355
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100356 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->kwonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100357 arg_ty arg = asdl_seq_GET(args->kwonlyargs, i);
358 shift_arg(parent, arg, lineno, col_offset);
359 }
360
361 fstring_shift_seq_locations(parent, args->kw_defaults, lineno, col_offset);
362
363 if (args->kwarg != NULL) {
364 shift_arg(parent, args->kwarg, lineno, col_offset);
365 }
366
367 fstring_shift_seq_locations(parent, args->defaults, lineno, col_offset);
368}
369
370static void fstring_shift_children_locations(expr_ty n, int lineno, int col_offset) {
371 switch (n->kind) {
372 case BoolOp_kind:
373 fstring_shift_seq_locations(n, n->v.BoolOp.values, lineno, col_offset);
374 break;
375 case NamedExpr_kind:
376 shift_expr(n, n->v.NamedExpr.target, lineno, col_offset);
377 shift_expr(n, n->v.NamedExpr.value, lineno, col_offset);
378 break;
379 case BinOp_kind:
380 shift_expr(n, n->v.BinOp.left, lineno, col_offset);
381 shift_expr(n, n->v.BinOp.right, lineno, col_offset);
382 break;
383 case UnaryOp_kind:
384 shift_expr(n, n->v.UnaryOp.operand, lineno, col_offset);
385 break;
386 case Lambda_kind:
387 fstring_shift_arguments(n, n->v.Lambda.args, lineno, col_offset);
388 shift_expr(n, n->v.Lambda.body, lineno, col_offset);
389 break;
390 case IfExp_kind:
391 shift_expr(n, n->v.IfExp.test, lineno, col_offset);
392 shift_expr(n, n->v.IfExp.body, lineno, col_offset);
393 shift_expr(n, n->v.IfExp.orelse, lineno, col_offset);
394 break;
395 case Dict_kind:
396 fstring_shift_seq_locations(n, n->v.Dict.keys, lineno, col_offset);
397 fstring_shift_seq_locations(n, n->v.Dict.values, lineno, col_offset);
398 break;
399 case Set_kind:
400 fstring_shift_seq_locations(n, n->v.Set.elts, lineno, col_offset);
401 break;
402 case ListComp_kind:
403 shift_expr(n, n->v.ListComp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100404 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.ListComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100405 comprehension_ty comp = asdl_seq_GET(n->v.ListComp.generators, i);
406 fstring_shift_comprehension(n, comp, lineno, col_offset);
407 }
408 break;
409 case SetComp_kind:
410 shift_expr(n, n->v.SetComp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100411 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.SetComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100412 comprehension_ty comp = asdl_seq_GET(n->v.SetComp.generators, i);
413 fstring_shift_comprehension(n, comp, lineno, col_offset);
414 }
415 break;
416 case DictComp_kind:
417 shift_expr(n, n->v.DictComp.key, lineno, col_offset);
418 shift_expr(n, n->v.DictComp.value, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100419 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.DictComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100420 comprehension_ty comp = asdl_seq_GET(n->v.DictComp.generators, i);
421 fstring_shift_comprehension(n, comp, lineno, col_offset);
422 }
423 break;
424 case GeneratorExp_kind:
425 shift_expr(n, n->v.GeneratorExp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100426 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.GeneratorExp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100427 comprehension_ty comp = asdl_seq_GET(n->v.GeneratorExp.generators, i);
428 fstring_shift_comprehension(n, comp, lineno, col_offset);
429 }
430 break;
431 case Await_kind:
432 shift_expr(n, n->v.Await.value, lineno, col_offset);
433 break;
434 case Yield_kind:
435 shift_expr(n, n->v.Yield.value, lineno, col_offset);
436 break;
437 case YieldFrom_kind:
438 shift_expr(n, n->v.YieldFrom.value, lineno, col_offset);
439 break;
440 case Compare_kind:
441 shift_expr(n, n->v.Compare.left, lineno, col_offset);
442 fstring_shift_seq_locations(n, n->v.Compare.comparators, lineno, col_offset);
443 break;
444 case Call_kind:
445 shift_expr(n, n->v.Call.func, lineno, col_offset);
446 fstring_shift_seq_locations(n, n->v.Call.args, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100447 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.Call.keywords); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100448 keyword_ty keyword = asdl_seq_GET(n->v.Call.keywords, i);
449 shift_expr(n, keyword->value, lineno, col_offset);
450 }
451 break;
452 case Attribute_kind:
453 shift_expr(n, n->v.Attribute.value, lineno, col_offset);
454 break;
455 case Subscript_kind:
456 shift_expr(n, n->v.Subscript.value, lineno, col_offset);
457 fstring_shift_slice_locations(n, n->v.Subscript.slice, lineno, col_offset);
458 shift_expr(n, n->v.Subscript.slice, lineno, col_offset);
459 break;
460 case Starred_kind:
461 shift_expr(n, n->v.Starred.value, lineno, col_offset);
462 break;
463 case List_kind:
464 fstring_shift_seq_locations(n, n->v.List.elts, lineno, col_offset);
465 break;
466 case Tuple_kind:
467 fstring_shift_seq_locations(n, n->v.Tuple.elts, lineno, col_offset);
468 break;
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300469 case JoinedStr_kind:
470 fstring_shift_seq_locations(n, n->v.JoinedStr.values, lineno, col_offset);
471 break;
472 case FormattedValue_kind:
473 shift_expr(n, n->v.FormattedValue.value, lineno, col_offset);
474 if (n->v.FormattedValue.format_spec) {
475 shift_expr(n, n->v.FormattedValue.format_spec, lineno, col_offset);
476 }
477 break;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100478 default:
479 return;
480 }
481}
482
483/* Shift locations for the given node and all its children by adding `lineno`
484 and `col_offset` to existing locations. Note that n is the already parsed
485 expression. */
486static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset)
487{
488 n->col_offset = n->col_offset + col_offset;
489
490 // The following is needed, in order for nodes spanning across multiple lines
491 // to be shifted correctly. An example of such a node is a Call node, the closing
492 // parenthesis of which is not on the same line as its name.
493 if (n->lineno == n->end_lineno) {
494 n->end_col_offset = n->end_col_offset + col_offset;
495 }
496
497 fstring_shift_children_locations(n, lineno, col_offset);
498 n->lineno = n->lineno + lineno;
499 n->end_lineno = n->end_lineno + lineno;
500}
501
502/* Fix locations for the given node and its children.
503
504 `parent` is the enclosing node.
505 `n` is the node which locations are going to be fixed relative to parent.
506 `expr_str` is the child node's string representation, including braces.
507*/
508static void
509fstring_fix_expr_location(Token *parent, expr_ty n, char *expr_str)
510{
511 char *substr = NULL;
512 char *start;
513 int lines = 0;
514 int cols = 0;
515
516 if (parent && parent->bytes) {
517 char *parent_str = PyBytes_AsString(parent->bytes);
518 if (!parent_str) {
519 return;
520 }
521 substr = strstr(parent_str, expr_str);
522 if (substr) {
523 // The following is needed, in order to correctly shift the column
524 // offset, in the case that (disregarding any whitespace) a newline
525 // immediately follows the opening curly brace of the fstring expression.
526 int newline_after_brace = 1;
527 start = substr + 1;
528 while (start && *start != '}' && *start != '\n') {
529 if (*start != ' ' && *start != '\t' && *start != '\f') {
530 newline_after_brace = 0;
531 break;
532 }
533 start++;
534 }
535
536 // Account for the characters from the last newline character to our
537 // left until the beginning of substr.
538 if (!newline_after_brace) {
539 start = substr;
540 while (start > parent_str && *start != '\n') {
541 start--;
542 }
543 cols += (int)(substr - start);
544 }
545 /* adjust the start based on the number of newlines encountered
546 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100547 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100548 if (*p == '\n') {
549 lines++;
550 }
551 }
552 }
553 }
554 fstring_shift_expr_locations(n, lines, cols);
555}
556
557
558/* Compile this expression in to an expr_ty. Add parens around the
559 expression, in order to allow leading spaces in the expression. */
560static expr_ty
561fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
562 Token *t)
563{
564 expr_ty expr = NULL;
565 char *str;
566 Py_ssize_t len;
567 const char *s;
568 expr_ty result = NULL;
569
570 assert(expr_end >= expr_start);
571 assert(*(expr_start-1) == '{');
572 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
573 *expr_end == '=');
574
575 /* If the substring is all whitespace, it's an error. We need to catch this
576 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
577 because turning the expression '' in to '()' would go from being invalid
578 to valid. */
579 for (s = expr_start; s != expr_end; s++) {
580 char c = *s;
581 /* The Python parser ignores only the following whitespace
582 characters (\r already is converted to \n). */
583 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
584 break;
585 }
586 }
587 if (s == expr_end) {
588 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
589 return NULL;
590 }
591
592 len = expr_end - expr_start;
593 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
594 str = PyMem_RawMalloc(len + 3);
595 if (str == NULL) {
596 PyErr_NoMemory();
597 return NULL;
598 }
599
600 str[0] = '(';
601 memcpy(str+1, expr_start, len);
602 str[len+1] = ')';
603 str[len+2] = 0;
604
605 struct tok_state* tok = PyTokenizer_FromString(str, 1);
606 if (tok == NULL) {
607 return NULL;
608 }
Lysandros Nikolaou791a46e2020-05-26 04:24:31 +0300609 Py_INCREF(p->tok->filename);
610 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100611
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300612 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
613 NULL, p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100614 p2->starting_lineno = p->starting_lineno + p->tok->first_lineno - 1;
615 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno
616 ? p->starting_col_offset + t->col_offset : 0;
617
618 expr = _PyPegen_run_parser(p2);
619
620 if (expr == NULL) {
621 goto exit;
622 }
623
624 /* Reuse str to find the correct column offset. */
625 str[0] = '{';
626 str[len+1] = '}';
627 fstring_fix_expr_location(t, expr, str);
628
629 result = expr;
630
631exit:
632 _PyPegen_Parser_Free(p2);
633 PyTokenizer_Free(tok);
634 return result;
635}
636
637/* Return -1 on error.
638
639 Return 0 if we reached the end of the literal.
640
641 Return 1 if we haven't reached the end of the literal, but we want
642 the caller to process the literal up to this point. Used for
643 doubled braces.
644*/
645static int
646fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300647 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100648{
649 /* Get any literal string. It ends when we hit an un-doubled left
650 brace (which isn't part of a unicode name escape such as
651 "\N{EULER CONSTANT}"), or the end of the string. */
652
653 const char *s = *str;
654 const char *literal_start = s;
655 int result = 0;
656
657 assert(*literal == NULL);
658 while (s < end) {
659 char ch = *s++;
660 if (!raw && ch == '\\' && s < end) {
661 ch = *s++;
662 if (ch == 'N') {
663 if (s < end && *s++ == '{') {
664 while (s < end && *s++ != '}') {
665 }
666 continue;
667 }
668 break;
669 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300670 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100671 return -1;
672 }
673 }
674 if (ch == '{' || ch == '}') {
675 /* Check for doubled braces, but only at the top level. If
676 we checked at every level, then f'{0:{3}}' would fail
677 with the two closing braces. */
678 if (recurse_lvl == 0) {
679 if (s < end && *s == ch) {
680 /* We're going to tell the caller that the literal ends
681 here, but that they should continue scanning. But also
682 skip over the second brace when we resume scanning. */
683 *str = s + 1;
684 result = 1;
685 goto done;
686 }
687
688 /* Where a single '{' is the start of a new expression, a
689 single '}' is not allowed. */
690 if (ch == '}') {
691 *str = s - 1;
692 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
693 return -1;
694 }
695 }
696 /* We're either at a '{', which means we're starting another
697 expression; or a '}', which means we're at the end of this
698 f-string (for a nested format_spec). */
699 s--;
700 break;
701 }
702 }
703 *str = s;
704 assert(s <= end);
705 assert(s == end || *s == '{' || *s == '}');
706done:
707 if (literal_start != s) {
708 if (raw)
709 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
710 s - literal_start,
711 NULL, NULL);
712 else
713 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300714 s - literal_start, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100715 if (!*literal)
716 return -1;
717 }
718 return result;
719}
720
721/* Forward declaration because parsing is recursive. */
722static expr_ty
723fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
724 Token *first_token, Token* t, Token *last_token);
725
726/* Parse the f-string at *str, ending at end. We know *str starts an
727 expression (so it must be a '{'). Returns the FormattedValue node, which
728 includes the expression, conversion character, format_spec expression, and
729 optionally the text of the expression (if = is used).
730
731 Note that I don't do a perfect job here: I don't make sure that a
732 closing brace doesn't match an opening paren, for example. It
733 doesn't need to error on all invalid expressions, just correctly
734 find the end of all valid ones. Any errors inside the expression
735 will be caught when we parse it later.
736
737 *expression is set to the expression. For an '=' "debug" expression,
738 *expr_text is set to the debug text (the original text of the expression,
739 including the '=' and any whitespace around it, as a string object). If
740 not a debug expression, *expr_text set to NULL. */
741static int
742fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
743 PyObject **expr_text, expr_ty *expression, Token *first_token,
744 Token *t, Token *last_token)
745{
746 /* Return -1 on error, else 0. */
747
748 const char *expr_start;
749 const char *expr_end;
750 expr_ty simple_expression;
751 expr_ty format_spec = NULL; /* Optional format specifier. */
752 int conversion = -1; /* The conversion char. Use default if not
753 specified, or !r if using = and no format
754 spec. */
755
756 /* 0 if we're not in a string, else the quote char we're trying to
757 match (single or double quote). */
758 char quote_char = 0;
759
760 /* If we're inside a string, 1=normal, 3=triple-quoted. */
761 int string_type = 0;
762
763 /* Keep track of nesting level for braces/parens/brackets in
764 expressions. */
765 Py_ssize_t nested_depth = 0;
766 char parenstack[MAXLEVEL];
767
768 *expr_text = NULL;
769
770 /* Can only nest one level deep. */
771 if (recurse_lvl >= 2) {
772 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
773 goto error;
774 }
775
776 /* The first char must be a left brace, or we wouldn't have gotten
777 here. Skip over it. */
778 assert(**str == '{');
779 *str += 1;
780
781 expr_start = *str;
782 for (; *str < end; (*str)++) {
783 char ch;
784
785 /* Loop invariants. */
786 assert(nested_depth >= 0);
787 assert(*str >= expr_start && *str < end);
788 if (quote_char)
789 assert(string_type == 1 || string_type == 3);
790 else
791 assert(string_type == 0);
792
793 ch = **str;
794 /* Nowhere inside an expression is a backslash allowed. */
795 if (ch == '\\') {
796 /* Error: can't include a backslash character, inside
797 parens or strings or not. */
798 RAISE_SYNTAX_ERROR(
799 "f-string expression part "
800 "cannot include a backslash");
801 goto error;
802 }
803 if (quote_char) {
804 /* We're inside a string. See if we're at the end. */
805 /* This code needs to implement the same non-error logic
806 as tok_get from tokenizer.c, at the letter_quote
807 label. To actually share that code would be a
808 nightmare. But, it's unlikely to change and is small,
809 so duplicate it here. Note we don't need to catch all
810 of the errors, since they'll be caught when parsing the
811 expression. We just need to match the non-error
812 cases. Thus we can ignore \n in single-quoted strings,
813 for example. Or non-terminated strings. */
814 if (ch == quote_char) {
815 /* Does this match the string_type (single or triple
816 quoted)? */
817 if (string_type == 3) {
818 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
819 /* We're at the end of a triple quoted string. */
820 *str += 2;
821 string_type = 0;
822 quote_char = 0;
823 continue;
824 }
825 } else {
826 /* We're at the end of a normal string. */
827 quote_char = 0;
828 string_type = 0;
829 continue;
830 }
831 }
832 } else if (ch == '\'' || ch == '"') {
833 /* Is this a triple quoted string? */
834 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
835 string_type = 3;
836 *str += 2;
837 } else {
838 /* Start of a normal string. */
839 string_type = 1;
840 }
841 /* Start looking for the end of the string. */
842 quote_char = ch;
843 } else if (ch == '[' || ch == '{' || ch == '(') {
844 if (nested_depth >= MAXLEVEL) {
845 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
846 goto error;
847 }
848 parenstack[nested_depth] = ch;
849 nested_depth++;
850 } else if (ch == '#') {
851 /* Error: can't include a comment character, inside parens
852 or not. */
853 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
854 goto error;
855 } else if (nested_depth == 0 &&
856 (ch == '!' || ch == ':' || ch == '}' ||
857 ch == '=' || ch == '>' || ch == '<')) {
858 /* See if there's a next character. */
859 if (*str+1 < end) {
860 char next = *(*str+1);
861
862 /* For "!=". since '=' is not an allowed conversion character,
863 nothing is lost in this test. */
864 if ((ch == '!' && next == '=') || /* != */
865 (ch == '=' && next == '=') || /* == */
866 (ch == '<' && next == '=') || /* <= */
867 (ch == '>' && next == '=') /* >= */
868 ) {
869 *str += 1;
870 continue;
871 }
872 /* Don't get out of the loop for these, if they're single
873 chars (not part of 2-char tokens). If by themselves, they
874 don't end an expression (unlike say '!'). */
875 if (ch == '>' || ch == '<') {
876 continue;
877 }
878 }
879
880 /* Normal way out of this loop. */
881 break;
882 } else if (ch == ']' || ch == '}' || ch == ')') {
883 if (!nested_depth) {
884 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
885 goto error;
886 }
887 nested_depth--;
888 int opening = parenstack[nested_depth];
889 if (!((opening == '(' && ch == ')') ||
890 (opening == '[' && ch == ']') ||
891 (opening == '{' && ch == '}')))
892 {
893 RAISE_SYNTAX_ERROR(
894 "f-string: closing parenthesis '%c' "
895 "does not match opening parenthesis '%c'",
896 ch, opening);
897 goto error;
898 }
899 } else {
900 /* Just consume this char and loop around. */
901 }
902 }
903 expr_end = *str;
904 /* If we leave this loop in a string or with mismatched parens, we
905 don't care. We'll get a syntax error when compiling the
906 expression. But, we can produce a better error message, so
907 let's just do that.*/
908 if (quote_char) {
909 RAISE_SYNTAX_ERROR("f-string: unterminated string");
910 goto error;
911 }
912 if (nested_depth) {
913 int opening = parenstack[nested_depth - 1];
914 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
915 goto error;
916 }
917
918 if (*str >= end)
919 goto unexpected_end_of_string;
920
921 /* Compile the expression as soon as possible, so we show errors
922 related to the expression before errors related to the
923 conversion or format_spec. */
924 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
925 if (!simple_expression)
926 goto error;
927
928 /* Check for =, which puts the text value of the expression in
929 expr_text. */
930 if (**str == '=') {
931 *str += 1;
932
933 /* Skip over ASCII whitespace. No need to test for end of string
934 here, since we know there's at least a trailing quote somewhere
935 ahead. */
936 while (Py_ISSPACE(**str)) {
937 *str += 1;
938 }
939
940 /* Set *expr_text to the text of the expression. */
941 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
942 if (!*expr_text) {
943 goto error;
944 }
945 }
946
947 /* Check for a conversion char, if present. */
948 if (**str == '!') {
949 *str += 1;
950 if (*str >= end)
951 goto unexpected_end_of_string;
952
953 conversion = **str;
954 *str += 1;
955
956 /* Validate the conversion. */
957 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
958 RAISE_SYNTAX_ERROR(
959 "f-string: invalid conversion character: "
960 "expected 's', 'r', or 'a'");
961 goto error;
962 }
963
964 }
965
966 /* Check for the format spec, if present. */
967 if (*str >= end)
968 goto unexpected_end_of_string;
969 if (**str == ':') {
970 *str += 1;
971 if (*str >= end)
972 goto unexpected_end_of_string;
973
974 /* Parse the format spec. */
975 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
976 first_token, t, last_token);
977 if (!format_spec)
978 goto error;
979 }
980
981 if (*str >= end || **str != '}')
982 goto unexpected_end_of_string;
983
984 /* We're at a right brace. Consume it. */
985 assert(*str < end);
986 assert(**str == '}');
987 *str += 1;
988
989 /* If we're in = mode (detected by non-NULL expr_text), and have no format
990 spec and no explicit conversion, set the conversion to 'r'. */
991 if (*expr_text && format_spec == NULL && conversion == -1) {
992 conversion = 'r';
993 }
994
995 /* And now create the FormattedValue node that represents this
996 entire expression with the conversion and format spec. */
997 //TODO: Fix this
998 *expression = FormattedValue(simple_expression, conversion,
999 format_spec, first_token->lineno,
1000 first_token->col_offset, last_token->end_lineno,
1001 last_token->end_col_offset, p->arena);
1002 if (!*expression)
1003 goto error;
1004
1005 return 0;
1006
1007unexpected_end_of_string:
1008 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1009 /* Falls through to error. */
1010
1011error:
1012 Py_XDECREF(*expr_text);
1013 return -1;
1014
1015}
1016
1017/* Return -1 on error.
1018
1019 Return 0 if we have a literal (possible zero length) and an
1020 expression (zero length if at the end of the string.
1021
1022 Return 1 if we have a literal, but no expression, and we want the
1023 caller to call us again. This is used to deal with doubled
1024 braces.
1025
1026 When called multiple times on the string 'a{{b{0}c', this function
1027 will return:
1028
1029 1. the literal 'a{' with no expression, and a return value
1030 of 1. Despite the fact that there's no expression, the return
1031 value of 1 means we're not finished yet.
1032
1033 2. the literal 'b' and the expression '0', with a return value of
1034 0. The fact that there's an expression means we're not finished.
1035
1036 3. literal 'c' with no expression and a return value of 0. The
1037 combination of the return value of 0 with no expression means
1038 we're finished.
1039*/
1040static int
1041fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
1042 int recurse_lvl, PyObject **literal,
1043 PyObject **expr_text, expr_ty *expression,
1044 Token *first_token, Token *t, Token *last_token)
1045{
1046 int result;
1047
1048 assert(*literal == NULL && *expression == NULL);
1049
1050 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +03001051 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001052 if (result < 0)
1053 goto error;
1054
1055 assert(result == 0 || result == 1);
1056
1057 if (result == 1)
1058 /* We have a literal, but don't look at the expression. */
1059 return 1;
1060
1061 if (*str >= end || **str == '}')
1062 /* We're at the end of the string or the end of a nested
1063 f-string: no expression. The top-level error case where we
1064 expect to be at the end of the string but we're at a '}' is
1065 handled later. */
1066 return 0;
1067
1068 /* We must now be the start of an expression, on a '{'. */
1069 assert(**str == '{');
1070
1071 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
1072 expression, first_token, t, last_token) < 0)
1073 goto error;
1074
1075 return 0;
1076
1077error:
1078 Py_CLEAR(*literal);
1079 return -1;
1080}
1081
1082#ifdef NDEBUG
1083#define ExprList_check_invariants(l)
1084#else
1085static void
1086ExprList_check_invariants(ExprList *l)
1087{
1088 /* Check our invariants. Make sure this object is "live", and
1089 hasn't been deallocated. */
1090 assert(l->size >= 0);
1091 assert(l->p != NULL);
1092 if (l->size <= EXPRLIST_N_CACHED)
1093 assert(l->data == l->p);
1094}
1095#endif
1096
1097static void
1098ExprList_Init(ExprList *l)
1099{
1100 l->allocated = EXPRLIST_N_CACHED;
1101 l->size = 0;
1102
1103 /* Until we start allocating dynamically, p points to data. */
1104 l->p = l->data;
1105
1106 ExprList_check_invariants(l);
1107}
1108
1109static int
1110ExprList_Append(ExprList *l, expr_ty exp)
1111{
1112 ExprList_check_invariants(l);
1113 if (l->size >= l->allocated) {
1114 /* We need to alloc (or realloc) the memory. */
1115 Py_ssize_t new_size = l->allocated * 2;
1116
1117 /* See if we've ever allocated anything dynamically. */
1118 if (l->p == l->data) {
1119 Py_ssize_t i;
1120 /* We're still using the cached data. Switch to
1121 alloc-ing. */
1122 l->p = PyMem_RawMalloc(sizeof(expr_ty) * new_size);
1123 if (!l->p)
1124 return -1;
1125 /* Copy the cached data into the new buffer. */
1126 for (i = 0; i < l->size; i++)
1127 l->p[i] = l->data[i];
1128 } else {
1129 /* Just realloc. */
1130 expr_ty *tmp = PyMem_RawRealloc(l->p, sizeof(expr_ty) * new_size);
1131 if (!tmp) {
1132 PyMem_RawFree(l->p);
1133 l->p = NULL;
1134 return -1;
1135 }
1136 l->p = tmp;
1137 }
1138
1139 l->allocated = new_size;
1140 assert(l->allocated == 2 * l->size);
1141 }
1142
1143 l->p[l->size++] = exp;
1144
1145 ExprList_check_invariants(l);
1146 return 0;
1147}
1148
1149static void
1150ExprList_Dealloc(ExprList *l)
1151{
1152 ExprList_check_invariants(l);
1153
1154 /* If there's been an error, or we've never dynamically allocated,
1155 do nothing. */
1156 if (!l->p || l->p == l->data) {
1157 /* Do nothing. */
1158 } else {
1159 /* We have dynamically allocated. Free the memory. */
1160 PyMem_RawFree(l->p);
1161 }
1162 l->p = NULL;
1163 l->size = -1;
1164}
1165
1166static asdl_seq *
1167ExprList_Finish(ExprList *l, PyArena *arena)
1168{
1169 asdl_seq *seq;
1170
1171 ExprList_check_invariants(l);
1172
1173 /* Allocate the asdl_seq and copy the expressions in to it. */
1174 seq = _Py_asdl_seq_new(l->size, arena);
1175 if (seq) {
1176 Py_ssize_t i;
1177 for (i = 0; i < l->size; i++)
1178 asdl_seq_SET(seq, i, l->p[i]);
1179 }
1180 ExprList_Dealloc(l);
1181 return seq;
1182}
1183
1184#ifdef NDEBUG
1185#define FstringParser_check_invariants(state)
1186#else
1187static void
1188FstringParser_check_invariants(FstringParser *state)
1189{
1190 if (state->last_str)
1191 assert(PyUnicode_CheckExact(state->last_str));
1192 ExprList_check_invariants(&state->expr_list);
1193}
1194#endif
1195
1196void
1197_PyPegen_FstringParser_Init(FstringParser *state)
1198{
1199 state->last_str = NULL;
1200 state->fmode = 0;
1201 ExprList_Init(&state->expr_list);
1202 FstringParser_check_invariants(state);
1203}
1204
1205void
1206_PyPegen_FstringParser_Dealloc(FstringParser *state)
1207{
1208 FstringParser_check_invariants(state);
1209
1210 Py_XDECREF(state->last_str);
1211 ExprList_Dealloc(&state->expr_list);
1212}
1213
1214/* Make a Constant node, but decref the PyUnicode object being added. */
1215static expr_ty
1216make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1217{
1218 PyObject *s = *str;
1219 PyObject *kind = NULL;
1220 *str = NULL;
1221 assert(PyUnicode_CheckExact(s));
1222 if (PyArena_AddPyObject(p->arena, s) < 0) {
1223 Py_DECREF(s);
1224 return NULL;
1225 }
1226 const char* the_str = PyBytes_AsString(first_token->bytes);
1227 if (the_str && the_str[0] == 'u') {
1228 kind = _PyPegen_new_identifier(p, "u");
1229 }
1230
1231 if (kind == NULL && PyErr_Occurred()) {
1232 return NULL;
1233 }
1234
1235 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1236 last_token->end_lineno, last_token->end_col_offset, p->arena);
1237
1238}
1239
1240
1241/* Add a non-f-string (that is, a regular literal string). str is
1242 decref'd. */
1243int
1244_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1245{
1246 FstringParser_check_invariants(state);
1247
1248 assert(PyUnicode_CheckExact(str));
1249
1250 if (PyUnicode_GET_LENGTH(str) == 0) {
1251 Py_DECREF(str);
1252 return 0;
1253 }
1254
1255 if (!state->last_str) {
1256 /* We didn't have a string before, so just remember this one. */
1257 state->last_str = str;
1258 } else {
1259 /* Concatenate this with the previous string. */
1260 PyUnicode_AppendAndDel(&state->last_str, str);
1261 if (!state->last_str)
1262 return -1;
1263 }
1264 FstringParser_check_invariants(state);
1265 return 0;
1266}
1267
1268/* Parse an f-string. The f-string is in *str to end, with no
1269 'f' or quotes. */
1270int
1271_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1272 const char *end, int raw, int recurse_lvl,
1273 Token *first_token, Token* t, Token *last_token)
1274{
1275 FstringParser_check_invariants(state);
1276 state->fmode = 1;
1277
1278 /* Parse the f-string. */
1279 while (1) {
1280 PyObject *literal = NULL;
1281 PyObject *expr_text = NULL;
1282 expr_ty expression = NULL;
1283
1284 /* If there's a zero length literal in front of the
1285 expression, literal will be NULL. If we're at the end of
1286 the f-string, expression will be NULL (unless result == 1,
1287 see below). */
1288 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1289 &literal, &expr_text,
1290 &expression, first_token, t, last_token);
1291 if (result < 0)
1292 return -1;
1293
1294 /* Add the literal, if any. */
1295 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1296 Py_XDECREF(expr_text);
1297 return -1;
1298 }
1299 /* Add the expr_text, if any. */
1300 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1301 return -1;
1302 }
1303
1304 /* We've dealt with the literal and expr_text, their ownership has
1305 been transferred to the state object. Don't look at them again. */
1306
1307 /* See if we should just loop around to get the next literal
1308 and expression, while ignoring the expression this
1309 time. This is used for un-doubling braces, as an
1310 optimization. */
1311 if (result == 1)
1312 continue;
1313
1314 if (!expression)
1315 /* We're done with this f-string. */
1316 break;
1317
1318 /* We know we have an expression. Convert any existing string
1319 to a Constant node. */
1320 if (!state->last_str) {
1321 /* Do nothing. No previous literal. */
1322 } else {
1323 /* Convert the existing last_str literal to a Constant node. */
1324 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1325 if (!str || ExprList_Append(&state->expr_list, str) < 0)
1326 return -1;
1327 }
1328
1329 if (ExprList_Append(&state->expr_list, expression) < 0)
1330 return -1;
1331 }
1332
1333 /* If recurse_lvl is zero, then we must be at the end of the
1334 string. Otherwise, we must be at a right brace. */
1335
1336 if (recurse_lvl == 0 && *str < end-1) {
1337 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1338 return -1;
1339 }
1340 if (recurse_lvl != 0 && **str != '}') {
1341 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1342 return -1;
1343 }
1344
1345 FstringParser_check_invariants(state);
1346 return 0;
1347}
1348
1349/* Convert the partial state reflected in last_str and expr_list to an
1350 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1351expr_ty
1352_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1353 Token *last_token)
1354{
1355 asdl_seq *seq;
1356
1357 FstringParser_check_invariants(state);
1358
1359 /* If we're just a constant string with no expressions, return
1360 that. */
1361 if (!state->fmode) {
1362 assert(!state->expr_list.size);
1363 if (!state->last_str) {
1364 /* Create a zero length string. */
1365 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1366 if (!state->last_str)
1367 goto error;
1368 }
1369 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1370 }
1371
1372 /* Create a Constant node out of last_str, if needed. It will be the
1373 last node in our expression list. */
1374 if (state->last_str) {
1375 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1376 if (!str || ExprList_Append(&state->expr_list, str) < 0)
1377 goto error;
1378 }
1379 /* This has already been freed. */
1380 assert(state->last_str == NULL);
1381
1382 seq = ExprList_Finish(&state->expr_list, p->arena);
1383 if (!seq)
1384 goto error;
1385
1386 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1387 last_token->end_lineno, last_token->end_col_offset, p->arena);
1388
1389error:
1390 _PyPegen_FstringParser_Dealloc(state);
1391 return NULL;
1392}
1393
1394/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1395 at end, parse it into an expr_ty. Return NULL on error. Adjust
1396 str to point past the parsed portion. */
1397static expr_ty
1398fstring_parse(Parser *p, const char **str, const char *end, int raw,
1399 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1400{
1401 FstringParser state;
1402
1403 _PyPegen_FstringParser_Init(&state);
1404 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1405 first_token, t, last_token) < 0) {
1406 _PyPegen_FstringParser_Dealloc(&state);
1407 return NULL;
1408 }
1409
1410 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1411}