blob: 94241e1965e9a8e01c2c33fe91efb5bdcf1fcaa2 [file] [log] [blame]
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001#include <Python.h>
2
3#include "../tokenizer.h"
4#include "pegen.h"
5#include "parse_string.h"
6
7//// STRING HANDLING FUNCTIONS ////
8
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser nodes
// to pass around and the usage of some specialized APIs present only in this
// file (like "_PyPegen_raise_syntax_error").
13
14static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030015warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010016{
17 PyObject *msg =
18 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
19 if (msg == NULL) {
20 return -1;
21 }
22 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030023 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010024 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
25 /* Replace the DeprecationWarning exception with a SyntaxError
26 to get a more accurate error report */
27 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030028
29 /* This is needed, in order for the SyntaxError to point to the token t,
30 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
31 error location, if p->known_err_token is not set. */
32 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010033 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
34 }
35 Py_DECREF(msg);
36 return -1;
37 }
38 Py_DECREF(msg);
39 return 0;
40}
41
42static PyObject *
43decode_utf8(const char **sPtr, const char *end)
44{
45 const char *s, *t;
46 t = s = *sPtr;
47 while (s < end && (*s & 0x80)) {
48 s++;
49 }
50 *sPtr = s;
51 return PyUnicode_DecodeUTF8(t, s - t, NULL);
52}
53
54static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030055decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010056{
57 PyObject *v, *u;
58 char *buf;
59 char *p;
60 const char *end;
61
62 /* check for integer overflow */
63 if (len > SIZE_MAX / 6) {
64 return NULL;
65 }
66 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
67 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
68 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
69 if (u == NULL) {
70 return NULL;
71 }
72 p = buf = PyBytes_AsString(u);
73 end = s + len;
74 while (s < end) {
75 if (*s == '\\') {
76 *p++ = *s++;
77 if (s >= end || *s & 0x80) {
78 strcpy(p, "u005c");
79 p += 5;
80 if (s >= end) {
81 break;
82 }
83 }
84 }
85 if (*s & 0x80) {
86 PyObject *w;
87 int kind;
88 void *data;
89 Py_ssize_t len, i;
90 w = decode_utf8(&s, end);
91 if (w == NULL) {
92 Py_DECREF(u);
93 return NULL;
94 }
95 kind = PyUnicode_KIND(w);
96 data = PyUnicode_DATA(w);
97 len = PyUnicode_GET_LENGTH(w);
98 for (i = 0; i < len; i++) {
99 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
100 sprintf(p, "\\U%08x", chr);
101 p += 10;
102 }
103 /* Should be impossible to overflow */
104 assert(p - buf <= PyBytes_GET_SIZE(u));
105 Py_DECREF(w);
106 }
107 else {
108 *p++ = *s++;
109 }
110 }
111 len = p - buf;
112 s = buf;
113
114 const char *first_invalid_escape;
115 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
116
117 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300118 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100119 /* We have not decref u before because first_invalid_escape points
120 inside u. */
121 Py_XDECREF(u);
122 Py_DECREF(v);
123 return NULL;
124 }
125 }
126 Py_XDECREF(u);
127 return v;
128}
129
130static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300131decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100132{
133 const char *first_invalid_escape;
134 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
135 if (result == NULL) {
136 return NULL;
137 }
138
139 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300140 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100141 Py_DECREF(result);
142 return NULL;
143 }
144 }
145 return result;
146}
147
148/* s must include the bracketing quote characters, and r, b, u,
149 &/or f prefixes (if any), and embedded escape sequences (if any).
150 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
151 If the string is an f-string, set *fstr and *fstrlen to the unparsed
152 string object. Return 0 if no errors occurred. */
153int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300154_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
155 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100156{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157 const char *s = PyBytes_AsString(t->bytes);
158 if (s == NULL) {
159 return -1;
160 }
161
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100162 size_t len;
163 int quote = Py_CHARMASK(*s);
164 int fmode = 0;
165 *bytesmode = 0;
166 *rawmode = 0;
167 *result = NULL;
168 *fstr = NULL;
169 if (Py_ISALPHA(quote)) {
170 while (!*bytesmode || !*rawmode) {
171 if (quote == 'b' || quote == 'B') {
172 quote = *++s;
173 *bytesmode = 1;
174 }
175 else if (quote == 'u' || quote == 'U') {
176 quote = *++s;
177 }
178 else if (quote == 'r' || quote == 'R') {
179 quote = *++s;
180 *rawmode = 1;
181 }
182 else if (quote == 'f' || quote == 'F') {
183 quote = *++s;
184 fmode = 1;
185 }
186 else {
187 break;
188 }
189 }
190 }
191
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300192 /* fstrings are only allowed in Python 3.6 and greater */
193 if (fmode && p->feature_version < 6) {
194 p->error_indicator = 1;
195 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
196 return -1;
197 }
198
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100199 if (fmode && *bytesmode) {
200 PyErr_BadInternalCall();
201 return -1;
202 }
203 if (quote != '\'' && quote != '\"') {
204 PyErr_BadInternalCall();
205 return -1;
206 }
207 /* Skip the leading quote char. */
208 s++;
209 len = strlen(s);
210 if (len > INT_MAX) {
211 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
212 return -1;
213 }
214 if (s[--len] != quote) {
215 /* Last quote char must match the first. */
216 PyErr_BadInternalCall();
217 return -1;
218 }
219 if (len >= 4 && s[0] == quote && s[1] == quote) {
220 /* A triple quoted string. We've already skipped one quote at
221 the start and one at the end of the string. Now skip the
222 two at the start. */
223 s += 2;
224 len -= 2;
225 /* And check that the last two match. */
226 if (s[--len] != quote || s[--len] != quote) {
227 PyErr_BadInternalCall();
228 return -1;
229 }
230 }
231
232 if (fmode) {
233 /* Just return the bytes. The caller will parse the resulting
234 string. */
235 *fstr = s;
236 *fstrlen = len;
237 return 0;
238 }
239
240 /* Not an f-string. */
241 /* Avoid invoking escape decoding routines if possible. */
242 *rawmode = *rawmode || strchr(s, '\\') == NULL;
243 if (*bytesmode) {
244 /* Disallow non-ASCII characters. */
245 const char *ch;
246 for (ch = s; *ch; ch++) {
247 if (Py_CHARMASK(*ch) >= 0x80) {
248 RAISE_SYNTAX_ERROR(
249 "bytes can only contain ASCII "
250 "literal characters.");
251 return -1;
252 }
253 }
254 if (*rawmode) {
255 *result = PyBytes_FromStringAndSize(s, len);
256 }
257 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300258 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100259 }
260 }
261 else {
262 if (*rawmode) {
263 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
264 }
265 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300266 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100267 }
268 }
269 return *result == NULL ? -1 : 0;
270}
271
272
273
274// FSTRING STUFF
275
276static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset);
277static void fstring_shift_argument(expr_ty parent, arg_ty args, int lineno, int col_offset);
278
279
280static inline void shift_expr(expr_ty parent, expr_ty n, int line, int col) {
Miss Islington (bot)64409112020-06-07 18:08:53 -0700281 if (n == NULL) {
282 return;
283 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100284 if (parent->lineno < n->lineno) {
285 col = 0;
286 }
287 fstring_shift_expr_locations(n, line, col);
288}
289
290static inline void shift_arg(expr_ty parent, arg_ty n, int line, int col) {
291 if (parent->lineno < n->lineno) {
292 col = 0;
293 }
294 fstring_shift_argument(parent, n, line, col);
295}
296
297static void fstring_shift_seq_locations(expr_ty parent, asdl_seq *seq, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100298 for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100299 expr_ty expr = asdl_seq_GET(seq, i);
300 if (expr == NULL){
301 continue;
302 }
303 shift_expr(parent, expr, lineno, col_offset);
304 }
305}
306
307static void fstring_shift_slice_locations(expr_ty parent, expr_ty slice, int lineno, int col_offset) {
308 switch (slice->kind) {
309 case Slice_kind:
310 if (slice->v.Slice.lower) {
311 shift_expr(parent, slice->v.Slice.lower, lineno, col_offset);
312 }
313 if (slice->v.Slice.upper) {
314 shift_expr(parent, slice->v.Slice.upper, lineno, col_offset);
315 }
316 if (slice->v.Slice.step) {
317 shift_expr(parent, slice->v.Slice.step, lineno, col_offset);
318 }
319 break;
320 case Tuple_kind:
321 fstring_shift_seq_locations(parent, slice->v.Tuple.elts, lineno, col_offset);
322 break;
323 default:
324 break;
325 }
326}
327
328static void fstring_shift_comprehension(expr_ty parent, comprehension_ty comp, int lineno, int col_offset) {
329 shift_expr(parent, comp->target, lineno, col_offset);
330 shift_expr(parent, comp->iter, lineno, col_offset);
331 fstring_shift_seq_locations(parent, comp->ifs, lineno, col_offset);
332}
333
334static void fstring_shift_argument(expr_ty parent, arg_ty arg, int lineno, int col_offset) {
335 if (arg->annotation != NULL){
336 shift_expr(parent, arg->annotation, lineno, col_offset);
337 }
338 arg->col_offset = arg->col_offset + col_offset;
339 arg->end_col_offset = arg->end_col_offset + col_offset;
340 arg->lineno = arg->lineno + lineno;
341 arg->end_lineno = arg->end_lineno + lineno;
342}
343
344static void fstring_shift_arguments(expr_ty parent, arguments_ty args, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100345 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->posonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100346 arg_ty arg = asdl_seq_GET(args->posonlyargs, i);
347 shift_arg(parent, arg, lineno, col_offset);
348 }
349
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100350 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->args); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100351 arg_ty arg = asdl_seq_GET(args->args, i);
352 shift_arg(parent, arg, lineno, col_offset);
353 }
354
355 if (args->vararg != NULL) {
356 shift_arg(parent, args->vararg, lineno, col_offset);
357 }
358
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100359 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->kwonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100360 arg_ty arg = asdl_seq_GET(args->kwonlyargs, i);
361 shift_arg(parent, arg, lineno, col_offset);
362 }
363
364 fstring_shift_seq_locations(parent, args->kw_defaults, lineno, col_offset);
365
366 if (args->kwarg != NULL) {
367 shift_arg(parent, args->kwarg, lineno, col_offset);
368 }
369
370 fstring_shift_seq_locations(parent, args->defaults, lineno, col_offset);
371}
372
373static void fstring_shift_children_locations(expr_ty n, int lineno, int col_offset) {
374 switch (n->kind) {
375 case BoolOp_kind:
376 fstring_shift_seq_locations(n, n->v.BoolOp.values, lineno, col_offset);
377 break;
378 case NamedExpr_kind:
379 shift_expr(n, n->v.NamedExpr.target, lineno, col_offset);
380 shift_expr(n, n->v.NamedExpr.value, lineno, col_offset);
381 break;
382 case BinOp_kind:
383 shift_expr(n, n->v.BinOp.left, lineno, col_offset);
384 shift_expr(n, n->v.BinOp.right, lineno, col_offset);
385 break;
386 case UnaryOp_kind:
387 shift_expr(n, n->v.UnaryOp.operand, lineno, col_offset);
388 break;
389 case Lambda_kind:
390 fstring_shift_arguments(n, n->v.Lambda.args, lineno, col_offset);
391 shift_expr(n, n->v.Lambda.body, lineno, col_offset);
392 break;
393 case IfExp_kind:
394 shift_expr(n, n->v.IfExp.test, lineno, col_offset);
395 shift_expr(n, n->v.IfExp.body, lineno, col_offset);
396 shift_expr(n, n->v.IfExp.orelse, lineno, col_offset);
397 break;
398 case Dict_kind:
399 fstring_shift_seq_locations(n, n->v.Dict.keys, lineno, col_offset);
400 fstring_shift_seq_locations(n, n->v.Dict.values, lineno, col_offset);
401 break;
402 case Set_kind:
403 fstring_shift_seq_locations(n, n->v.Set.elts, lineno, col_offset);
404 break;
405 case ListComp_kind:
406 shift_expr(n, n->v.ListComp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100407 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.ListComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100408 comprehension_ty comp = asdl_seq_GET(n->v.ListComp.generators, i);
409 fstring_shift_comprehension(n, comp, lineno, col_offset);
410 }
411 break;
412 case SetComp_kind:
413 shift_expr(n, n->v.SetComp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100414 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.SetComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100415 comprehension_ty comp = asdl_seq_GET(n->v.SetComp.generators, i);
416 fstring_shift_comprehension(n, comp, lineno, col_offset);
417 }
418 break;
419 case DictComp_kind:
420 shift_expr(n, n->v.DictComp.key, lineno, col_offset);
421 shift_expr(n, n->v.DictComp.value, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100422 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.DictComp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100423 comprehension_ty comp = asdl_seq_GET(n->v.DictComp.generators, i);
424 fstring_shift_comprehension(n, comp, lineno, col_offset);
425 }
426 break;
427 case GeneratorExp_kind:
428 shift_expr(n, n->v.GeneratorExp.elt, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100429 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.GeneratorExp.generators); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100430 comprehension_ty comp = asdl_seq_GET(n->v.GeneratorExp.generators, i);
431 fstring_shift_comprehension(n, comp, lineno, col_offset);
432 }
433 break;
434 case Await_kind:
435 shift_expr(n, n->v.Await.value, lineno, col_offset);
436 break;
437 case Yield_kind:
438 shift_expr(n, n->v.Yield.value, lineno, col_offset);
439 break;
440 case YieldFrom_kind:
441 shift_expr(n, n->v.YieldFrom.value, lineno, col_offset);
442 break;
443 case Compare_kind:
444 shift_expr(n, n->v.Compare.left, lineno, col_offset);
445 fstring_shift_seq_locations(n, n->v.Compare.comparators, lineno, col_offset);
446 break;
447 case Call_kind:
448 shift_expr(n, n->v.Call.func, lineno, col_offset);
449 fstring_shift_seq_locations(n, n->v.Call.args, lineno, col_offset);
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100450 for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.Call.keywords); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100451 keyword_ty keyword = asdl_seq_GET(n->v.Call.keywords, i);
452 shift_expr(n, keyword->value, lineno, col_offset);
453 }
454 break;
455 case Attribute_kind:
456 shift_expr(n, n->v.Attribute.value, lineno, col_offset);
457 break;
458 case Subscript_kind:
459 shift_expr(n, n->v.Subscript.value, lineno, col_offset);
460 fstring_shift_slice_locations(n, n->v.Subscript.slice, lineno, col_offset);
461 shift_expr(n, n->v.Subscript.slice, lineno, col_offset);
462 break;
463 case Starred_kind:
464 shift_expr(n, n->v.Starred.value, lineno, col_offset);
465 break;
466 case List_kind:
467 fstring_shift_seq_locations(n, n->v.List.elts, lineno, col_offset);
468 break;
469 case Tuple_kind:
470 fstring_shift_seq_locations(n, n->v.Tuple.elts, lineno, col_offset);
471 break;
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300472 case JoinedStr_kind:
473 fstring_shift_seq_locations(n, n->v.JoinedStr.values, lineno, col_offset);
474 break;
475 case FormattedValue_kind:
476 shift_expr(n, n->v.FormattedValue.value, lineno, col_offset);
477 if (n->v.FormattedValue.format_spec) {
478 shift_expr(n, n->v.FormattedValue.format_spec, lineno, col_offset);
479 }
480 break;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100481 default:
482 return;
483 }
484}
485
486/* Shift locations for the given node and all its children by adding `lineno`
487 and `col_offset` to existing locations. Note that n is the already parsed
488 expression. */
489static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset)
490{
491 n->col_offset = n->col_offset + col_offset;
492
493 // The following is needed, in order for nodes spanning across multiple lines
494 // to be shifted correctly. An example of such a node is a Call node, the closing
495 // parenthesis of which is not on the same line as its name.
496 if (n->lineno == n->end_lineno) {
497 n->end_col_offset = n->end_col_offset + col_offset;
498 }
499
500 fstring_shift_children_locations(n, lineno, col_offset);
501 n->lineno = n->lineno + lineno;
502 n->end_lineno = n->end_lineno + lineno;
503}
504
505/* Fix locations for the given node and its children.
506
507 `parent` is the enclosing node.
508 `n` is the node which locations are going to be fixed relative to parent.
509 `expr_str` is the child node's string representation, including braces.
510*/
511static void
512fstring_fix_expr_location(Token *parent, expr_ty n, char *expr_str)
513{
514 char *substr = NULL;
515 char *start;
516 int lines = 0;
517 int cols = 0;
518
519 if (parent && parent->bytes) {
520 char *parent_str = PyBytes_AsString(parent->bytes);
521 if (!parent_str) {
522 return;
523 }
524 substr = strstr(parent_str, expr_str);
525 if (substr) {
526 // The following is needed, in order to correctly shift the column
527 // offset, in the case that (disregarding any whitespace) a newline
528 // immediately follows the opening curly brace of the fstring expression.
529 int newline_after_brace = 1;
530 start = substr + 1;
531 while (start && *start != '}' && *start != '\n') {
532 if (*start != ' ' && *start != '\t' && *start != '\f') {
533 newline_after_brace = 0;
534 break;
535 }
536 start++;
537 }
538
539 // Account for the characters from the last newline character to our
540 // left until the beginning of substr.
541 if (!newline_after_brace) {
542 start = substr;
543 while (start > parent_str && *start != '\n') {
544 start--;
545 }
546 cols += (int)(substr - start);
547 }
548 /* adjust the start based on the number of newlines encountered
549 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100550 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100551 if (*p == '\n') {
552 lines++;
553 }
554 }
555 }
556 }
557 fstring_shift_expr_locations(n, lines, cols);
558}
559
560
561/* Compile this expression in to an expr_ty. Add parens around the
562 expression, in order to allow leading spaces in the expression. */
563static expr_ty
564fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
565 Token *t)
566{
567 expr_ty expr = NULL;
568 char *str;
569 Py_ssize_t len;
570 const char *s;
571 expr_ty result = NULL;
572
573 assert(expr_end >= expr_start);
574 assert(*(expr_start-1) == '{');
575 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
576 *expr_end == '=');
577
578 /* If the substring is all whitespace, it's an error. We need to catch this
579 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
580 because turning the expression '' in to '()' would go from being invalid
581 to valid. */
582 for (s = expr_start; s != expr_end; s++) {
583 char c = *s;
584 /* The Python parser ignores only the following whitespace
585 characters (\r already is converted to \n). */
586 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
587 break;
588 }
589 }
590 if (s == expr_end) {
591 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
592 return NULL;
593 }
594
595 len = expr_end - expr_start;
596 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
597 str = PyMem_RawMalloc(len + 3);
598 if (str == NULL) {
599 PyErr_NoMemory();
600 return NULL;
601 }
602
603 str[0] = '(';
604 memcpy(str+1, expr_start, len);
605 str[len+1] = ')';
606 str[len+2] = 0;
607
608 struct tok_state* tok = PyTokenizer_FromString(str, 1);
609 if (tok == NULL) {
Miss Islington (bot)79e6c152020-06-05 17:10:57 -0700610 PyMem_RawFree(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100611 return NULL;
612 }
Lysandros Nikolaou791a46e2020-05-26 04:24:31 +0300613 Py_INCREF(p->tok->filename);
614 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100615
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300616 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
617 NULL, p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100618 p2->starting_lineno = p->starting_lineno + p->tok->first_lineno - 1;
619 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno
620 ? p->starting_col_offset + t->col_offset : 0;
621
622 expr = _PyPegen_run_parser(p2);
623
624 if (expr == NULL) {
625 goto exit;
626 }
627
628 /* Reuse str to find the correct column offset. */
629 str[0] = '{';
630 str[len+1] = '}';
631 fstring_fix_expr_location(t, expr, str);
632
633 result = expr;
634
635exit:
Miss Islington (bot)79e6c152020-06-05 17:10:57 -0700636 PyMem_RawFree(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100637 _PyPegen_Parser_Free(p2);
638 PyTokenizer_Free(tok);
639 return result;
640}
641
642/* Return -1 on error.
643
644 Return 0 if we reached the end of the literal.
645
646 Return 1 if we haven't reached the end of the literal, but we want
647 the caller to process the literal up to this point. Used for
648 doubled braces.
649*/
650static int
651fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300652 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100653{
654 /* Get any literal string. It ends when we hit an un-doubled left
655 brace (which isn't part of a unicode name escape such as
656 "\N{EULER CONSTANT}"), or the end of the string. */
657
658 const char *s = *str;
659 const char *literal_start = s;
660 int result = 0;
661
662 assert(*literal == NULL);
663 while (s < end) {
664 char ch = *s++;
665 if (!raw && ch == '\\' && s < end) {
666 ch = *s++;
667 if (ch == 'N') {
668 if (s < end && *s++ == '{') {
669 while (s < end && *s++ != '}') {
670 }
671 continue;
672 }
673 break;
674 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300675 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100676 return -1;
677 }
678 }
679 if (ch == '{' || ch == '}') {
680 /* Check for doubled braces, but only at the top level. If
681 we checked at every level, then f'{0:{3}}' would fail
682 with the two closing braces. */
683 if (recurse_lvl == 0) {
684 if (s < end && *s == ch) {
685 /* We're going to tell the caller that the literal ends
686 here, but that they should continue scanning. But also
687 skip over the second brace when we resume scanning. */
688 *str = s + 1;
689 result = 1;
690 goto done;
691 }
692
693 /* Where a single '{' is the start of a new expression, a
694 single '}' is not allowed. */
695 if (ch == '}') {
696 *str = s - 1;
697 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
698 return -1;
699 }
700 }
701 /* We're either at a '{', which means we're starting another
702 expression; or a '}', which means we're at the end of this
703 f-string (for a nested format_spec). */
704 s--;
705 break;
706 }
707 }
708 *str = s;
709 assert(s <= end);
710 assert(s == end || *s == '{' || *s == '}');
711done:
712 if (literal_start != s) {
713 if (raw)
714 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
715 s - literal_start,
716 NULL, NULL);
717 else
718 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300719 s - literal_start, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100720 if (!*literal)
721 return -1;
722 }
723 return result;
724}
725
726/* Forward declaration because parsing is recursive. */
727static expr_ty
728fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
729 Token *first_token, Token* t, Token *last_token);
730
731/* Parse the f-string at *str, ending at end. We know *str starts an
732 expression (so it must be a '{'). Returns the FormattedValue node, which
733 includes the expression, conversion character, format_spec expression, and
734 optionally the text of the expression (if = is used).
735
736 Note that I don't do a perfect job here: I don't make sure that a
737 closing brace doesn't match an opening paren, for example. It
738 doesn't need to error on all invalid expressions, just correctly
739 find the end of all valid ones. Any errors inside the expression
740 will be caught when we parse it later.
741
742 *expression is set to the expression. For an '=' "debug" expression,
743 *expr_text is set to the debug text (the original text of the expression,
744 including the '=' and any whitespace around it, as a string object). If
745 not a debug expression, *expr_text set to NULL. */
746static int
747fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
748 PyObject **expr_text, expr_ty *expression, Token *first_token,
749 Token *t, Token *last_token)
750{
751 /* Return -1 on error, else 0. */
752
753 const char *expr_start;
754 const char *expr_end;
755 expr_ty simple_expression;
756 expr_ty format_spec = NULL; /* Optional format specifier. */
757 int conversion = -1; /* The conversion char. Use default if not
758 specified, or !r if using = and no format
759 spec. */
760
761 /* 0 if we're not in a string, else the quote char we're trying to
762 match (single or double quote). */
763 char quote_char = 0;
764
765 /* If we're inside a string, 1=normal, 3=triple-quoted. */
766 int string_type = 0;
767
768 /* Keep track of nesting level for braces/parens/brackets in
769 expressions. */
770 Py_ssize_t nested_depth = 0;
771 char parenstack[MAXLEVEL];
772
773 *expr_text = NULL;
774
775 /* Can only nest one level deep. */
776 if (recurse_lvl >= 2) {
777 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
778 goto error;
779 }
780
781 /* The first char must be a left brace, or we wouldn't have gotten
782 here. Skip over it. */
783 assert(**str == '{');
784 *str += 1;
785
786 expr_start = *str;
787 for (; *str < end; (*str)++) {
788 char ch;
789
790 /* Loop invariants. */
791 assert(nested_depth >= 0);
792 assert(*str >= expr_start && *str < end);
793 if (quote_char)
794 assert(string_type == 1 || string_type == 3);
795 else
796 assert(string_type == 0);
797
798 ch = **str;
799 /* Nowhere inside an expression is a backslash allowed. */
800 if (ch == '\\') {
801 /* Error: can't include a backslash character, inside
802 parens or strings or not. */
803 RAISE_SYNTAX_ERROR(
804 "f-string expression part "
805 "cannot include a backslash");
806 goto error;
807 }
808 if (quote_char) {
809 /* We're inside a string. See if we're at the end. */
810 /* This code needs to implement the same non-error logic
811 as tok_get from tokenizer.c, at the letter_quote
812 label. To actually share that code would be a
813 nightmare. But, it's unlikely to change and is small,
814 so duplicate it here. Note we don't need to catch all
815 of the errors, since they'll be caught when parsing the
816 expression. We just need to match the non-error
817 cases. Thus we can ignore \n in single-quoted strings,
818 for example. Or non-terminated strings. */
819 if (ch == quote_char) {
820 /* Does this match the string_type (single or triple
821 quoted)? */
822 if (string_type == 3) {
823 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
824 /* We're at the end of a triple quoted string. */
825 *str += 2;
826 string_type = 0;
827 quote_char = 0;
828 continue;
829 }
830 } else {
831 /* We're at the end of a normal string. */
832 quote_char = 0;
833 string_type = 0;
834 continue;
835 }
836 }
837 } else if (ch == '\'' || ch == '"') {
838 /* Is this a triple quoted string? */
839 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
840 string_type = 3;
841 *str += 2;
842 } else {
843 /* Start of a normal string. */
844 string_type = 1;
845 }
846 /* Start looking for the end of the string. */
847 quote_char = ch;
848 } else if (ch == '[' || ch == '{' || ch == '(') {
849 if (nested_depth >= MAXLEVEL) {
850 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
851 goto error;
852 }
853 parenstack[nested_depth] = ch;
854 nested_depth++;
855 } else if (ch == '#') {
856 /* Error: can't include a comment character, inside parens
857 or not. */
858 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
859 goto error;
860 } else if (nested_depth == 0 &&
861 (ch == '!' || ch == ':' || ch == '}' ||
862 ch == '=' || ch == '>' || ch == '<')) {
863 /* See if there's a next character. */
864 if (*str+1 < end) {
865 char next = *(*str+1);
866
867 /* For "!=". since '=' is not an allowed conversion character,
868 nothing is lost in this test. */
869 if ((ch == '!' && next == '=') || /* != */
870 (ch == '=' && next == '=') || /* == */
871 (ch == '<' && next == '=') || /* <= */
872 (ch == '>' && next == '=') /* >= */
873 ) {
874 *str += 1;
875 continue;
876 }
877 /* Don't get out of the loop for these, if they're single
878 chars (not part of 2-char tokens). If by themselves, they
879 don't end an expression (unlike say '!'). */
880 if (ch == '>' || ch == '<') {
881 continue;
882 }
883 }
884
885 /* Normal way out of this loop. */
886 break;
887 } else if (ch == ']' || ch == '}' || ch == ')') {
888 if (!nested_depth) {
889 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
890 goto error;
891 }
892 nested_depth--;
893 int opening = parenstack[nested_depth];
894 if (!((opening == '(' && ch == ')') ||
895 (opening == '[' && ch == ']') ||
896 (opening == '{' && ch == '}')))
897 {
898 RAISE_SYNTAX_ERROR(
899 "f-string: closing parenthesis '%c' "
900 "does not match opening parenthesis '%c'",
901 ch, opening);
902 goto error;
903 }
904 } else {
905 /* Just consume this char and loop around. */
906 }
907 }
908 expr_end = *str;
909 /* If we leave this loop in a string or with mismatched parens, we
910 don't care. We'll get a syntax error when compiling the
911 expression. But, we can produce a better error message, so
912 let's just do that.*/
913 if (quote_char) {
914 RAISE_SYNTAX_ERROR("f-string: unterminated string");
915 goto error;
916 }
917 if (nested_depth) {
918 int opening = parenstack[nested_depth - 1];
919 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
920 goto error;
921 }
922
923 if (*str >= end)
924 goto unexpected_end_of_string;
925
926 /* Compile the expression as soon as possible, so we show errors
927 related to the expression before errors related to the
928 conversion or format_spec. */
929 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
930 if (!simple_expression)
931 goto error;
932
933 /* Check for =, which puts the text value of the expression in
934 expr_text. */
935 if (**str == '=') {
Pablo Galindo9b838292020-05-27 22:01:11 +0100936 if (p->feature_version < 8) {
937 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
938 "only supported in Python 3.8 and greater");
939 goto error;
940 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100941 *str += 1;
942
943 /* Skip over ASCII whitespace. No need to test for end of string
944 here, since we know there's at least a trailing quote somewhere
945 ahead. */
946 while (Py_ISSPACE(**str)) {
947 *str += 1;
948 }
949
950 /* Set *expr_text to the text of the expression. */
951 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
952 if (!*expr_text) {
953 goto error;
954 }
955 }
956
957 /* Check for a conversion char, if present. */
958 if (**str == '!') {
959 *str += 1;
960 if (*str >= end)
961 goto unexpected_end_of_string;
962
963 conversion = **str;
964 *str += 1;
965
966 /* Validate the conversion. */
967 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
968 RAISE_SYNTAX_ERROR(
969 "f-string: invalid conversion character: "
970 "expected 's', 'r', or 'a'");
971 goto error;
972 }
973
974 }
975
976 /* Check for the format spec, if present. */
977 if (*str >= end)
978 goto unexpected_end_of_string;
979 if (**str == ':') {
980 *str += 1;
981 if (*str >= end)
982 goto unexpected_end_of_string;
983
984 /* Parse the format spec. */
985 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
986 first_token, t, last_token);
987 if (!format_spec)
988 goto error;
989 }
990
991 if (*str >= end || **str != '}')
992 goto unexpected_end_of_string;
993
994 /* We're at a right brace. Consume it. */
995 assert(*str < end);
996 assert(**str == '}');
997 *str += 1;
998
999 /* If we're in = mode (detected by non-NULL expr_text), and have no format
1000 spec and no explicit conversion, set the conversion to 'r'. */
1001 if (*expr_text && format_spec == NULL && conversion == -1) {
1002 conversion = 'r';
1003 }
1004
1005 /* And now create the FormattedValue node that represents this
1006 entire expression with the conversion and format spec. */
1007 //TODO: Fix this
1008 *expression = FormattedValue(simple_expression, conversion,
1009 format_spec, first_token->lineno,
1010 first_token->col_offset, last_token->end_lineno,
1011 last_token->end_col_offset, p->arena);
1012 if (!*expression)
1013 goto error;
1014
1015 return 0;
1016
1017unexpected_end_of_string:
1018 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1019 /* Falls through to error. */
1020
1021error:
1022 Py_XDECREF(*expr_text);
1023 return -1;
1024
1025}
1026
1027/* Return -1 on error.
1028
1029 Return 0 if we have a literal (possible zero length) and an
1030 expression (zero length if at the end of the string.
1031
1032 Return 1 if we have a literal, but no expression, and we want the
1033 caller to call us again. This is used to deal with doubled
1034 braces.
1035
1036 When called multiple times on the string 'a{{b{0}c', this function
1037 will return:
1038
1039 1. the literal 'a{' with no expression, and a return value
1040 of 1. Despite the fact that there's no expression, the return
1041 value of 1 means we're not finished yet.
1042
1043 2. the literal 'b' and the expression '0', with a return value of
1044 0. The fact that there's an expression means we're not finished.
1045
1046 3. literal 'c' with no expression and a return value of 0. The
1047 combination of the return value of 0 with no expression means
1048 we're finished.
1049*/
1050static int
1051fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
1052 int recurse_lvl, PyObject **literal,
1053 PyObject **expr_text, expr_ty *expression,
1054 Token *first_token, Token *t, Token *last_token)
1055{
1056 int result;
1057
1058 assert(*literal == NULL && *expression == NULL);
1059
1060 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +03001061 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001062 if (result < 0)
1063 goto error;
1064
1065 assert(result == 0 || result == 1);
1066
1067 if (result == 1)
1068 /* We have a literal, but don't look at the expression. */
1069 return 1;
1070
1071 if (*str >= end || **str == '}')
1072 /* We're at the end of the string or the end of a nested
1073 f-string: no expression. The top-level error case where we
1074 expect to be at the end of the string but we're at a '}' is
1075 handled later. */
1076 return 0;
1077
1078 /* We must now be the start of an expression, on a '{'. */
1079 assert(**str == '{');
1080
1081 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
1082 expression, first_token, t, last_token) < 0)
1083 goto error;
1084
1085 return 0;
1086
1087error:
1088 Py_CLEAR(*literal);
1089 return -1;
1090}
1091
1092#ifdef NDEBUG
1093#define ExprList_check_invariants(l)
1094#else
1095static void
1096ExprList_check_invariants(ExprList *l)
1097{
1098 /* Check our invariants. Make sure this object is "live", and
1099 hasn't been deallocated. */
1100 assert(l->size >= 0);
1101 assert(l->p != NULL);
1102 if (l->size <= EXPRLIST_N_CACHED)
1103 assert(l->data == l->p);
1104}
1105#endif
1106
1107static void
1108ExprList_Init(ExprList *l)
1109{
1110 l->allocated = EXPRLIST_N_CACHED;
1111 l->size = 0;
1112
1113 /* Until we start allocating dynamically, p points to data. */
1114 l->p = l->data;
1115
1116 ExprList_check_invariants(l);
1117}
1118
1119static int
1120ExprList_Append(ExprList *l, expr_ty exp)
1121{
1122 ExprList_check_invariants(l);
1123 if (l->size >= l->allocated) {
1124 /* We need to alloc (or realloc) the memory. */
1125 Py_ssize_t new_size = l->allocated * 2;
1126
1127 /* See if we've ever allocated anything dynamically. */
1128 if (l->p == l->data) {
1129 Py_ssize_t i;
1130 /* We're still using the cached data. Switch to
1131 alloc-ing. */
1132 l->p = PyMem_RawMalloc(sizeof(expr_ty) * new_size);
1133 if (!l->p)
1134 return -1;
1135 /* Copy the cached data into the new buffer. */
1136 for (i = 0; i < l->size; i++)
1137 l->p[i] = l->data[i];
1138 } else {
1139 /* Just realloc. */
1140 expr_ty *tmp = PyMem_RawRealloc(l->p, sizeof(expr_ty) * new_size);
1141 if (!tmp) {
1142 PyMem_RawFree(l->p);
1143 l->p = NULL;
1144 return -1;
1145 }
1146 l->p = tmp;
1147 }
1148
1149 l->allocated = new_size;
1150 assert(l->allocated == 2 * l->size);
1151 }
1152
1153 l->p[l->size++] = exp;
1154
1155 ExprList_check_invariants(l);
1156 return 0;
1157}
1158
1159static void
1160ExprList_Dealloc(ExprList *l)
1161{
1162 ExprList_check_invariants(l);
1163
1164 /* If there's been an error, or we've never dynamically allocated,
1165 do nothing. */
1166 if (!l->p || l->p == l->data) {
1167 /* Do nothing. */
1168 } else {
1169 /* We have dynamically allocated. Free the memory. */
1170 PyMem_RawFree(l->p);
1171 }
1172 l->p = NULL;
1173 l->size = -1;
1174}
1175
1176static asdl_seq *
1177ExprList_Finish(ExprList *l, PyArena *arena)
1178{
1179 asdl_seq *seq;
1180
1181 ExprList_check_invariants(l);
1182
1183 /* Allocate the asdl_seq and copy the expressions in to it. */
1184 seq = _Py_asdl_seq_new(l->size, arena);
1185 if (seq) {
1186 Py_ssize_t i;
1187 for (i = 0; i < l->size; i++)
1188 asdl_seq_SET(seq, i, l->p[i]);
1189 }
1190 ExprList_Dealloc(l);
1191 return seq;
1192}
1193
1194#ifdef NDEBUG
1195#define FstringParser_check_invariants(state)
1196#else
1197static void
1198FstringParser_check_invariants(FstringParser *state)
1199{
1200 if (state->last_str)
1201 assert(PyUnicode_CheckExact(state->last_str));
1202 ExprList_check_invariants(&state->expr_list);
1203}
1204#endif
1205
1206void
1207_PyPegen_FstringParser_Init(FstringParser *state)
1208{
1209 state->last_str = NULL;
1210 state->fmode = 0;
1211 ExprList_Init(&state->expr_list);
1212 FstringParser_check_invariants(state);
1213}
1214
1215void
1216_PyPegen_FstringParser_Dealloc(FstringParser *state)
1217{
1218 FstringParser_check_invariants(state);
1219
1220 Py_XDECREF(state->last_str);
1221 ExprList_Dealloc(&state->expr_list);
1222}
1223
1224/* Make a Constant node, but decref the PyUnicode object being added. */
1225static expr_ty
1226make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1227{
1228 PyObject *s = *str;
1229 PyObject *kind = NULL;
1230 *str = NULL;
1231 assert(PyUnicode_CheckExact(s));
1232 if (PyArena_AddPyObject(p->arena, s) < 0) {
1233 Py_DECREF(s);
1234 return NULL;
1235 }
1236 const char* the_str = PyBytes_AsString(first_token->bytes);
1237 if (the_str && the_str[0] == 'u') {
1238 kind = _PyPegen_new_identifier(p, "u");
1239 }
1240
1241 if (kind == NULL && PyErr_Occurred()) {
1242 return NULL;
1243 }
1244
1245 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1246 last_token->end_lineno, last_token->end_col_offset, p->arena);
1247
1248}
1249
1250
1251/* Add a non-f-string (that is, a regular literal string). str is
1252 decref'd. */
1253int
1254_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1255{
1256 FstringParser_check_invariants(state);
1257
1258 assert(PyUnicode_CheckExact(str));
1259
1260 if (PyUnicode_GET_LENGTH(str) == 0) {
1261 Py_DECREF(str);
1262 return 0;
1263 }
1264
1265 if (!state->last_str) {
1266 /* We didn't have a string before, so just remember this one. */
1267 state->last_str = str;
1268 } else {
1269 /* Concatenate this with the previous string. */
1270 PyUnicode_AppendAndDel(&state->last_str, str);
1271 if (!state->last_str)
1272 return -1;
1273 }
1274 FstringParser_check_invariants(state);
1275 return 0;
1276}
1277
1278/* Parse an f-string. The f-string is in *str to end, with no
1279 'f' or quotes. */
1280int
1281_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1282 const char *end, int raw, int recurse_lvl,
1283 Token *first_token, Token* t, Token *last_token)
1284{
1285 FstringParser_check_invariants(state);
1286 state->fmode = 1;
1287
1288 /* Parse the f-string. */
1289 while (1) {
1290 PyObject *literal = NULL;
1291 PyObject *expr_text = NULL;
1292 expr_ty expression = NULL;
1293
1294 /* If there's a zero length literal in front of the
1295 expression, literal will be NULL. If we're at the end of
1296 the f-string, expression will be NULL (unless result == 1,
1297 see below). */
1298 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1299 &literal, &expr_text,
1300 &expression, first_token, t, last_token);
1301 if (result < 0)
1302 return -1;
1303
1304 /* Add the literal, if any. */
1305 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1306 Py_XDECREF(expr_text);
1307 return -1;
1308 }
1309 /* Add the expr_text, if any. */
1310 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1311 return -1;
1312 }
1313
1314 /* We've dealt with the literal and expr_text, their ownership has
1315 been transferred to the state object. Don't look at them again. */
1316
1317 /* See if we should just loop around to get the next literal
1318 and expression, while ignoring the expression this
1319 time. This is used for un-doubling braces, as an
1320 optimization. */
1321 if (result == 1)
1322 continue;
1323
1324 if (!expression)
1325 /* We're done with this f-string. */
1326 break;
1327
1328 /* We know we have an expression. Convert any existing string
1329 to a Constant node. */
1330 if (!state->last_str) {
1331 /* Do nothing. No previous literal. */
1332 } else {
1333 /* Convert the existing last_str literal to a Constant node. */
1334 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1335 if (!str || ExprList_Append(&state->expr_list, str) < 0)
1336 return -1;
1337 }
1338
1339 if (ExprList_Append(&state->expr_list, expression) < 0)
1340 return -1;
1341 }
1342
1343 /* If recurse_lvl is zero, then we must be at the end of the
1344 string. Otherwise, we must be at a right brace. */
1345
1346 if (recurse_lvl == 0 && *str < end-1) {
1347 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1348 return -1;
1349 }
1350 if (recurse_lvl != 0 && **str != '}') {
1351 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1352 return -1;
1353 }
1354
1355 FstringParser_check_invariants(state);
1356 return 0;
1357}
1358
1359/* Convert the partial state reflected in last_str and expr_list to an
1360 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1361expr_ty
1362_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1363 Token *last_token)
1364{
1365 asdl_seq *seq;
1366
1367 FstringParser_check_invariants(state);
1368
1369 /* If we're just a constant string with no expressions, return
1370 that. */
1371 if (!state->fmode) {
1372 assert(!state->expr_list.size);
1373 if (!state->last_str) {
1374 /* Create a zero length string. */
1375 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1376 if (!state->last_str)
1377 goto error;
1378 }
1379 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1380 }
1381
1382 /* Create a Constant node out of last_str, if needed. It will be the
1383 last node in our expression list. */
1384 if (state->last_str) {
1385 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1386 if (!str || ExprList_Append(&state->expr_list, str) < 0)
1387 goto error;
1388 }
1389 /* This has already been freed. */
1390 assert(state->last_str == NULL);
1391
1392 seq = ExprList_Finish(&state->expr_list, p->arena);
1393 if (!seq)
1394 goto error;
1395
1396 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1397 last_token->end_lineno, last_token->end_col_offset, p->arena);
1398
1399error:
1400 _PyPegen_FstringParser_Dealloc(state);
1401 return NULL;
1402}
1403
1404/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1405 at end, parse it into an expr_ty. Return NULL on error. Adjust
1406 str to point past the parsed portion. */
1407static expr_ty
1408fstring_parse(Parser *p, const char **str, const char *end, int raw,
1409 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1410{
1411 FstringParser state;
1412
1413 _PyPegen_FstringParser_Init(&state);
1414 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1415 first_token, t, last_token) < 0) {
1416 _PyPegen_FstringParser_Dealloc(&state);
1417 return NULL;
1418 }
1419
1420 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1421}