blob: 9f56ce21d0f206912d9d1fc123d4433c34851336 [file] [log] [blame]
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001#include <Python.h>
2
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01003#include "tokenizer.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01004#include "pegen.h"
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01005#include "string_parser.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01006
//// STRING HANDLING FUNCTIONS ////
8
Pablo Galindoc5fc1562020-04-22 23:29:27 +01009static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030010warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010011{
12 PyObject *msg =
13 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
14 if (msg == NULL) {
15 return -1;
16 }
17 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030018 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010019 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
20 /* Replace the DeprecationWarning exception with a SyntaxError
21 to get a more accurate error report */
22 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030023
24 /* This is needed, in order for the SyntaxError to point to the token t,
25 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
26 error location, if p->known_err_token is not set. */
27 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010028 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
29 }
30 Py_DECREF(msg);
31 return -1;
32 }
33 Py_DECREF(msg);
34 return 0;
35}
36
37static PyObject *
38decode_utf8(const char **sPtr, const char *end)
39{
Pablo Galindofb61c422020-06-15 14:23:43 +010040 const char *s;
41 const char *t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010042 t = s = *sPtr;
43 while (s < end && (*s & 0x80)) {
44 s++;
45 }
46 *sPtr = s;
47 return PyUnicode_DecodeUTF8(t, s - t, NULL);
48}
49
50static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030051decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010052{
Pablo Galindofb61c422020-06-15 14:23:43 +010053 PyObject *v;
54 PyObject *u;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010055 char *buf;
56 char *p;
57 const char *end;
58
59 /* check for integer overflow */
60 if (len > SIZE_MAX / 6) {
61 return NULL;
62 }
63 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
64 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
65 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
66 if (u == NULL) {
67 return NULL;
68 }
69 p = buf = PyBytes_AsString(u);
70 end = s + len;
71 while (s < end) {
72 if (*s == '\\') {
73 *p++ = *s++;
74 if (s >= end || *s & 0x80) {
75 strcpy(p, "u005c");
76 p += 5;
77 if (s >= end) {
78 break;
79 }
80 }
81 }
82 if (*s & 0x80) {
83 PyObject *w;
84 int kind;
85 void *data;
Pablo Galindofb61c422020-06-15 14:23:43 +010086 Py_ssize_t w_len;
87 Py_ssize_t i;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010088 w = decode_utf8(&s, end);
89 if (w == NULL) {
90 Py_DECREF(u);
91 return NULL;
92 }
93 kind = PyUnicode_KIND(w);
94 data = PyUnicode_DATA(w);
Pablo Galindofb61c422020-06-15 14:23:43 +010095 w_len = PyUnicode_GET_LENGTH(w);
96 for (i = 0; i < w_len; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010097 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
98 sprintf(p, "\\U%08x", chr);
99 p += 10;
100 }
101 /* Should be impossible to overflow */
102 assert(p - buf <= PyBytes_GET_SIZE(u));
103 Py_DECREF(w);
104 }
105 else {
106 *p++ = *s++;
107 }
108 }
109 len = p - buf;
110 s = buf;
111
112 const char *first_invalid_escape;
113 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
114
115 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300116 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100117 /* We have not decref u before because first_invalid_escape points
118 inside u. */
119 Py_XDECREF(u);
120 Py_DECREF(v);
121 return NULL;
122 }
123 }
124 Py_XDECREF(u);
125 return v;
126}
127
128static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300129decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100130{
131 const char *first_invalid_escape;
132 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
133 if (result == NULL) {
134 return NULL;
135 }
136
137 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300138 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100139 Py_DECREF(result);
140 return NULL;
141 }
142 }
143 return result;
144}
145
146/* s must include the bracketing quote characters, and r, b, u,
147 &/or f prefixes (if any), and embedded escape sequences (if any).
148 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
149 If the string is an f-string, set *fstr and *fstrlen to the unparsed
150 string object. Return 0 if no errors occurred. */
151int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300152_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
153 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100154{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300155 const char *s = PyBytes_AsString(t->bytes);
156 if (s == NULL) {
157 return -1;
158 }
159
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100160 size_t len;
161 int quote = Py_CHARMASK(*s);
162 int fmode = 0;
163 *bytesmode = 0;
164 *rawmode = 0;
165 *result = NULL;
166 *fstr = NULL;
167 if (Py_ISALPHA(quote)) {
168 while (!*bytesmode || !*rawmode) {
169 if (quote == 'b' || quote == 'B') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100170 quote =(unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100171 *bytesmode = 1;
172 }
173 else if (quote == 'u' || quote == 'U') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100174 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100175 }
176 else if (quote == 'r' || quote == 'R') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100177 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100178 *rawmode = 1;
179 }
180 else if (quote == 'f' || quote == 'F') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100181 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100182 fmode = 1;
183 }
184 else {
185 break;
186 }
187 }
188 }
189
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300190 /* fstrings are only allowed in Python 3.6 and greater */
191 if (fmode && p->feature_version < 6) {
192 p->error_indicator = 1;
193 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
194 return -1;
195 }
196
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100197 if (fmode && *bytesmode) {
198 PyErr_BadInternalCall();
199 return -1;
200 }
201 if (quote != '\'' && quote != '\"') {
202 PyErr_BadInternalCall();
203 return -1;
204 }
205 /* Skip the leading quote char. */
206 s++;
207 len = strlen(s);
208 if (len > INT_MAX) {
209 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
210 return -1;
211 }
212 if (s[--len] != quote) {
213 /* Last quote char must match the first. */
214 PyErr_BadInternalCall();
215 return -1;
216 }
217 if (len >= 4 && s[0] == quote && s[1] == quote) {
218 /* A triple quoted string. We've already skipped one quote at
219 the start and one at the end of the string. Now skip the
220 two at the start. */
221 s += 2;
222 len -= 2;
223 /* And check that the last two match. */
224 if (s[--len] != quote || s[--len] != quote) {
225 PyErr_BadInternalCall();
226 return -1;
227 }
228 }
229
230 if (fmode) {
231 /* Just return the bytes. The caller will parse the resulting
232 string. */
233 *fstr = s;
234 *fstrlen = len;
235 return 0;
236 }
237
238 /* Not an f-string. */
239 /* Avoid invoking escape decoding routines if possible. */
240 *rawmode = *rawmode || strchr(s, '\\') == NULL;
241 if (*bytesmode) {
242 /* Disallow non-ASCII characters. */
243 const char *ch;
244 for (ch = s; *ch; ch++) {
245 if (Py_CHARMASK(*ch) >= 0x80) {
246 RAISE_SYNTAX_ERROR(
247 "bytes can only contain ASCII "
248 "literal characters.");
249 return -1;
250 }
251 }
252 if (*rawmode) {
253 *result = PyBytes_FromStringAndSize(s, len);
254 }
255 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300256 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100257 }
258 }
259 else {
260 if (*rawmode) {
261 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
262 }
263 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300264 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100265 }
266 }
267 return *result == NULL ? -1 : 0;
268}
269
270
271
// FSTRING STUFF
273
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100274/* Fix locations for the given node and its children.
275
276 `parent` is the enclosing node.
277 `n` is the node which locations are going to be fixed relative to parent.
278 `expr_str` is the child node's string representation, including braces.
279*/
280static void
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300281fstring_find_expr_location(Token *parent, char *expr_str, int *p_lines, int *p_cols)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100282{
283 char *substr = NULL;
284 char *start;
285 int lines = 0;
286 int cols = 0;
287
288 if (parent && parent->bytes) {
289 char *parent_str = PyBytes_AsString(parent->bytes);
290 if (!parent_str) {
291 return;
292 }
293 substr = strstr(parent_str, expr_str);
294 if (substr) {
295 // The following is needed, in order to correctly shift the column
296 // offset, in the case that (disregarding any whitespace) a newline
297 // immediately follows the opening curly brace of the fstring expression.
298 int newline_after_brace = 1;
299 start = substr + 1;
300 while (start && *start != '}' && *start != '\n') {
301 if (*start != ' ' && *start != '\t' && *start != '\f') {
302 newline_after_brace = 0;
303 break;
304 }
305 start++;
306 }
307
308 // Account for the characters from the last newline character to our
309 // left until the beginning of substr.
310 if (!newline_after_brace) {
311 start = substr;
312 while (start > parent_str && *start != '\n') {
313 start--;
314 }
315 cols += (int)(substr - start);
316 }
317 /* adjust the start based on the number of newlines encountered
318 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100319 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100320 if (*p == '\n') {
321 lines++;
322 }
323 }
324 }
325 }
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300326 *p_lines = lines;
327 *p_cols = cols;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100328}
329
330
331/* Compile this expression in to an expr_ty. Add parens around the
332 expression, in order to allow leading spaces in the expression. */
333static expr_ty
334fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
335 Token *t)
336{
337 expr_ty expr = NULL;
338 char *str;
339 Py_ssize_t len;
340 const char *s;
341 expr_ty result = NULL;
342
343 assert(expr_end >= expr_start);
344 assert(*(expr_start-1) == '{');
345 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
346 *expr_end == '=');
347
348 /* If the substring is all whitespace, it's an error. We need to catch this
349 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
350 because turning the expression '' in to '()' would go from being invalid
351 to valid. */
352 for (s = expr_start; s != expr_end; s++) {
353 char c = *s;
354 /* The Python parser ignores only the following whitespace
355 characters (\r already is converted to \n). */
356 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
357 break;
358 }
359 }
360 if (s == expr_end) {
361 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
362 return NULL;
363 }
364
365 len = expr_end - expr_start;
366 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300367 str = PyMem_Malloc(len + 3);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100368 if (str == NULL) {
369 PyErr_NoMemory();
370 return NULL;
371 }
372
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300373 // The call to fstring_find_expr_location is responsible for finding the column offset
374 // the generated AST nodes need to be shifted to the right, which is equal to the number
375 // of the f-string characters before the expression starts. In order to correctly compute
376 // this offset, strstr gets called in fstring_find_expr_location which only succeeds
377 // if curly braces appear before and after the f-string expression (exactly like they do
378 // in the f-string itself), hence the following lines.
379 str[0] = '{';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100380 memcpy(str+1, expr_start, len);
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300381 str[len+1] = '}';
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100382 str[len+2] = 0;
383
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300384 int lines, cols;
385 fstring_find_expr_location(t, str, &lines, &cols);
386
387 // The parentheses are needed in order to allow for leading whitespace withing
388 // the f-string expression. This consequently gets parsed as a group (see the
389 // group rule in python.gram).
390 str[0] = '(';
391 str[len+1] = ')';
392
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100393 struct tok_state* tok = PyTokenizer_FromString(str, 1);
394 if (tok == NULL) {
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300395 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100396 return NULL;
397 }
Lysandros Nikolaouf7b1e462020-05-26 03:32:18 +0300398 Py_INCREF(p->tok->filename);
399 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100400
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300401 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
402 NULL, p->arena);
Lysandros Nikolaou1f0f4ab2020-06-28 02:41:48 +0300403 p2->starting_lineno = t->lineno + lines - 1;
404 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100405
406 expr = _PyPegen_run_parser(p2);
407
408 if (expr == NULL) {
409 goto exit;
410 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100411 result = expr;
412
413exit:
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300414 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100415 _PyPegen_Parser_Free(p2);
416 PyTokenizer_Free(tok);
417 return result;
418}
419
420/* Return -1 on error.
421
422 Return 0 if we reached the end of the literal.
423
424 Return 1 if we haven't reached the end of the literal, but we want
425 the caller to process the literal up to this point. Used for
426 doubled braces.
427*/
428static int
429fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300430 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100431{
432 /* Get any literal string. It ends when we hit an un-doubled left
433 brace (which isn't part of a unicode name escape such as
434 "\N{EULER CONSTANT}"), or the end of the string. */
435
436 const char *s = *str;
437 const char *literal_start = s;
438 int result = 0;
439
440 assert(*literal == NULL);
441 while (s < end) {
442 char ch = *s++;
443 if (!raw && ch == '\\' && s < end) {
444 ch = *s++;
445 if (ch == 'N') {
446 if (s < end && *s++ == '{') {
447 while (s < end && *s++ != '}') {
448 }
449 continue;
450 }
451 break;
452 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300453 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100454 return -1;
455 }
456 }
457 if (ch == '{' || ch == '}') {
458 /* Check for doubled braces, but only at the top level. If
459 we checked at every level, then f'{0:{3}}' would fail
460 with the two closing braces. */
461 if (recurse_lvl == 0) {
462 if (s < end && *s == ch) {
463 /* We're going to tell the caller that the literal ends
464 here, but that they should continue scanning. But also
465 skip over the second brace when we resume scanning. */
466 *str = s + 1;
467 result = 1;
468 goto done;
469 }
470
471 /* Where a single '{' is the start of a new expression, a
472 single '}' is not allowed. */
473 if (ch == '}') {
474 *str = s - 1;
475 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
476 return -1;
477 }
478 }
479 /* We're either at a '{', which means we're starting another
480 expression; or a '}', which means we're at the end of this
481 f-string (for a nested format_spec). */
482 s--;
483 break;
484 }
485 }
486 *str = s;
487 assert(s <= end);
488 assert(s == end || *s == '{' || *s == '}');
489done:
490 if (literal_start != s) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100491 if (raw) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100492 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
493 s - literal_start,
494 NULL, NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100495 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100496 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300497 s - literal_start, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100498 }
499 if (!*literal) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100500 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100501 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100502 }
503 return result;
504}
505
506/* Forward declaration because parsing is recursive. */
507static expr_ty
508fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
509 Token *first_token, Token* t, Token *last_token);
510
511/* Parse the f-string at *str, ending at end. We know *str starts an
512 expression (so it must be a '{'). Returns the FormattedValue node, which
513 includes the expression, conversion character, format_spec expression, and
514 optionally the text of the expression (if = is used).
515
516 Note that I don't do a perfect job here: I don't make sure that a
517 closing brace doesn't match an opening paren, for example. It
518 doesn't need to error on all invalid expressions, just correctly
519 find the end of all valid ones. Any errors inside the expression
520 will be caught when we parse it later.
521
522 *expression is set to the expression. For an '=' "debug" expression,
523 *expr_text is set to the debug text (the original text of the expression,
524 including the '=' and any whitespace around it, as a string object). If
525 not a debug expression, *expr_text set to NULL. */
526static int
527fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
528 PyObject **expr_text, expr_ty *expression, Token *first_token,
529 Token *t, Token *last_token)
530{
531 /* Return -1 on error, else 0. */
532
533 const char *expr_start;
534 const char *expr_end;
535 expr_ty simple_expression;
536 expr_ty format_spec = NULL; /* Optional format specifier. */
537 int conversion = -1; /* The conversion char. Use default if not
538 specified, or !r if using = and no format
539 spec. */
540
541 /* 0 if we're not in a string, else the quote char we're trying to
542 match (single or double quote). */
543 char quote_char = 0;
544
545 /* If we're inside a string, 1=normal, 3=triple-quoted. */
546 int string_type = 0;
547
548 /* Keep track of nesting level for braces/parens/brackets in
549 expressions. */
550 Py_ssize_t nested_depth = 0;
551 char parenstack[MAXLEVEL];
552
553 *expr_text = NULL;
554
555 /* Can only nest one level deep. */
556 if (recurse_lvl >= 2) {
557 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
558 goto error;
559 }
560
561 /* The first char must be a left brace, or we wouldn't have gotten
562 here. Skip over it. */
563 assert(**str == '{');
564 *str += 1;
565
566 expr_start = *str;
567 for (; *str < end; (*str)++) {
568 char ch;
569
570 /* Loop invariants. */
571 assert(nested_depth >= 0);
572 assert(*str >= expr_start && *str < end);
Pablo Galindofb61c422020-06-15 14:23:43 +0100573 if (quote_char) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100574 assert(string_type == 1 || string_type == 3);
Pablo Galindofb61c422020-06-15 14:23:43 +0100575 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100576 assert(string_type == 0);
Pablo Galindofb61c422020-06-15 14:23:43 +0100577 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100578
579 ch = **str;
580 /* Nowhere inside an expression is a backslash allowed. */
581 if (ch == '\\') {
582 /* Error: can't include a backslash character, inside
583 parens or strings or not. */
584 RAISE_SYNTAX_ERROR(
585 "f-string expression part "
586 "cannot include a backslash");
587 goto error;
588 }
589 if (quote_char) {
590 /* We're inside a string. See if we're at the end. */
591 /* This code needs to implement the same non-error logic
592 as tok_get from tokenizer.c, at the letter_quote
593 label. To actually share that code would be a
594 nightmare. But, it's unlikely to change and is small,
595 so duplicate it here. Note we don't need to catch all
596 of the errors, since they'll be caught when parsing the
597 expression. We just need to match the non-error
598 cases. Thus we can ignore \n in single-quoted strings,
599 for example. Or non-terminated strings. */
600 if (ch == quote_char) {
601 /* Does this match the string_type (single or triple
602 quoted)? */
603 if (string_type == 3) {
604 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
605 /* We're at the end of a triple quoted string. */
606 *str += 2;
607 string_type = 0;
608 quote_char = 0;
609 continue;
610 }
611 } else {
612 /* We're at the end of a normal string. */
613 quote_char = 0;
614 string_type = 0;
615 continue;
616 }
617 }
618 } else if (ch == '\'' || ch == '"') {
619 /* Is this a triple quoted string? */
620 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
621 string_type = 3;
622 *str += 2;
623 } else {
624 /* Start of a normal string. */
625 string_type = 1;
626 }
627 /* Start looking for the end of the string. */
628 quote_char = ch;
629 } else if (ch == '[' || ch == '{' || ch == '(') {
630 if (nested_depth >= MAXLEVEL) {
631 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
632 goto error;
633 }
634 parenstack[nested_depth] = ch;
635 nested_depth++;
636 } else if (ch == '#') {
637 /* Error: can't include a comment character, inside parens
638 or not. */
639 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
640 goto error;
641 } else if (nested_depth == 0 &&
642 (ch == '!' || ch == ':' || ch == '}' ||
643 ch == '=' || ch == '>' || ch == '<')) {
644 /* See if there's a next character. */
645 if (*str+1 < end) {
646 char next = *(*str+1);
647
648 /* For "!=". since '=' is not an allowed conversion character,
649 nothing is lost in this test. */
650 if ((ch == '!' && next == '=') || /* != */
651 (ch == '=' && next == '=') || /* == */
652 (ch == '<' && next == '=') || /* <= */
653 (ch == '>' && next == '=') /* >= */
654 ) {
655 *str += 1;
656 continue;
657 }
658 /* Don't get out of the loop for these, if they're single
659 chars (not part of 2-char tokens). If by themselves, they
660 don't end an expression (unlike say '!'). */
661 if (ch == '>' || ch == '<') {
662 continue;
663 }
664 }
665
666 /* Normal way out of this loop. */
667 break;
668 } else if (ch == ']' || ch == '}' || ch == ')') {
669 if (!nested_depth) {
670 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
671 goto error;
672 }
673 nested_depth--;
Pablo Galindofb61c422020-06-15 14:23:43 +0100674 int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100675 if (!((opening == '(' && ch == ')') ||
676 (opening == '[' && ch == ']') ||
677 (opening == '{' && ch == '}')))
678 {
679 RAISE_SYNTAX_ERROR(
680 "f-string: closing parenthesis '%c' "
681 "does not match opening parenthesis '%c'",
682 ch, opening);
683 goto error;
684 }
685 } else {
686 /* Just consume this char and loop around. */
687 }
688 }
689 expr_end = *str;
690 /* If we leave this loop in a string or with mismatched parens, we
691 don't care. We'll get a syntax error when compiling the
692 expression. But, we can produce a better error message, so
693 let's just do that.*/
694 if (quote_char) {
695 RAISE_SYNTAX_ERROR("f-string: unterminated string");
696 goto error;
697 }
698 if (nested_depth) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100699 int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100700 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
701 goto error;
702 }
703
Pablo Galindofb61c422020-06-15 14:23:43 +0100704 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100705 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100706 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100707
708 /* Compile the expression as soon as possible, so we show errors
709 related to the expression before errors related to the
710 conversion or format_spec. */
711 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100712 if (!simple_expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100713 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100714 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100715
716 /* Check for =, which puts the text value of the expression in
717 expr_text. */
718 if (**str == '=') {
Shantanuc116c942020-05-27 13:30:38 -0700719 if (p->feature_version < 8) {
720 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
721 "only supported in Python 3.8 and greater");
722 goto error;
723 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100724 *str += 1;
725
726 /* Skip over ASCII whitespace. No need to test for end of string
727 here, since we know there's at least a trailing quote somewhere
728 ahead. */
729 while (Py_ISSPACE(**str)) {
730 *str += 1;
731 }
732
733 /* Set *expr_text to the text of the expression. */
734 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
735 if (!*expr_text) {
736 goto error;
737 }
738 }
739
740 /* Check for a conversion char, if present. */
741 if (**str == '!') {
742 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100743 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100744 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100745 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100746
Pablo Galindofb61c422020-06-15 14:23:43 +0100747 conversion = (unsigned char)**str;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100748 *str += 1;
749
750 /* Validate the conversion. */
751 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
752 RAISE_SYNTAX_ERROR(
753 "f-string: invalid conversion character: "
754 "expected 's', 'r', or 'a'");
755 goto error;
756 }
757
758 }
759
760 /* Check for the format spec, if present. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100761 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100762 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100763 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100764 if (**str == ':') {
765 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100766 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100767 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100768 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100769
770 /* Parse the format spec. */
771 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
772 first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +0100773 if (!format_spec) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100774 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100775 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100776 }
777
Pablo Galindofb61c422020-06-15 14:23:43 +0100778 if (*str >= end || **str != '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100779 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100780 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100781
782 /* We're at a right brace. Consume it. */
783 assert(*str < end);
784 assert(**str == '}');
785 *str += 1;
786
787 /* If we're in = mode (detected by non-NULL expr_text), and have no format
788 spec and no explicit conversion, set the conversion to 'r'. */
789 if (*expr_text && format_spec == NULL && conversion == -1) {
790 conversion = 'r';
791 }
792
793 /* And now create the FormattedValue node that represents this
794 entire expression with the conversion and format spec. */
795 //TODO: Fix this
796 *expression = FormattedValue(simple_expression, conversion,
797 format_spec, first_token->lineno,
798 first_token->col_offset, last_token->end_lineno,
799 last_token->end_col_offset, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +0100800 if (!*expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100801 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100802 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100803
804 return 0;
805
806unexpected_end_of_string:
807 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
808 /* Falls through to error. */
809
810error:
811 Py_XDECREF(*expr_text);
812 return -1;
813
814}
815
816/* Return -1 on error.
817
818 Return 0 if we have a literal (possible zero length) and an
819 expression (zero length if at the end of the string.
820
821 Return 1 if we have a literal, but no expression, and we want the
822 caller to call us again. This is used to deal with doubled
823 braces.
824
825 When called multiple times on the string 'a{{b{0}c', this function
826 will return:
827
828 1. the literal 'a{' with no expression, and a return value
829 of 1. Despite the fact that there's no expression, the return
830 value of 1 means we're not finished yet.
831
832 2. the literal 'b' and the expression '0', with a return value of
833 0. The fact that there's an expression means we're not finished.
834
835 3. literal 'c' with no expression and a return value of 0. The
836 combination of the return value of 0 with no expression means
837 we're finished.
838*/
839static int
840fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
841 int recurse_lvl, PyObject **literal,
842 PyObject **expr_text, expr_ty *expression,
843 Token *first_token, Token *t, Token *last_token)
844{
845 int result;
846
847 assert(*literal == NULL && *expression == NULL);
848
849 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300850 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100851 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100852 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100853 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100854
855 assert(result == 0 || result == 1);
856
Pablo Galindofb61c422020-06-15 14:23:43 +0100857 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100858 /* We have a literal, but don't look at the expression. */
859 return 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100860 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100861
Pablo Galindofb61c422020-06-15 14:23:43 +0100862 if (*str >= end || **str == '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100863 /* We're at the end of the string or the end of a nested
864 f-string: no expression. The top-level error case where we
865 expect to be at the end of the string but we're at a '}' is
866 handled later. */
867 return 0;
Pablo Galindofb61c422020-06-15 14:23:43 +0100868 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100869
870 /* We must now be the start of an expression, on a '{'. */
871 assert(**str == '{');
872
873 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindofb61c422020-06-15 14:23:43 +0100874 expression, first_token, t, last_token) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100875 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100876 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100877
878 return 0;
879
880error:
881 Py_CLEAR(*literal);
882 return -1;
883}
884
885#ifdef NDEBUG
886#define ExprList_check_invariants(l)
887#else
888static void
889ExprList_check_invariants(ExprList *l)
890{
891 /* Check our invariants. Make sure this object is "live", and
892 hasn't been deallocated. */
893 assert(l->size >= 0);
894 assert(l->p != NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100895 if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100896 assert(l->data == l->p);
Pablo Galindofb61c422020-06-15 14:23:43 +0100897 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100898}
899#endif
900
901static void
902ExprList_Init(ExprList *l)
903{
904 l->allocated = EXPRLIST_N_CACHED;
905 l->size = 0;
906
907 /* Until we start allocating dynamically, p points to data. */
908 l->p = l->data;
909
910 ExprList_check_invariants(l);
911}
912
913static int
914ExprList_Append(ExprList *l, expr_ty exp)
915{
916 ExprList_check_invariants(l);
917 if (l->size >= l->allocated) {
918 /* We need to alloc (or realloc) the memory. */
919 Py_ssize_t new_size = l->allocated * 2;
920
921 /* See if we've ever allocated anything dynamically. */
922 if (l->p == l->data) {
923 Py_ssize_t i;
924 /* We're still using the cached data. Switch to
925 alloc-ing. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300926 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindofb61c422020-06-15 14:23:43 +0100927 if (!l->p) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100928 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100929 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100930 /* Copy the cached data into the new buffer. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100931 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100932 l->p[i] = l->data[i];
Pablo Galindofb61c422020-06-15 14:23:43 +0100933 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100934 } else {
935 /* Just realloc. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300936 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100937 if (!tmp) {
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300938 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100939 l->p = NULL;
940 return -1;
941 }
942 l->p = tmp;
943 }
944
945 l->allocated = new_size;
946 assert(l->allocated == 2 * l->size);
947 }
948
949 l->p[l->size++] = exp;
950
951 ExprList_check_invariants(l);
952 return 0;
953}
954
955static void
956ExprList_Dealloc(ExprList *l)
957{
958 ExprList_check_invariants(l);
959
960 /* If there's been an error, or we've never dynamically allocated,
961 do nothing. */
962 if (!l->p || l->p == l->data) {
963 /* Do nothing. */
964 } else {
965 /* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou6dcbc242020-06-27 20:47:00 +0300966 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100967 }
968 l->p = NULL;
969 l->size = -1;
970}
971
972static asdl_seq *
973ExprList_Finish(ExprList *l, PyArena *arena)
974{
975 asdl_seq *seq;
976
977 ExprList_check_invariants(l);
978
979 /* Allocate the asdl_seq and copy the expressions in to it. */
980 seq = _Py_asdl_seq_new(l->size, arena);
981 if (seq) {
982 Py_ssize_t i;
Pablo Galindofb61c422020-06-15 14:23:43 +0100983 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100984 asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindofb61c422020-06-15 14:23:43 +0100985 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100986 }
987 ExprList_Dealloc(l);
988 return seq;
989}
990
991#ifdef NDEBUG
992#define FstringParser_check_invariants(state)
993#else
994static void
995FstringParser_check_invariants(FstringParser *state)
996{
Pablo Galindofb61c422020-06-15 14:23:43 +0100997 if (state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100998 assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindofb61c422020-06-15 14:23:43 +0100999 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001000 ExprList_check_invariants(&state->expr_list);
1001}
1002#endif
1003
1004void
1005_PyPegen_FstringParser_Init(FstringParser *state)
1006{
1007 state->last_str = NULL;
1008 state->fmode = 0;
1009 ExprList_Init(&state->expr_list);
1010 FstringParser_check_invariants(state);
1011}
1012
1013void
1014_PyPegen_FstringParser_Dealloc(FstringParser *state)
1015{
1016 FstringParser_check_invariants(state);
1017
1018 Py_XDECREF(state->last_str);
1019 ExprList_Dealloc(&state->expr_list);
1020}
1021
1022/* Make a Constant node, but decref the PyUnicode object being added. */
1023static expr_ty
1024make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1025{
1026 PyObject *s = *str;
1027 PyObject *kind = NULL;
1028 *str = NULL;
1029 assert(PyUnicode_CheckExact(s));
1030 if (PyArena_AddPyObject(p->arena, s) < 0) {
1031 Py_DECREF(s);
1032 return NULL;
1033 }
1034 const char* the_str = PyBytes_AsString(first_token->bytes);
1035 if (the_str && the_str[0] == 'u') {
1036 kind = _PyPegen_new_identifier(p, "u");
1037 }
1038
1039 if (kind == NULL && PyErr_Occurred()) {
1040 return NULL;
1041 }
1042
1043 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1044 last_token->end_lineno, last_token->end_col_offset, p->arena);
1045
1046}
1047
1048
1049/* Add a non-f-string (that is, a regular literal string). str is
1050 decref'd. */
1051int
1052_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1053{
1054 FstringParser_check_invariants(state);
1055
1056 assert(PyUnicode_CheckExact(str));
1057
1058 if (PyUnicode_GET_LENGTH(str) == 0) {
1059 Py_DECREF(str);
1060 return 0;
1061 }
1062
1063 if (!state->last_str) {
1064 /* We didn't have a string before, so just remember this one. */
1065 state->last_str = str;
1066 } else {
1067 /* Concatenate this with the previous string. */
1068 PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindofb61c422020-06-15 14:23:43 +01001069 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001070 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001071 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001072 }
1073 FstringParser_check_invariants(state);
1074 return 0;
1075}
1076
1077/* Parse an f-string. The f-string is in *str to end, with no
1078 'f' or quotes. */
1079int
1080_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1081 const char *end, int raw, int recurse_lvl,
1082 Token *first_token, Token* t, Token *last_token)
1083{
1084 FstringParser_check_invariants(state);
1085 state->fmode = 1;
1086
1087 /* Parse the f-string. */
1088 while (1) {
1089 PyObject *literal = NULL;
1090 PyObject *expr_text = NULL;
1091 expr_ty expression = NULL;
1092
1093 /* If there's a zero length literal in front of the
1094 expression, literal will be NULL. If we're at the end of
1095 the f-string, expression will be NULL (unless result == 1,
1096 see below). */
1097 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1098 &literal, &expr_text,
1099 &expression, first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001100 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001101 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001102 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001103
1104 /* Add the literal, if any. */
1105 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1106 Py_XDECREF(expr_text);
1107 return -1;
1108 }
1109 /* Add the expr_text, if any. */
1110 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1111 return -1;
1112 }
1113
1114 /* We've dealt with the literal and expr_text, their ownership has
1115 been transferred to the state object. Don't look at them again. */
1116
1117 /* See if we should just loop around to get the next literal
1118 and expression, while ignoring the expression this
1119 time. This is used for un-doubling braces, as an
1120 optimization. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001121 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001122 continue;
Pablo Galindofb61c422020-06-15 14:23:43 +01001123 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001124
Pablo Galindofb61c422020-06-15 14:23:43 +01001125 if (!expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001126 /* We're done with this f-string. */
1127 break;
Pablo Galindofb61c422020-06-15 14:23:43 +01001128 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001129
1130 /* We know we have an expression. Convert any existing string
1131 to a Constant node. */
1132 if (!state->last_str) {
1133 /* Do nothing. No previous literal. */
1134 } else {
1135 /* Convert the existing last_str literal to a Constant node. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001136 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1137 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001138 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001139 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001140 }
1141
Pablo Galindofb61c422020-06-15 14:23:43 +01001142 if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001143 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001144 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001145 }
1146
1147 /* If recurse_lvl is zero, then we must be at the end of the
1148 string. Otherwise, we must be at a right brace. */
1149
1150 if (recurse_lvl == 0 && *str < end-1) {
1151 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1152 return -1;
1153 }
1154 if (recurse_lvl != 0 && **str != '}') {
1155 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1156 return -1;
1157 }
1158
1159 FstringParser_check_invariants(state);
1160 return 0;
1161}
1162
1163/* Convert the partial state reflected in last_str and expr_list to an
1164 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1165expr_ty
1166_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1167 Token *last_token)
1168{
1169 asdl_seq *seq;
1170
1171 FstringParser_check_invariants(state);
1172
1173 /* If we're just a constant string with no expressions, return
1174 that. */
1175 if (!state->fmode) {
1176 assert(!state->expr_list.size);
1177 if (!state->last_str) {
1178 /* Create a zero length string. */
1179 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindofb61c422020-06-15 14:23:43 +01001180 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001181 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001182 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001183 }
1184 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1185 }
1186
1187 /* Create a Constant node out of last_str, if needed. It will be the
1188 last node in our expression list. */
1189 if (state->last_str) {
1190 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001191 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001192 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001193 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001194 }
1195 /* This has already been freed. */
1196 assert(state->last_str == NULL);
1197
1198 seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +01001199 if (!seq) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001200 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001201 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001202
1203 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1204 last_token->end_lineno, last_token->end_col_offset, p->arena);
1205
1206error:
1207 _PyPegen_FstringParser_Dealloc(state);
1208 return NULL;
1209}
1210
1211/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1212 at end, parse it into an expr_ty. Return NULL on error. Adjust
1213 str to point past the parsed portion. */
1214static expr_ty
1215fstring_parse(Parser *p, const char **str, const char *end, int raw,
1216 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1217{
1218 FstringParser state;
1219
1220 _PyPegen_FstringParser_Init(&state);
1221 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1222 first_token, t, last_token) < 0) {
1223 _PyPegen_FstringParser_Dealloc(&state);
1224 return NULL;
1225 }
1226
1227 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1228}