blob: 61e60446e26fa9ecb167e66bcd1c2a0dda4ac633 [file] [log] [blame]
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001#include <Python.h>
2
3#include "../tokenizer.h"
4#include "pegen.h"
5#include "parse_string.h"
6
7//// STRING HANDLING FUNCTIONS ////
8
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser
// nodes to pass around, and the usage of some specialized APIs present only in
// this file (like "_PyPegen_raise_syntax_error").
13
14static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030015warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010016{
17 PyObject *msg =
18 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
19 if (msg == NULL) {
20 return -1;
21 }
22 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030023 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010024 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
25 /* Replace the DeprecationWarning exception with a SyntaxError
26 to get a more accurate error report */
27 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030028
29 /* This is needed, in order for the SyntaxError to point to the token t,
30 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
31 error location, if p->known_err_token is not set. */
32 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010033 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
34 }
35 Py_DECREF(msg);
36 return -1;
37 }
38 Py_DECREF(msg);
39 return 0;
40}
41
42static PyObject *
43decode_utf8(const char **sPtr, const char *end)
44{
Pablo Galindo30b59fd2020-06-15 15:08:00 +010045 const char *s;
46 const char *t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010047 t = s = *sPtr;
48 while (s < end && (*s & 0x80)) {
49 s++;
50 }
51 *sPtr = s;
52 return PyUnicode_DecodeUTF8(t, s - t, NULL);
53}
54
55static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030056decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010057{
Pablo Galindo30b59fd2020-06-15 15:08:00 +010058 PyObject *v;
59 PyObject *u;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010060 char *buf;
61 char *p;
62 const char *end;
63
64 /* check for integer overflow */
65 if (len > SIZE_MAX / 6) {
66 return NULL;
67 }
68 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
69 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
70 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
71 if (u == NULL) {
72 return NULL;
73 }
74 p = buf = PyBytes_AsString(u);
75 end = s + len;
76 while (s < end) {
77 if (*s == '\\') {
78 *p++ = *s++;
79 if (s >= end || *s & 0x80) {
80 strcpy(p, "u005c");
81 p += 5;
82 if (s >= end) {
83 break;
84 }
85 }
86 }
87 if (*s & 0x80) {
88 PyObject *w;
89 int kind;
90 void *data;
Pablo Galindo30b59fd2020-06-15 15:08:00 +010091 Py_ssize_t w_len;
92 Py_ssize_t i;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010093 w = decode_utf8(&s, end);
94 if (w == NULL) {
95 Py_DECREF(u);
96 return NULL;
97 }
98 kind = PyUnicode_KIND(w);
99 data = PyUnicode_DATA(w);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100100 w_len = PyUnicode_GET_LENGTH(w);
101 for (i = 0; i < w_len; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100102 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
103 sprintf(p, "\\U%08x", chr);
104 p += 10;
105 }
106 /* Should be impossible to overflow */
107 assert(p - buf <= PyBytes_GET_SIZE(u));
108 Py_DECREF(w);
109 }
110 else {
111 *p++ = *s++;
112 }
113 }
114 len = p - buf;
115 s = buf;
116
117 const char *first_invalid_escape;
118 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
119
120 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300121 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100122 /* We have not decref u before because first_invalid_escape points
123 inside u. */
124 Py_XDECREF(u);
125 Py_DECREF(v);
126 return NULL;
127 }
128 }
129 Py_XDECREF(u);
130 return v;
131}
132
133static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300134decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100135{
136 const char *first_invalid_escape;
137 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
138 if (result == NULL) {
139 return NULL;
140 }
141
142 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300143 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100144 Py_DECREF(result);
145 return NULL;
146 }
147 }
148 return result;
149}
150
151/* s must include the bracketing quote characters, and r, b, u,
152 &/or f prefixes (if any), and embedded escape sequences (if any).
153 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
154 If the string is an f-string, set *fstr and *fstrlen to the unparsed
155 string object. Return 0 if no errors occurred. */
156int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300157_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
158 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100159{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300160 const char *s = PyBytes_AsString(t->bytes);
161 if (s == NULL) {
162 return -1;
163 }
164
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100165 size_t len;
166 int quote = Py_CHARMASK(*s);
167 int fmode = 0;
168 *bytesmode = 0;
169 *rawmode = 0;
170 *result = NULL;
171 *fstr = NULL;
172 if (Py_ISALPHA(quote)) {
173 while (!*bytesmode || !*rawmode) {
174 if (quote == 'b' || quote == 'B') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100175 quote =(unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100176 *bytesmode = 1;
177 }
178 else if (quote == 'u' || quote == 'U') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100179 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100180 }
181 else if (quote == 'r' || quote == 'R') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100182 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100183 *rawmode = 1;
184 }
185 else if (quote == 'f' || quote == 'F') {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100186 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100187 fmode = 1;
188 }
189 else {
190 break;
191 }
192 }
193 }
194
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300195 /* fstrings are only allowed in Python 3.6 and greater */
196 if (fmode && p->feature_version < 6) {
197 p->error_indicator = 1;
198 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
199 return -1;
200 }
201
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100202 if (fmode && *bytesmode) {
203 PyErr_BadInternalCall();
204 return -1;
205 }
206 if (quote != '\'' && quote != '\"') {
207 PyErr_BadInternalCall();
208 return -1;
209 }
210 /* Skip the leading quote char. */
211 s++;
212 len = strlen(s);
213 if (len > INT_MAX) {
214 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
215 return -1;
216 }
217 if (s[--len] != quote) {
218 /* Last quote char must match the first. */
219 PyErr_BadInternalCall();
220 return -1;
221 }
222 if (len >= 4 && s[0] == quote && s[1] == quote) {
223 /* A triple quoted string. We've already skipped one quote at
224 the start and one at the end of the string. Now skip the
225 two at the start. */
226 s += 2;
227 len -= 2;
228 /* And check that the last two match. */
229 if (s[--len] != quote || s[--len] != quote) {
230 PyErr_BadInternalCall();
231 return -1;
232 }
233 }
234
235 if (fmode) {
236 /* Just return the bytes. The caller will parse the resulting
237 string. */
238 *fstr = s;
239 *fstrlen = len;
240 return 0;
241 }
242
243 /* Not an f-string. */
244 /* Avoid invoking escape decoding routines if possible. */
245 *rawmode = *rawmode || strchr(s, '\\') == NULL;
246 if (*bytesmode) {
247 /* Disallow non-ASCII characters. */
248 const char *ch;
249 for (ch = s; *ch; ch++) {
250 if (Py_CHARMASK(*ch) >= 0x80) {
251 RAISE_SYNTAX_ERROR(
252 "bytes can only contain ASCII "
253 "literal characters.");
254 return -1;
255 }
256 }
257 if (*rawmode) {
258 *result = PyBytes_FromStringAndSize(s, len);
259 }
260 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300261 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100262 }
263 }
264 else {
265 if (*rawmode) {
266 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
267 }
268 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300269 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100270 }
271 }
272 return *result == NULL ? -1 : 0;
273}
274
275
276
277// FSTRING STUFF
278
279static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset);
280static void fstring_shift_argument(expr_ty parent, arg_ty args, int lineno, int col_offset);
281
282
283static inline void shift_expr(expr_ty parent, expr_ty n, int line, int col) {
Miss Islington (bot)64409112020-06-07 18:08:53 -0700284 if (n == NULL) {
285 return;
286 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100287 if (parent->lineno < n->lineno) {
288 col = 0;
289 }
290 fstring_shift_expr_locations(n, line, col);
291}
292
293static inline void shift_arg(expr_ty parent, arg_ty n, int line, int col) {
294 if (parent->lineno < n->lineno) {
295 col = 0;
296 }
297 fstring_shift_argument(parent, n, line, col);
298}
299
300static void fstring_shift_seq_locations(expr_ty parent, asdl_seq *seq, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100301 for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100302 expr_ty expr = asdl_seq_GET(seq, i);
303 if (expr == NULL){
304 continue;
305 }
306 shift_expr(parent, expr, lineno, col_offset);
307 }
308}
309
310static void fstring_shift_slice_locations(expr_ty parent, expr_ty slice, int lineno, int col_offset) {
311 switch (slice->kind) {
312 case Slice_kind:
313 if (slice->v.Slice.lower) {
314 shift_expr(parent, slice->v.Slice.lower, lineno, col_offset);
315 }
316 if (slice->v.Slice.upper) {
317 shift_expr(parent, slice->v.Slice.upper, lineno, col_offset);
318 }
319 if (slice->v.Slice.step) {
320 shift_expr(parent, slice->v.Slice.step, lineno, col_offset);
321 }
322 break;
323 case Tuple_kind:
324 fstring_shift_seq_locations(parent, slice->v.Tuple.elts, lineno, col_offset);
325 break;
326 default:
327 break;
328 }
329}
330
331static void fstring_shift_comprehension(expr_ty parent, comprehension_ty comp, int lineno, int col_offset) {
332 shift_expr(parent, comp->target, lineno, col_offset);
333 shift_expr(parent, comp->iter, lineno, col_offset);
334 fstring_shift_seq_locations(parent, comp->ifs, lineno, col_offset);
335}
336
337static void fstring_shift_argument(expr_ty parent, arg_ty arg, int lineno, int col_offset) {
338 if (arg->annotation != NULL){
339 shift_expr(parent, arg->annotation, lineno, col_offset);
340 }
341 arg->col_offset = arg->col_offset + col_offset;
342 arg->end_col_offset = arg->end_col_offset + col_offset;
343 arg->lineno = arg->lineno + lineno;
344 arg->end_lineno = arg->end_lineno + lineno;
345}
346
347static void fstring_shift_arguments(expr_ty parent, arguments_ty args, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100348 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->posonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100349 arg_ty arg = asdl_seq_GET(args->posonlyargs, i);
350 shift_arg(parent, arg, lineno, col_offset);
351 }
352
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100353 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->args); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100354 arg_ty arg = asdl_seq_GET(args->args, i);
355 shift_arg(parent, arg, lineno, col_offset);
356 }
357
358 if (args->vararg != NULL) {
359 shift_arg(parent, args->vararg, lineno, col_offset);
360 }
361
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100362 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->kwonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100363 arg_ty arg = asdl_seq_GET(args->kwonlyargs, i);
364 shift_arg(parent, arg, lineno, col_offset);
365 }
366
367 fstring_shift_seq_locations(parent, args->kw_defaults, lineno, col_offset);
368
369 if (args->kwarg != NULL) {
370 shift_arg(parent, args->kwarg, lineno, col_offset);
371 }
372
373 fstring_shift_seq_locations(parent, args->defaults, lineno, col_offset);
374}
375
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100376static void fstring_shift_children_locations(expr_ty node, int lineno, int col_offset) {
377 switch (node->kind) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100378 case BoolOp_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100379 fstring_shift_seq_locations(node, node->v.BoolOp.values, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100380 break;
381 case NamedExpr_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100382 shift_expr(node, node->v.NamedExpr.target, lineno, col_offset);
383 shift_expr(node, node->v.NamedExpr.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100384 break;
385 case BinOp_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100386 shift_expr(node, node->v.BinOp.left, lineno, col_offset);
387 shift_expr(node, node->v.BinOp.right, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100388 break;
389 case UnaryOp_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100390 shift_expr(node, node->v.UnaryOp.operand, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100391 break;
392 case Lambda_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100393 fstring_shift_arguments(node, node->v.Lambda.args, lineno, col_offset);
394 shift_expr(node, node->v.Lambda.body, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100395 break;
396 case IfExp_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100397 shift_expr(node, node->v.IfExp.test, lineno, col_offset);
398 shift_expr(node, node->v.IfExp.body, lineno, col_offset);
399 shift_expr(node, node->v.IfExp.orelse, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100400 break;
401 case Dict_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100402 fstring_shift_seq_locations(node, node->v.Dict.keys, lineno, col_offset);
403 fstring_shift_seq_locations(node, node->v.Dict.values, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100404 break;
405 case Set_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100406 fstring_shift_seq_locations(node, node->v.Set.elts, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100407 break;
408 case ListComp_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100409 shift_expr(node, node->v.ListComp.elt, lineno, col_offset);
410 for (Py_ssize_t i = 0, l = asdl_seq_LEN(node->v.ListComp.generators); i < l; i++) {
411 comprehension_ty comp = asdl_seq_GET(node->v.ListComp.generators, i);
412 fstring_shift_comprehension(node, comp, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100413 }
414 break;
415 case SetComp_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100416 shift_expr(node, node->v.SetComp.elt, lineno, col_offset);
417 for (Py_ssize_t i = 0, l = asdl_seq_LEN(node->v.SetComp.generators); i < l; i++) {
418 comprehension_ty comp = asdl_seq_GET(node->v.SetComp.generators, i);
419 fstring_shift_comprehension(node, comp, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100420 }
421 break;
422 case DictComp_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100423 shift_expr(node, node->v.DictComp.key, lineno, col_offset);
424 shift_expr(node, node->v.DictComp.value, lineno, col_offset);
425 for (Py_ssize_t i = 0, l = asdl_seq_LEN(node->v.DictComp.generators); i < l; i++) {
426 comprehension_ty comp = asdl_seq_GET(node->v.DictComp.generators, i);
427 fstring_shift_comprehension(node, comp, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100428 }
429 break;
430 case GeneratorExp_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100431 shift_expr(node, node->v.GeneratorExp.elt, lineno, col_offset);
432 for (Py_ssize_t i = 0, l = asdl_seq_LEN(node->v.GeneratorExp.generators); i < l; i++) {
433 comprehension_ty comp = asdl_seq_GET(node->v.GeneratorExp.generators, i);
434 fstring_shift_comprehension(node, comp, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100435 }
436 break;
437 case Await_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100438 shift_expr(node, node->v.Await.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100439 break;
440 case Yield_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100441 shift_expr(node, node->v.Yield.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100442 break;
443 case YieldFrom_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100444 shift_expr(node, node->v.YieldFrom.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100445 break;
446 case Compare_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100447 shift_expr(node, node->v.Compare.left, lineno, col_offset);
448 fstring_shift_seq_locations(node, node->v.Compare.comparators, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100449 break;
450 case Call_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100451 shift_expr(node, node->v.Call.func, lineno, col_offset);
452 fstring_shift_seq_locations(node, node->v.Call.args, lineno, col_offset);
453 for (Py_ssize_t i = 0, l = asdl_seq_LEN(node->v.Call.keywords); i < l; i++) {
454 keyword_ty keyword = asdl_seq_GET(node->v.Call.keywords, i);
455 shift_expr(node, keyword->value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100456 }
457 break;
458 case Attribute_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100459 shift_expr(node, node->v.Attribute.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100460 break;
461 case Subscript_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100462 shift_expr(node, node->v.Subscript.value, lineno, col_offset);
463 fstring_shift_slice_locations(node, node->v.Subscript.slice, lineno, col_offset);
464 shift_expr(node, node->v.Subscript.slice, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100465 break;
466 case Starred_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100467 shift_expr(node, node->v.Starred.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100468 break;
469 case List_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100470 fstring_shift_seq_locations(node, node->v.List.elts, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100471 break;
472 case Tuple_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100473 fstring_shift_seq_locations(node, node->v.Tuple.elts, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100474 break;
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300475 case JoinedStr_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100476 fstring_shift_seq_locations(node, node->v.JoinedStr.values, lineno, col_offset);
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300477 break;
478 case FormattedValue_kind:
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100479 shift_expr(node, node->v.FormattedValue.value, lineno, col_offset);
480 if (node->v.FormattedValue.format_spec) {
481 shift_expr(node, node->v.FormattedValue.format_spec, lineno, col_offset);
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300482 }
483 break;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100484 default:
485 return;
486 }
487}
488
489/* Shift locations for the given node and all its children by adding `lineno`
490 and `col_offset` to existing locations. Note that n is the already parsed
491 expression. */
492static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset)
493{
494 n->col_offset = n->col_offset + col_offset;
495
496 // The following is needed, in order for nodes spanning across multiple lines
497 // to be shifted correctly. An example of such a node is a Call node, the closing
498 // parenthesis of which is not on the same line as its name.
499 if (n->lineno == n->end_lineno) {
500 n->end_col_offset = n->end_col_offset + col_offset;
501 }
502
503 fstring_shift_children_locations(n, lineno, col_offset);
504 n->lineno = n->lineno + lineno;
505 n->end_lineno = n->end_lineno + lineno;
506}
507
508/* Fix locations for the given node and its children.
509
510 `parent` is the enclosing node.
511 `n` is the node which locations are going to be fixed relative to parent.
512 `expr_str` is the child node's string representation, including braces.
513*/
514static void
515fstring_fix_expr_location(Token *parent, expr_ty n, char *expr_str)
516{
517 char *substr = NULL;
518 char *start;
519 int lines = 0;
520 int cols = 0;
521
522 if (parent && parent->bytes) {
523 char *parent_str = PyBytes_AsString(parent->bytes);
524 if (!parent_str) {
525 return;
526 }
527 substr = strstr(parent_str, expr_str);
528 if (substr) {
529 // The following is needed, in order to correctly shift the column
530 // offset, in the case that (disregarding any whitespace) a newline
531 // immediately follows the opening curly brace of the fstring expression.
532 int newline_after_brace = 1;
533 start = substr + 1;
534 while (start && *start != '}' && *start != '\n') {
535 if (*start != ' ' && *start != '\t' && *start != '\f') {
536 newline_after_brace = 0;
537 break;
538 }
539 start++;
540 }
541
542 // Account for the characters from the last newline character to our
543 // left until the beginning of substr.
544 if (!newline_after_brace) {
545 start = substr;
546 while (start > parent_str && *start != '\n') {
547 start--;
548 }
549 cols += (int)(substr - start);
550 }
551 /* adjust the start based on the number of newlines encountered
552 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100553 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100554 if (*p == '\n') {
555 lines++;
556 }
557 }
558 }
559 }
560 fstring_shift_expr_locations(n, lines, cols);
561}
562
563
564/* Compile this expression in to an expr_ty. Add parens around the
565 expression, in order to allow leading spaces in the expression. */
566static expr_ty
567fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
568 Token *t)
569{
570 expr_ty expr = NULL;
571 char *str;
572 Py_ssize_t len;
573 const char *s;
574 expr_ty result = NULL;
575
576 assert(expr_end >= expr_start);
577 assert(*(expr_start-1) == '{');
578 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
579 *expr_end == '=');
580
581 /* If the substring is all whitespace, it's an error. We need to catch this
582 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
583 because turning the expression '' in to '()' would go from being invalid
584 to valid. */
585 for (s = expr_start; s != expr_end; s++) {
586 char c = *s;
587 /* The Python parser ignores only the following whitespace
588 characters (\r already is converted to \n). */
589 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
590 break;
591 }
592 }
593 if (s == expr_end) {
594 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
595 return NULL;
596 }
597
598 len = expr_end - expr_start;
599 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300600 str = PyMem_Malloc(len + 3);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100601 if (str == NULL) {
602 PyErr_NoMemory();
603 return NULL;
604 }
605
606 str[0] = '(';
607 memcpy(str+1, expr_start, len);
608 str[len+1] = ')';
609 str[len+2] = 0;
610
611 struct tok_state* tok = PyTokenizer_FromString(str, 1);
612 if (tok == NULL) {
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300613 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100614 return NULL;
615 }
Lysandros Nikolaou791a46e2020-05-26 04:24:31 +0300616 Py_INCREF(p->tok->filename);
617 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100618
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300619 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
620 NULL, p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100621 p2->starting_lineno = p->starting_lineno + p->tok->first_lineno - 1;
622 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno
623 ? p->starting_col_offset + t->col_offset : 0;
624
625 expr = _PyPegen_run_parser(p2);
626
627 if (expr == NULL) {
628 goto exit;
629 }
630
631 /* Reuse str to find the correct column offset. */
632 str[0] = '{';
633 str[len+1] = '}';
634 fstring_fix_expr_location(t, expr, str);
635
636 result = expr;
637
638exit:
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +0300639 PyMem_Free(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100640 _PyPegen_Parser_Free(p2);
641 PyTokenizer_Free(tok);
642 return result;
643}
644
645/* Return -1 on error.
646
647 Return 0 if we reached the end of the literal.
648
649 Return 1 if we haven't reached the end of the literal, but we want
650 the caller to process the literal up to this point. Used for
651 doubled braces.
652*/
653static int
654fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300655 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100656{
657 /* Get any literal string. It ends when we hit an un-doubled left
658 brace (which isn't part of a unicode name escape such as
659 "\N{EULER CONSTANT}"), or the end of the string. */
660
661 const char *s = *str;
662 const char *literal_start = s;
663 int result = 0;
664
665 assert(*literal == NULL);
666 while (s < end) {
667 char ch = *s++;
668 if (!raw && ch == '\\' && s < end) {
669 ch = *s++;
670 if (ch == 'N') {
671 if (s < end && *s++ == '{') {
672 while (s < end && *s++ != '}') {
673 }
674 continue;
675 }
676 break;
677 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300678 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100679 return -1;
680 }
681 }
682 if (ch == '{' || ch == '}') {
683 /* Check for doubled braces, but only at the top level. If
684 we checked at every level, then f'{0:{3}}' would fail
685 with the two closing braces. */
686 if (recurse_lvl == 0) {
687 if (s < end && *s == ch) {
688 /* We're going to tell the caller that the literal ends
689 here, but that they should continue scanning. But also
690 skip over the second brace when we resume scanning. */
691 *str = s + 1;
692 result = 1;
693 goto done;
694 }
695
696 /* Where a single '{' is the start of a new expression, a
697 single '}' is not allowed. */
698 if (ch == '}') {
699 *str = s - 1;
700 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
701 return -1;
702 }
703 }
704 /* We're either at a '{', which means we're starting another
705 expression; or a '}', which means we're at the end of this
706 f-string (for a nested format_spec). */
707 s--;
708 break;
709 }
710 }
711 *str = s;
712 assert(s <= end);
713 assert(s == end || *s == '{' || *s == '}');
714done:
715 if (literal_start != s) {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100716 if (raw) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100717 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
718 s - literal_start,
719 NULL, NULL);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100720 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100721 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300722 s - literal_start, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100723 }
724 if (!*literal) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100725 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100726 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100727 }
728 return result;
729}
730
731/* Forward declaration because parsing is recursive. */
732static expr_ty
733fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
734 Token *first_token, Token* t, Token *last_token);
735
736/* Parse the f-string at *str, ending at end. We know *str starts an
737 expression (so it must be a '{'). Returns the FormattedValue node, which
738 includes the expression, conversion character, format_spec expression, and
739 optionally the text of the expression (if = is used).
740
741 Note that I don't do a perfect job here: I don't make sure that a
742 closing brace doesn't match an opening paren, for example. It
743 doesn't need to error on all invalid expressions, just correctly
744 find the end of all valid ones. Any errors inside the expression
745 will be caught when we parse it later.
746
747 *expression is set to the expression. For an '=' "debug" expression,
748 *expr_text is set to the debug text (the original text of the expression,
749 including the '=' and any whitespace around it, as a string object). If
750 not a debug expression, *expr_text set to NULL. */
751static int
752fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
753 PyObject **expr_text, expr_ty *expression, Token *first_token,
754 Token *t, Token *last_token)
755{
756 /* Return -1 on error, else 0. */
757
758 const char *expr_start;
759 const char *expr_end;
760 expr_ty simple_expression;
761 expr_ty format_spec = NULL; /* Optional format specifier. */
762 int conversion = -1; /* The conversion char. Use default if not
763 specified, or !r if using = and no format
764 spec. */
765
766 /* 0 if we're not in a string, else the quote char we're trying to
767 match (single or double quote). */
768 char quote_char = 0;
769
770 /* If we're inside a string, 1=normal, 3=triple-quoted. */
771 int string_type = 0;
772
773 /* Keep track of nesting level for braces/parens/brackets in
774 expressions. */
775 Py_ssize_t nested_depth = 0;
776 char parenstack[MAXLEVEL];
777
778 *expr_text = NULL;
779
780 /* Can only nest one level deep. */
781 if (recurse_lvl >= 2) {
782 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
783 goto error;
784 }
785
786 /* The first char must be a left brace, or we wouldn't have gotten
787 here. Skip over it. */
788 assert(**str == '{');
789 *str += 1;
790
791 expr_start = *str;
792 for (; *str < end; (*str)++) {
793 char ch;
794
795 /* Loop invariants. */
796 assert(nested_depth >= 0);
797 assert(*str >= expr_start && *str < end);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100798 if (quote_char) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100799 assert(string_type == 1 || string_type == 3);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100800 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100801 assert(string_type == 0);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100802 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100803
804 ch = **str;
805 /* Nowhere inside an expression is a backslash allowed. */
806 if (ch == '\\') {
807 /* Error: can't include a backslash character, inside
808 parens or strings or not. */
809 RAISE_SYNTAX_ERROR(
810 "f-string expression part "
811 "cannot include a backslash");
812 goto error;
813 }
814 if (quote_char) {
815 /* We're inside a string. See if we're at the end. */
816 /* This code needs to implement the same non-error logic
817 as tok_get from tokenizer.c, at the letter_quote
818 label. To actually share that code would be a
819 nightmare. But, it's unlikely to change and is small,
820 so duplicate it here. Note we don't need to catch all
821 of the errors, since they'll be caught when parsing the
822 expression. We just need to match the non-error
823 cases. Thus we can ignore \n in single-quoted strings,
824 for example. Or non-terminated strings. */
825 if (ch == quote_char) {
826 /* Does this match the string_type (single or triple
827 quoted)? */
828 if (string_type == 3) {
829 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
830 /* We're at the end of a triple quoted string. */
831 *str += 2;
832 string_type = 0;
833 quote_char = 0;
834 continue;
835 }
836 } else {
837 /* We're at the end of a normal string. */
838 quote_char = 0;
839 string_type = 0;
840 continue;
841 }
842 }
843 } else if (ch == '\'' || ch == '"') {
844 /* Is this a triple quoted string? */
845 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
846 string_type = 3;
847 *str += 2;
848 } else {
849 /* Start of a normal string. */
850 string_type = 1;
851 }
852 /* Start looking for the end of the string. */
853 quote_char = ch;
854 } else if (ch == '[' || ch == '{' || ch == '(') {
855 if (nested_depth >= MAXLEVEL) {
856 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
857 goto error;
858 }
859 parenstack[nested_depth] = ch;
860 nested_depth++;
861 } else if (ch == '#') {
862 /* Error: can't include a comment character, inside parens
863 or not. */
864 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
865 goto error;
866 } else if (nested_depth == 0 &&
867 (ch == '!' || ch == ':' || ch == '}' ||
868 ch == '=' || ch == '>' || ch == '<')) {
869 /* See if there's a next character. */
870 if (*str+1 < end) {
871 char next = *(*str+1);
872
873 /* For "!=". since '=' is not an allowed conversion character,
874 nothing is lost in this test. */
875 if ((ch == '!' && next == '=') || /* != */
876 (ch == '=' && next == '=') || /* == */
877 (ch == '<' && next == '=') || /* <= */
878 (ch == '>' && next == '=') /* >= */
879 ) {
880 *str += 1;
881 continue;
882 }
883 /* Don't get out of the loop for these, if they're single
884 chars (not part of 2-char tokens). If by themselves, they
885 don't end an expression (unlike say '!'). */
886 if (ch == '>' || ch == '<') {
887 continue;
888 }
889 }
890
891 /* Normal way out of this loop. */
892 break;
893 } else if (ch == ']' || ch == '}' || ch == ')') {
894 if (!nested_depth) {
895 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
896 goto error;
897 }
898 nested_depth--;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100899 int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100900 if (!((opening == '(' && ch == ')') ||
901 (opening == '[' && ch == ']') ||
902 (opening == '{' && ch == '}')))
903 {
904 RAISE_SYNTAX_ERROR(
905 "f-string: closing parenthesis '%c' "
906 "does not match opening parenthesis '%c'",
907 ch, opening);
908 goto error;
909 }
910 } else {
911 /* Just consume this char and loop around. */
912 }
913 }
914 expr_end = *str;
915 /* If we leave this loop in a string or with mismatched parens, we
916 don't care. We'll get a syntax error when compiling the
917 expression. But, we can produce a better error message, so
918 let's just do that.*/
919 if (quote_char) {
920 RAISE_SYNTAX_ERROR("f-string: unterminated string");
921 goto error;
922 }
923 if (nested_depth) {
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100924 int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100925 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
926 goto error;
927 }
928
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100929 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100930 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100931 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100932
933 /* Compile the expression as soon as possible, so we show errors
934 related to the expression before errors related to the
935 conversion or format_spec. */
936 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100937 if (!simple_expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100938 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100939 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100940
941 /* Check for =, which puts the text value of the expression in
942 expr_text. */
943 if (**str == '=') {
Pablo Galindo9b838292020-05-27 22:01:11 +0100944 if (p->feature_version < 8) {
945 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
946 "only supported in Python 3.8 and greater");
947 goto error;
948 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100949 *str += 1;
950
951 /* Skip over ASCII whitespace. No need to test for end of string
952 here, since we know there's at least a trailing quote somewhere
953 ahead. */
954 while (Py_ISSPACE(**str)) {
955 *str += 1;
956 }
957
958 /* Set *expr_text to the text of the expression. */
959 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
960 if (!*expr_text) {
961 goto error;
962 }
963 }
964
965 /* Check for a conversion char, if present. */
966 if (**str == '!') {
967 *str += 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100968 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100969 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100970 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100971
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100972 conversion = (unsigned char)**str;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100973 *str += 1;
974
975 /* Validate the conversion. */
976 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
977 RAISE_SYNTAX_ERROR(
978 "f-string: invalid conversion character: "
979 "expected 's', 'r', or 'a'");
980 goto error;
981 }
982
983 }
984
985 /* Check for the format spec, if present. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100986 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100987 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100988 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100989 if (**str == ':') {
990 *str += 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100991 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100992 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100993 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100994
995 /* Parse the format spec. */
996 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
997 first_token, t, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +0100998 if (!format_spec) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100999 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001000 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001001 }
1002
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001003 if (*str >= end || **str != '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001004 goto unexpected_end_of_string;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001005 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001006
1007 /* We're at a right brace. Consume it. */
1008 assert(*str < end);
1009 assert(**str == '}');
1010 *str += 1;
1011
1012 /* If we're in = mode (detected by non-NULL expr_text), and have no format
1013 spec and no explicit conversion, set the conversion to 'r'. */
1014 if (*expr_text && format_spec == NULL && conversion == -1) {
1015 conversion = 'r';
1016 }
1017
1018 /* And now create the FormattedValue node that represents this
1019 entire expression with the conversion and format spec. */
1020 //TODO: Fix this
1021 *expression = FormattedValue(simple_expression, conversion,
1022 format_spec, first_token->lineno,
1023 first_token->col_offset, last_token->end_lineno,
1024 last_token->end_col_offset, p->arena);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001025 if (!*expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001026 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001027 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001028
1029 return 0;
1030
1031unexpected_end_of_string:
1032 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1033 /* Falls through to error. */
1034
1035error:
1036 Py_XDECREF(*expr_text);
1037 return -1;
1038
1039}
1040
1041/* Return -1 on error.
1042
1043 Return 0 if we have a literal (possible zero length) and an
1044 expression (zero length if at the end of the string.
1045
1046 Return 1 if we have a literal, but no expression, and we want the
1047 caller to call us again. This is used to deal with doubled
1048 braces.
1049
1050 When called multiple times on the string 'a{{b{0}c', this function
1051 will return:
1052
1053 1. the literal 'a{' with no expression, and a return value
1054 of 1. Despite the fact that there's no expression, the return
1055 value of 1 means we're not finished yet.
1056
1057 2. the literal 'b' and the expression '0', with a return value of
1058 0. The fact that there's an expression means we're not finished.
1059
1060 3. literal 'c' with no expression and a return value of 0. The
1061 combination of the return value of 0 with no expression means
1062 we're finished.
1063*/
1064static int
1065fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
1066 int recurse_lvl, PyObject **literal,
1067 PyObject **expr_text, expr_ty *expression,
1068 Token *first_token, Token *t, Token *last_token)
1069{
1070 int result;
1071
1072 assert(*literal == NULL && *expression == NULL);
1073
1074 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +03001075 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001076 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001077 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001078 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001079
1080 assert(result == 0 || result == 1);
1081
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001082 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001083 /* We have a literal, but don't look at the expression. */
1084 return 1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001085 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001086
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001087 if (*str >= end || **str == '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001088 /* We're at the end of the string or the end of a nested
1089 f-string: no expression. The top-level error case where we
1090 expect to be at the end of the string but we're at a '}' is
1091 handled later. */
1092 return 0;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001093 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001094
1095 /* We must now be the start of an expression, on a '{'. */
1096 assert(**str == '{');
1097
1098 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001099 expression, first_token, t, last_token) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001100 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001101 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001102
1103 return 0;
1104
1105error:
1106 Py_CLEAR(*literal);
1107 return -1;
1108}
1109
1110#ifdef NDEBUG
1111#define ExprList_check_invariants(l)
1112#else
1113static void
1114ExprList_check_invariants(ExprList *l)
1115{
1116 /* Check our invariants. Make sure this object is "live", and
1117 hasn't been deallocated. */
1118 assert(l->size >= 0);
1119 assert(l->p != NULL);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001120 if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001121 assert(l->data == l->p);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001122 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001123}
1124#endif
1125
1126static void
1127ExprList_Init(ExprList *l)
1128{
1129 l->allocated = EXPRLIST_N_CACHED;
1130 l->size = 0;
1131
1132 /* Until we start allocating dynamically, p points to data. */
1133 l->p = l->data;
1134
1135 ExprList_check_invariants(l);
1136}
1137
1138static int
1139ExprList_Append(ExprList *l, expr_ty exp)
1140{
1141 ExprList_check_invariants(l);
1142 if (l->size >= l->allocated) {
1143 /* We need to alloc (or realloc) the memory. */
1144 Py_ssize_t new_size = l->allocated * 2;
1145
1146 /* See if we've ever allocated anything dynamically. */
1147 if (l->p == l->data) {
1148 Py_ssize_t i;
1149 /* We're still using the cached data. Switch to
1150 alloc-ing. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +03001151 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001152 if (!l->p) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001153 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001154 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001155 /* Copy the cached data into the new buffer. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001156 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001157 l->p[i] = l->data[i];
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001158 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001159 } else {
1160 /* Just realloc. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +03001161 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001162 if (!tmp) {
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +03001163 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001164 l->p = NULL;
1165 return -1;
1166 }
1167 l->p = tmp;
1168 }
1169
1170 l->allocated = new_size;
1171 assert(l->allocated == 2 * l->size);
1172 }
1173
1174 l->p[l->size++] = exp;
1175
1176 ExprList_check_invariants(l);
1177 return 0;
1178}
1179
1180static void
1181ExprList_Dealloc(ExprList *l)
1182{
1183 ExprList_check_invariants(l);
1184
1185 /* If there's been an error, or we've never dynamically allocated,
1186 do nothing. */
1187 if (!l->p || l->p == l->data) {
1188 /* Do nothing. */
1189 } else {
1190 /* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou5193d0a2020-06-27 21:35:18 +03001191 PyMem_Free(l->p);
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001192 }
1193 l->p = NULL;
1194 l->size = -1;
1195}
1196
1197static asdl_seq *
1198ExprList_Finish(ExprList *l, PyArena *arena)
1199{
1200 asdl_seq *seq;
1201
1202 ExprList_check_invariants(l);
1203
1204 /* Allocate the asdl_seq and copy the expressions in to it. */
1205 seq = _Py_asdl_seq_new(l->size, arena);
1206 if (seq) {
1207 Py_ssize_t i;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001208 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001209 asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001210 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001211 }
1212 ExprList_Dealloc(l);
1213 return seq;
1214}
1215
1216#ifdef NDEBUG
1217#define FstringParser_check_invariants(state)
1218#else
1219static void
1220FstringParser_check_invariants(FstringParser *state)
1221{
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001222 if (state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001223 assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001224 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001225 ExprList_check_invariants(&state->expr_list);
1226}
1227#endif
1228
1229void
1230_PyPegen_FstringParser_Init(FstringParser *state)
1231{
1232 state->last_str = NULL;
1233 state->fmode = 0;
1234 ExprList_Init(&state->expr_list);
1235 FstringParser_check_invariants(state);
1236}
1237
1238void
1239_PyPegen_FstringParser_Dealloc(FstringParser *state)
1240{
1241 FstringParser_check_invariants(state);
1242
1243 Py_XDECREF(state->last_str);
1244 ExprList_Dealloc(&state->expr_list);
1245}
1246
1247/* Make a Constant node, but decref the PyUnicode object being added. */
1248static expr_ty
1249make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1250{
1251 PyObject *s = *str;
1252 PyObject *kind = NULL;
1253 *str = NULL;
1254 assert(PyUnicode_CheckExact(s));
1255 if (PyArena_AddPyObject(p->arena, s) < 0) {
1256 Py_DECREF(s);
1257 return NULL;
1258 }
1259 const char* the_str = PyBytes_AsString(first_token->bytes);
1260 if (the_str && the_str[0] == 'u') {
1261 kind = _PyPegen_new_identifier(p, "u");
1262 }
1263
1264 if (kind == NULL && PyErr_Occurred()) {
1265 return NULL;
1266 }
1267
1268 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1269 last_token->end_lineno, last_token->end_col_offset, p->arena);
1270
1271}
1272
1273
1274/* Add a non-f-string (that is, a regular literal string). str is
1275 decref'd. */
1276int
1277_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1278{
1279 FstringParser_check_invariants(state);
1280
1281 assert(PyUnicode_CheckExact(str));
1282
1283 if (PyUnicode_GET_LENGTH(str) == 0) {
1284 Py_DECREF(str);
1285 return 0;
1286 }
1287
1288 if (!state->last_str) {
1289 /* We didn't have a string before, so just remember this one. */
1290 state->last_str = str;
1291 } else {
1292 /* Concatenate this with the previous string. */
1293 PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001294 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001295 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001296 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001297 }
1298 FstringParser_check_invariants(state);
1299 return 0;
1300}
1301
1302/* Parse an f-string. The f-string is in *str to end, with no
1303 'f' or quotes. */
1304int
1305_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1306 const char *end, int raw, int recurse_lvl,
1307 Token *first_token, Token* t, Token *last_token)
1308{
1309 FstringParser_check_invariants(state);
1310 state->fmode = 1;
1311
1312 /* Parse the f-string. */
1313 while (1) {
1314 PyObject *literal = NULL;
1315 PyObject *expr_text = NULL;
1316 expr_ty expression = NULL;
1317
1318 /* If there's a zero length literal in front of the
1319 expression, literal will be NULL. If we're at the end of
1320 the f-string, expression will be NULL (unless result == 1,
1321 see below). */
1322 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1323 &literal, &expr_text,
1324 &expression, first_token, t, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001325 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001326 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001327 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001328
1329 /* Add the literal, if any. */
1330 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1331 Py_XDECREF(expr_text);
1332 return -1;
1333 }
1334 /* Add the expr_text, if any. */
1335 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1336 return -1;
1337 }
1338
1339 /* We've dealt with the literal and expr_text, their ownership has
1340 been transferred to the state object. Don't look at them again. */
1341
1342 /* See if we should just loop around to get the next literal
1343 and expression, while ignoring the expression this
1344 time. This is used for un-doubling braces, as an
1345 optimization. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001346 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001347 continue;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001348 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001349
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001350 if (!expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001351 /* We're done with this f-string. */
1352 break;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001353 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001354
1355 /* We know we have an expression. Convert any existing string
1356 to a Constant node. */
1357 if (!state->last_str) {
1358 /* Do nothing. No previous literal. */
1359 } else {
1360 /* Convert the existing last_str literal to a Constant node. */
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001361 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1362 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001363 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001364 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001365 }
1366
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001367 if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001368 return -1;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001369 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001370 }
1371
1372 /* If recurse_lvl is zero, then we must be at the end of the
1373 string. Otherwise, we must be at a right brace. */
1374
1375 if (recurse_lvl == 0 && *str < end-1) {
1376 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1377 return -1;
1378 }
1379 if (recurse_lvl != 0 && **str != '}') {
1380 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1381 return -1;
1382 }
1383
1384 FstringParser_check_invariants(state);
1385 return 0;
1386}
1387
1388/* Convert the partial state reflected in last_str and expr_list to an
1389 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1390expr_ty
1391_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1392 Token *last_token)
1393{
1394 asdl_seq *seq;
1395
1396 FstringParser_check_invariants(state);
1397
1398 /* If we're just a constant string with no expressions, return
1399 that. */
1400 if (!state->fmode) {
1401 assert(!state->expr_list.size);
1402 if (!state->last_str) {
1403 /* Create a zero length string. */
1404 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001405 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001406 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001407 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001408 }
1409 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1410 }
1411
1412 /* Create a Constant node out of last_str, if needed. It will be the
1413 last node in our expression list. */
1414 if (state->last_str) {
1415 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001416 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001417 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001418 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001419 }
1420 /* This has already been freed. */
1421 assert(state->last_str == NULL);
1422
1423 seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001424 if (!seq) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001425 goto error;
Pablo Galindo30b59fd2020-06-15 15:08:00 +01001426 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001427
1428 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1429 last_token->end_lineno, last_token->end_col_offset, p->arena);
1430
1431error:
1432 _PyPegen_FstringParser_Dealloc(state);
1433 return NULL;
1434}
1435
1436/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1437 at end, parse it into an expr_ty. Return NULL on error. Adjust
1438 str to point past the parsed portion. */
1439static expr_ty
1440fstring_parse(Parser *p, const char **str, const char *end, int raw,
1441 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1442{
1443 FstringParser state;
1444
1445 _PyPegen_FstringParser_Init(&state);
1446 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1447 first_token, t, last_token) < 0) {
1448 _PyPegen_FstringParser_Dealloc(&state);
1449 return NULL;
1450 }
1451
1452 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1453}