blob: f8e2427276cd3c593b67a22260cb2565ece497ff [file] [log] [blame]
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001#include <Python.h>
2
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01003#include "tokenizer.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01004#include "pegen.h"
Pablo Galindo1ed83ad2020-06-11 17:30:46 +01005#include "string_parser.h"
Pablo Galindoc5fc1562020-04-22 23:29:27 +01006
//// STRING HANDLING FUNCTIONS ////
8
Pablo Galindoc5fc1562020-04-22 23:29:27 +01009static int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030010warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010011{
12 PyObject *msg =
13 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
14 if (msg == NULL) {
15 return -1;
16 }
17 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030018 t->lineno, NULL, NULL) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010019 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
20 /* Replace the DeprecationWarning exception with a SyntaxError
21 to get a more accurate error report */
22 PyErr_Clear();
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030023
24 /* This is needed, in order for the SyntaxError to point to the token t,
25 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
26 error location, if p->known_err_token is not set. */
27 p->known_err_token = t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010028 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
29 }
30 Py_DECREF(msg);
31 return -1;
32 }
33 Py_DECREF(msg);
34 return 0;
35}
36
37static PyObject *
38decode_utf8(const char **sPtr, const char *end)
39{
Pablo Galindofb61c422020-06-15 14:23:43 +010040 const char *s;
41 const char *t;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010042 t = s = *sPtr;
43 while (s < end && (*s & 0x80)) {
44 s++;
45 }
46 *sPtr = s;
47 return PyUnicode_DecodeUTF8(t, s - t, NULL);
48}
49
50static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +030051decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +010052{
Pablo Galindofb61c422020-06-15 14:23:43 +010053 PyObject *v;
54 PyObject *u;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010055 char *buf;
56 char *p;
57 const char *end;
58
59 /* check for integer overflow */
60 if (len > SIZE_MAX / 6) {
61 return NULL;
62 }
63 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
64 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
65 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
66 if (u == NULL) {
67 return NULL;
68 }
69 p = buf = PyBytes_AsString(u);
70 end = s + len;
71 while (s < end) {
72 if (*s == '\\') {
73 *p++ = *s++;
74 if (s >= end || *s & 0x80) {
75 strcpy(p, "u005c");
76 p += 5;
77 if (s >= end) {
78 break;
79 }
80 }
81 }
82 if (*s & 0x80) {
83 PyObject *w;
84 int kind;
85 void *data;
Pablo Galindofb61c422020-06-15 14:23:43 +010086 Py_ssize_t w_len;
87 Py_ssize_t i;
Pablo Galindoc5fc1562020-04-22 23:29:27 +010088 w = decode_utf8(&s, end);
89 if (w == NULL) {
90 Py_DECREF(u);
91 return NULL;
92 }
93 kind = PyUnicode_KIND(w);
94 data = PyUnicode_DATA(w);
Pablo Galindofb61c422020-06-15 14:23:43 +010095 w_len = PyUnicode_GET_LENGTH(w);
96 for (i = 0; i < w_len; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +010097 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
98 sprintf(p, "\\U%08x", chr);
99 p += 10;
100 }
101 /* Should be impossible to overflow */
102 assert(p - buf <= PyBytes_GET_SIZE(u));
103 Py_DECREF(w);
104 }
105 else {
106 *p++ = *s++;
107 }
108 }
109 len = p - buf;
110 s = buf;
111
112 const char *first_invalid_escape;
113 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
114
115 if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300116 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100117 /* We have not decref u before because first_invalid_escape points
118 inside u. */
119 Py_XDECREF(u);
120 Py_DECREF(v);
121 return NULL;
122 }
123 }
124 Py_XDECREF(u);
125 return v;
126}
127
128static PyObject *
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300129decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100130{
131 const char *first_invalid_escape;
132 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
133 if (result == NULL) {
134 return NULL;
135 }
136
137 if (first_invalid_escape != NULL) {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300138 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100139 Py_DECREF(result);
140 return NULL;
141 }
142 }
143 return result;
144}
145
146/* s must include the bracketing quote characters, and r, b, u,
147 &/or f prefixes (if any), and embedded escape sequences (if any).
148 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
149 If the string is an f-string, set *fstr and *fstrlen to the unparsed
150 string object. Return 0 if no errors occurred. */
151int
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300152_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
153 const char **fstr, Py_ssize_t *fstrlen, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100154{
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300155 const char *s = PyBytes_AsString(t->bytes);
156 if (s == NULL) {
157 return -1;
158 }
159
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100160 size_t len;
161 int quote = Py_CHARMASK(*s);
162 int fmode = 0;
163 *bytesmode = 0;
164 *rawmode = 0;
165 *result = NULL;
166 *fstr = NULL;
167 if (Py_ISALPHA(quote)) {
168 while (!*bytesmode || !*rawmode) {
169 if (quote == 'b' || quote == 'B') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100170 quote =(unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100171 *bytesmode = 1;
172 }
173 else if (quote == 'u' || quote == 'U') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100174 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100175 }
176 else if (quote == 'r' || quote == 'R') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100177 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100178 *rawmode = 1;
179 }
180 else if (quote == 'f' || quote == 'F') {
Pablo Galindofb61c422020-06-15 14:23:43 +0100181 quote = (unsigned char)*++s;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100182 fmode = 1;
183 }
184 else {
185 break;
186 }
187 }
188 }
189
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300190 /* fstrings are only allowed in Python 3.6 and greater */
191 if (fmode && p->feature_version < 6) {
192 p->error_indicator = 1;
193 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
194 return -1;
195 }
196
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100197 if (fmode && *bytesmode) {
198 PyErr_BadInternalCall();
199 return -1;
200 }
201 if (quote != '\'' && quote != '\"') {
202 PyErr_BadInternalCall();
203 return -1;
204 }
205 /* Skip the leading quote char. */
206 s++;
207 len = strlen(s);
208 if (len > INT_MAX) {
209 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
210 return -1;
211 }
212 if (s[--len] != quote) {
213 /* Last quote char must match the first. */
214 PyErr_BadInternalCall();
215 return -1;
216 }
217 if (len >= 4 && s[0] == quote && s[1] == quote) {
218 /* A triple quoted string. We've already skipped one quote at
219 the start and one at the end of the string. Now skip the
220 two at the start. */
221 s += 2;
222 len -= 2;
223 /* And check that the last two match. */
224 if (s[--len] != quote || s[--len] != quote) {
225 PyErr_BadInternalCall();
226 return -1;
227 }
228 }
229
230 if (fmode) {
231 /* Just return the bytes. The caller will parse the resulting
232 string. */
233 *fstr = s;
234 *fstrlen = len;
235 return 0;
236 }
237
238 /* Not an f-string. */
239 /* Avoid invoking escape decoding routines if possible. */
240 *rawmode = *rawmode || strchr(s, '\\') == NULL;
241 if (*bytesmode) {
242 /* Disallow non-ASCII characters. */
243 const char *ch;
244 for (ch = s; *ch; ch++) {
245 if (Py_CHARMASK(*ch) >= 0x80) {
246 RAISE_SYNTAX_ERROR(
247 "bytes can only contain ASCII "
248 "literal characters.");
249 return -1;
250 }
251 }
252 if (*rawmode) {
253 *result = PyBytes_FromStringAndSize(s, len);
254 }
255 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300256 *result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100257 }
258 }
259 else {
260 if (*rawmode) {
261 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
262 }
263 else {
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300264 *result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100265 }
266 }
267 return *result == NULL ? -1 : 0;
268}
269
270
271
// FSTRING STUFF
273
274static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset);
275static void fstring_shift_argument(expr_ty parent, arg_ty args, int lineno, int col_offset);
276
277
278static inline void shift_expr(expr_ty parent, expr_ty n, int line, int col) {
Pablo Galindo972ab032020-06-08 01:47:37 +0100279 if (n == NULL) {
280 return;
281 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100282 if (parent->lineno < n->lineno) {
283 col = 0;
284 }
285 fstring_shift_expr_locations(n, line, col);
286}
287
288static inline void shift_arg(expr_ty parent, arg_ty n, int line, int col) {
289 if (parent->lineno < n->lineno) {
290 col = 0;
291 }
292 fstring_shift_argument(parent, n, line, col);
293}
294
295static void fstring_shift_seq_locations(expr_ty parent, asdl_seq *seq, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100296 for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100297 expr_ty expr = asdl_seq_GET(seq, i);
298 if (expr == NULL){
299 continue;
300 }
301 shift_expr(parent, expr, lineno, col_offset);
302 }
303}
304
305static void fstring_shift_slice_locations(expr_ty parent, expr_ty slice, int lineno, int col_offset) {
306 switch (slice->kind) {
307 case Slice_kind:
308 if (slice->v.Slice.lower) {
309 shift_expr(parent, slice->v.Slice.lower, lineno, col_offset);
310 }
311 if (slice->v.Slice.upper) {
312 shift_expr(parent, slice->v.Slice.upper, lineno, col_offset);
313 }
314 if (slice->v.Slice.step) {
315 shift_expr(parent, slice->v.Slice.step, lineno, col_offset);
316 }
317 break;
318 case Tuple_kind:
319 fstring_shift_seq_locations(parent, slice->v.Tuple.elts, lineno, col_offset);
320 break;
321 default:
322 break;
323 }
324}
325
326static void fstring_shift_comprehension(expr_ty parent, comprehension_ty comp, int lineno, int col_offset) {
327 shift_expr(parent, comp->target, lineno, col_offset);
328 shift_expr(parent, comp->iter, lineno, col_offset);
329 fstring_shift_seq_locations(parent, comp->ifs, lineno, col_offset);
330}
331
332static void fstring_shift_argument(expr_ty parent, arg_ty arg, int lineno, int col_offset) {
333 if (arg->annotation != NULL){
334 shift_expr(parent, arg->annotation, lineno, col_offset);
335 }
336 arg->col_offset = arg->col_offset + col_offset;
337 arg->end_col_offset = arg->end_col_offset + col_offset;
338 arg->lineno = arg->lineno + lineno;
339 arg->end_lineno = arg->end_lineno + lineno;
340}
341
342static void fstring_shift_arguments(expr_ty parent, arguments_ty args, int lineno, int col_offset) {
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100343 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->posonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100344 arg_ty arg = asdl_seq_GET(args->posonlyargs, i);
345 shift_arg(parent, arg, lineno, col_offset);
346 }
347
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100348 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->args); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100349 arg_ty arg = asdl_seq_GET(args->args, i);
350 shift_arg(parent, arg, lineno, col_offset);
351 }
352
353 if (args->vararg != NULL) {
354 shift_arg(parent, args->vararg, lineno, col_offset);
355 }
356
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100357 for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->kwonlyargs); i < l; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100358 arg_ty arg = asdl_seq_GET(args->kwonlyargs, i);
359 shift_arg(parent, arg, lineno, col_offset);
360 }
361
362 fstring_shift_seq_locations(parent, args->kw_defaults, lineno, col_offset);
363
364 if (args->kwarg != NULL) {
365 shift_arg(parent, args->kwarg, lineno, col_offset);
366 }
367
368 fstring_shift_seq_locations(parent, args->defaults, lineno, col_offset);
369}
370
Pablo Galindofb61c422020-06-15 14:23:43 +0100371static void fstring_shift_children_locations(expr_ty node, int lineno, int col_offset) {
372 switch (node->kind) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100373 case BoolOp_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100374 fstring_shift_seq_locations(node, node->v.BoolOp.values, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100375 break;
376 case NamedExpr_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100377 shift_expr(node, node->v.NamedExpr.target, lineno, col_offset);
378 shift_expr(node, node->v.NamedExpr.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100379 break;
380 case BinOp_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100381 shift_expr(node, node->v.BinOp.left, lineno, col_offset);
382 shift_expr(node, node->v.BinOp.right, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100383 break;
384 case UnaryOp_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100385 shift_expr(node, node->v.UnaryOp.operand, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100386 break;
387 case Lambda_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100388 fstring_shift_arguments(node, node->v.Lambda.args, lineno, col_offset);
389 shift_expr(node, node->v.Lambda.body, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100390 break;
391 case IfExp_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100392 shift_expr(node, node->v.IfExp.test, lineno, col_offset);
393 shift_expr(node, node->v.IfExp.body, lineno, col_offset);
394 shift_expr(node, node->v.IfExp.orelse, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100395 break;
396 case Dict_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100397 fstring_shift_seq_locations(node, node->v.Dict.keys, lineno, col_offset);
398 fstring_shift_seq_locations(node, node->v.Dict.values, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100399 break;
400 case Set_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100401 fstring_shift_seq_locations(node, node->v.Set.elts, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100402 break;
403 case ListComp_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100404 shift_expr(node, node->v.ListComp.elt, lineno, col_offset);
405 for (Py_ssize_t i = 0, l = asdl_seq_LEN(node->v.ListComp.generators); i < l; i++) {
406 comprehension_ty comp = asdl_seq_GET(node->v.ListComp.generators, i);
407 fstring_shift_comprehension(node, comp, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100408 }
409 break;
410 case SetComp_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100411 shift_expr(node, node->v.SetComp.elt, lineno, col_offset);
412 for (Py_ssize_t i = 0, l = asdl_seq_LEN(node->v.SetComp.generators); i < l; i++) {
413 comprehension_ty comp = asdl_seq_GET(node->v.SetComp.generators, i);
414 fstring_shift_comprehension(node, comp, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100415 }
416 break;
417 case DictComp_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100418 shift_expr(node, node->v.DictComp.key, lineno, col_offset);
419 shift_expr(node, node->v.DictComp.value, lineno, col_offset);
420 for (Py_ssize_t i = 0, l = asdl_seq_LEN(node->v.DictComp.generators); i < l; i++) {
421 comprehension_ty comp = asdl_seq_GET(node->v.DictComp.generators, i);
422 fstring_shift_comprehension(node, comp, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100423 }
424 break;
425 case GeneratorExp_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100426 shift_expr(node, node->v.GeneratorExp.elt, lineno, col_offset);
427 for (Py_ssize_t i = 0, l = asdl_seq_LEN(node->v.GeneratorExp.generators); i < l; i++) {
428 comprehension_ty comp = asdl_seq_GET(node->v.GeneratorExp.generators, i);
429 fstring_shift_comprehension(node, comp, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100430 }
431 break;
432 case Await_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100433 shift_expr(node, node->v.Await.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100434 break;
435 case Yield_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100436 shift_expr(node, node->v.Yield.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100437 break;
438 case YieldFrom_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100439 shift_expr(node, node->v.YieldFrom.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100440 break;
441 case Compare_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100442 shift_expr(node, node->v.Compare.left, lineno, col_offset);
443 fstring_shift_seq_locations(node, node->v.Compare.comparators, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100444 break;
445 case Call_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100446 shift_expr(node, node->v.Call.func, lineno, col_offset);
447 fstring_shift_seq_locations(node, node->v.Call.args, lineno, col_offset);
448 for (Py_ssize_t i = 0, l = asdl_seq_LEN(node->v.Call.keywords); i < l; i++) {
449 keyword_ty keyword = asdl_seq_GET(node->v.Call.keywords, i);
450 shift_expr(node, keyword->value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100451 }
452 break;
453 case Attribute_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100454 shift_expr(node, node->v.Attribute.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100455 break;
456 case Subscript_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100457 shift_expr(node, node->v.Subscript.value, lineno, col_offset);
458 fstring_shift_slice_locations(node, node->v.Subscript.slice, lineno, col_offset);
459 shift_expr(node, node->v.Subscript.slice, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100460 break;
461 case Starred_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100462 shift_expr(node, node->v.Starred.value, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100463 break;
464 case List_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100465 fstring_shift_seq_locations(node, node->v.List.elts, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100466 break;
467 case Tuple_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100468 fstring_shift_seq_locations(node, node->v.Tuple.elts, lineno, col_offset);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100469 break;
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300470 case JoinedStr_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100471 fstring_shift_seq_locations(node, node->v.JoinedStr.values, lineno, col_offset);
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300472 break;
473 case FormattedValue_kind:
Pablo Galindofb61c422020-06-15 14:23:43 +0100474 shift_expr(node, node->v.FormattedValue.value, lineno, col_offset);
475 if (node->v.FormattedValue.format_spec) {
476 shift_expr(node, node->v.FormattedValue.format_spec, lineno, col_offset);
Lysandros Nikolaou37af21b2020-04-29 03:43:50 +0300477 }
478 break;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100479 default:
480 return;
481 }
482}
483
484/* Shift locations for the given node and all its children by adding `lineno`
485 and `col_offset` to existing locations. Note that n is the already parsed
486 expression. */
487static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset)
488{
489 n->col_offset = n->col_offset + col_offset;
490
491 // The following is needed, in order for nodes spanning across multiple lines
492 // to be shifted correctly. An example of such a node is a Call node, the closing
493 // parenthesis of which is not on the same line as its name.
494 if (n->lineno == n->end_lineno) {
495 n->end_col_offset = n->end_col_offset + col_offset;
496 }
497
498 fstring_shift_children_locations(n, lineno, col_offset);
499 n->lineno = n->lineno + lineno;
500 n->end_lineno = n->end_lineno + lineno;
501}
502
503/* Fix locations for the given node and its children.
504
505 `parent` is the enclosing node.
506 `n` is the node which locations are going to be fixed relative to parent.
507 `expr_str` is the child node's string representation, including braces.
508*/
509static void
510fstring_fix_expr_location(Token *parent, expr_ty n, char *expr_str)
511{
512 char *substr = NULL;
513 char *start;
514 int lines = 0;
515 int cols = 0;
516
517 if (parent && parent->bytes) {
518 char *parent_str = PyBytes_AsString(parent->bytes);
519 if (!parent_str) {
520 return;
521 }
522 substr = strstr(parent_str, expr_str);
523 if (substr) {
524 // The following is needed, in order to correctly shift the column
525 // offset, in the case that (disregarding any whitespace) a newline
526 // immediately follows the opening curly brace of the fstring expression.
527 int newline_after_brace = 1;
528 start = substr + 1;
529 while (start && *start != '}' && *start != '\n') {
530 if (*start != ' ' && *start != '\t' && *start != '\f') {
531 newline_after_brace = 0;
532 break;
533 }
534 start++;
535 }
536
537 // Account for the characters from the last newline character to our
538 // left until the beginning of substr.
539 if (!newline_after_brace) {
540 start = substr;
541 while (start > parent_str && *start != '\n') {
542 start--;
543 }
544 cols += (int)(substr - start);
545 }
546 /* adjust the start based on the number of newlines encountered
547 before the f-string expression */
Pablo Galindo0b7829e2020-04-23 03:24:25 +0100548 for (char* p = parent_str; p < substr; p++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100549 if (*p == '\n') {
550 lines++;
551 }
552 }
553 }
554 }
555 fstring_shift_expr_locations(n, lines, cols);
556}
557
558
559/* Compile this expression in to an expr_ty. Add parens around the
560 expression, in order to allow leading spaces in the expression. */
561static expr_ty
562fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
563 Token *t)
564{
565 expr_ty expr = NULL;
566 char *str;
567 Py_ssize_t len;
568 const char *s;
569 expr_ty result = NULL;
570
571 assert(expr_end >= expr_start);
572 assert(*(expr_start-1) == '{');
573 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
574 *expr_end == '=');
575
576 /* If the substring is all whitespace, it's an error. We need to catch this
577 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
578 because turning the expression '' in to '()' would go from being invalid
579 to valid. */
580 for (s = expr_start; s != expr_end; s++) {
581 char c = *s;
582 /* The Python parser ignores only the following whitespace
583 characters (\r already is converted to \n). */
584 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
585 break;
586 }
587 }
588 if (s == expr_end) {
589 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
590 return NULL;
591 }
592
593 len = expr_end - expr_start;
594 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
595 str = PyMem_RawMalloc(len + 3);
596 if (str == NULL) {
597 PyErr_NoMemory();
598 return NULL;
599 }
600
601 str[0] = '(';
602 memcpy(str+1, expr_start, len);
603 str[len+1] = ')';
604 str[len+2] = 0;
605
606 struct tok_state* tok = PyTokenizer_FromString(str, 1);
607 if (tok == NULL) {
Pablo Galindoa54096e2020-06-06 00:52:15 +0100608 PyMem_RawFree(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100609 return NULL;
610 }
Lysandros Nikolaouf7b1e462020-05-26 03:32:18 +0300611 Py_INCREF(p->tok->filename);
612 tok->filename = p->tok->filename;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100613
Lysandros Nikolaou3e0a6f32020-05-01 06:27:52 +0300614 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
615 NULL, p->arena);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100616 p2->starting_lineno = p->starting_lineno + p->tok->first_lineno - 1;
617 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno
618 ? p->starting_col_offset + t->col_offset : 0;
619
620 expr = _PyPegen_run_parser(p2);
621
622 if (expr == NULL) {
623 goto exit;
624 }
625
626 /* Reuse str to find the correct column offset. */
627 str[0] = '{';
628 str[len+1] = '}';
629 fstring_fix_expr_location(t, expr, str);
630
631 result = expr;
632
633exit:
Pablo Galindoa54096e2020-06-06 00:52:15 +0100634 PyMem_RawFree(str);
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100635 _PyPegen_Parser_Free(p2);
636 PyTokenizer_Free(tok);
637 return result;
638}
639
640/* Return -1 on error.
641
642 Return 0 if we reached the end of the literal.
643
644 Return 1 if we haven't reached the end of the literal, but we want
645 the caller to process the literal up to this point. Used for
646 doubled braces.
647*/
648static int
649fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300650 PyObject **literal, int recurse_lvl, Token *t)
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100651{
652 /* Get any literal string. It ends when we hit an un-doubled left
653 brace (which isn't part of a unicode name escape such as
654 "\N{EULER CONSTANT}"), or the end of the string. */
655
656 const char *s = *str;
657 const char *literal_start = s;
658 int result = 0;
659
660 assert(*literal == NULL);
661 while (s < end) {
662 char ch = *s++;
663 if (!raw && ch == '\\' && s < end) {
664 ch = *s++;
665 if (ch == 'N') {
666 if (s < end && *s++ == '{') {
667 while (s < end && *s++ != '}') {
668 }
669 continue;
670 }
671 break;
672 }
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300673 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100674 return -1;
675 }
676 }
677 if (ch == '{' || ch == '}') {
678 /* Check for doubled braces, but only at the top level. If
679 we checked at every level, then f'{0:{3}}' would fail
680 with the two closing braces. */
681 if (recurse_lvl == 0) {
682 if (s < end && *s == ch) {
683 /* We're going to tell the caller that the literal ends
684 here, but that they should continue scanning. But also
685 skip over the second brace when we resume scanning. */
686 *str = s + 1;
687 result = 1;
688 goto done;
689 }
690
691 /* Where a single '{' is the start of a new expression, a
692 single '}' is not allowed. */
693 if (ch == '}') {
694 *str = s - 1;
695 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
696 return -1;
697 }
698 }
699 /* We're either at a '{', which means we're starting another
700 expression; or a '}', which means we're at the end of this
701 f-string (for a nested format_spec). */
702 s--;
703 break;
704 }
705 }
706 *str = s;
707 assert(s <= end);
708 assert(s == end || *s == '{' || *s == '}');
709done:
710 if (literal_start != s) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100711 if (raw) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100712 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
713 s - literal_start,
714 NULL, NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +0100715 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100716 *literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +0300717 s - literal_start, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100718 }
719 if (!*literal) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100720 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100721 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100722 }
723 return result;
724}
725
726/* Forward declaration because parsing is recursive. */
727static expr_ty
728fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
729 Token *first_token, Token* t, Token *last_token);
730
731/* Parse the f-string at *str, ending at end. We know *str starts an
732 expression (so it must be a '{'). Returns the FormattedValue node, which
733 includes the expression, conversion character, format_spec expression, and
734 optionally the text of the expression (if = is used).
735
736 Note that I don't do a perfect job here: I don't make sure that a
737 closing brace doesn't match an opening paren, for example. It
738 doesn't need to error on all invalid expressions, just correctly
739 find the end of all valid ones. Any errors inside the expression
740 will be caught when we parse it later.
741
742 *expression is set to the expression. For an '=' "debug" expression,
743 *expr_text is set to the debug text (the original text of the expression,
744 including the '=' and any whitespace around it, as a string object). If
745 not a debug expression, *expr_text set to NULL. */
746static int
747fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
748 PyObject **expr_text, expr_ty *expression, Token *first_token,
749 Token *t, Token *last_token)
750{
751 /* Return -1 on error, else 0. */
752
753 const char *expr_start;
754 const char *expr_end;
755 expr_ty simple_expression;
756 expr_ty format_spec = NULL; /* Optional format specifier. */
757 int conversion = -1; /* The conversion char. Use default if not
758 specified, or !r if using = and no format
759 spec. */
760
761 /* 0 if we're not in a string, else the quote char we're trying to
762 match (single or double quote). */
763 char quote_char = 0;
764
765 /* If we're inside a string, 1=normal, 3=triple-quoted. */
766 int string_type = 0;
767
768 /* Keep track of nesting level for braces/parens/brackets in
769 expressions. */
770 Py_ssize_t nested_depth = 0;
771 char parenstack[MAXLEVEL];
772
773 *expr_text = NULL;
774
775 /* Can only nest one level deep. */
776 if (recurse_lvl >= 2) {
777 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
778 goto error;
779 }
780
781 /* The first char must be a left brace, or we wouldn't have gotten
782 here. Skip over it. */
783 assert(**str == '{');
784 *str += 1;
785
786 expr_start = *str;
787 for (; *str < end; (*str)++) {
788 char ch;
789
790 /* Loop invariants. */
791 assert(nested_depth >= 0);
792 assert(*str >= expr_start && *str < end);
Pablo Galindofb61c422020-06-15 14:23:43 +0100793 if (quote_char) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100794 assert(string_type == 1 || string_type == 3);
Pablo Galindofb61c422020-06-15 14:23:43 +0100795 } else {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100796 assert(string_type == 0);
Pablo Galindofb61c422020-06-15 14:23:43 +0100797 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100798
799 ch = **str;
800 /* Nowhere inside an expression is a backslash allowed. */
801 if (ch == '\\') {
802 /* Error: can't include a backslash character, inside
803 parens or strings or not. */
804 RAISE_SYNTAX_ERROR(
805 "f-string expression part "
806 "cannot include a backslash");
807 goto error;
808 }
809 if (quote_char) {
810 /* We're inside a string. See if we're at the end. */
811 /* This code needs to implement the same non-error logic
812 as tok_get from tokenizer.c, at the letter_quote
813 label. To actually share that code would be a
814 nightmare. But, it's unlikely to change and is small,
815 so duplicate it here. Note we don't need to catch all
816 of the errors, since they'll be caught when parsing the
817 expression. We just need to match the non-error
818 cases. Thus we can ignore \n in single-quoted strings,
819 for example. Or non-terminated strings. */
820 if (ch == quote_char) {
821 /* Does this match the string_type (single or triple
822 quoted)? */
823 if (string_type == 3) {
824 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
825 /* We're at the end of a triple quoted string. */
826 *str += 2;
827 string_type = 0;
828 quote_char = 0;
829 continue;
830 }
831 } else {
832 /* We're at the end of a normal string. */
833 quote_char = 0;
834 string_type = 0;
835 continue;
836 }
837 }
838 } else if (ch == '\'' || ch == '"') {
839 /* Is this a triple quoted string? */
840 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
841 string_type = 3;
842 *str += 2;
843 } else {
844 /* Start of a normal string. */
845 string_type = 1;
846 }
847 /* Start looking for the end of the string. */
848 quote_char = ch;
849 } else if (ch == '[' || ch == '{' || ch == '(') {
850 if (nested_depth >= MAXLEVEL) {
851 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
852 goto error;
853 }
854 parenstack[nested_depth] = ch;
855 nested_depth++;
856 } else if (ch == '#') {
857 /* Error: can't include a comment character, inside parens
858 or not. */
859 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
860 goto error;
861 } else if (nested_depth == 0 &&
862 (ch == '!' || ch == ':' || ch == '}' ||
863 ch == '=' || ch == '>' || ch == '<')) {
864 /* See if there's a next character. */
865 if (*str+1 < end) {
866 char next = *(*str+1);
867
868 /* For "!=". since '=' is not an allowed conversion character,
869 nothing is lost in this test. */
870 if ((ch == '!' && next == '=') || /* != */
871 (ch == '=' && next == '=') || /* == */
872 (ch == '<' && next == '=') || /* <= */
873 (ch == '>' && next == '=') /* >= */
874 ) {
875 *str += 1;
876 continue;
877 }
878 /* Don't get out of the loop for these, if they're single
879 chars (not part of 2-char tokens). If by themselves, they
880 don't end an expression (unlike say '!'). */
881 if (ch == '>' || ch == '<') {
882 continue;
883 }
884 }
885
886 /* Normal way out of this loop. */
887 break;
888 } else if (ch == ']' || ch == '}' || ch == ')') {
889 if (!nested_depth) {
890 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
891 goto error;
892 }
893 nested_depth--;
Pablo Galindofb61c422020-06-15 14:23:43 +0100894 int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100895 if (!((opening == '(' && ch == ')') ||
896 (opening == '[' && ch == ']') ||
897 (opening == '{' && ch == '}')))
898 {
899 RAISE_SYNTAX_ERROR(
900 "f-string: closing parenthesis '%c' "
901 "does not match opening parenthesis '%c'",
902 ch, opening);
903 goto error;
904 }
905 } else {
906 /* Just consume this char and loop around. */
907 }
908 }
909 expr_end = *str;
910 /* If we leave this loop in a string or with mismatched parens, we
911 don't care. We'll get a syntax error when compiling the
912 expression. But, we can produce a better error message, so
913 let's just do that.*/
914 if (quote_char) {
915 RAISE_SYNTAX_ERROR("f-string: unterminated string");
916 goto error;
917 }
918 if (nested_depth) {
Pablo Galindofb61c422020-06-15 14:23:43 +0100919 int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100920 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
921 goto error;
922 }
923
Pablo Galindofb61c422020-06-15 14:23:43 +0100924 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100925 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100926 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100927
928 /* Compile the expression as soon as possible, so we show errors
929 related to the expression before errors related to the
930 conversion or format_spec. */
931 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindofb61c422020-06-15 14:23:43 +0100932 if (!simple_expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100933 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100934 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100935
936 /* Check for =, which puts the text value of the expression in
937 expr_text. */
938 if (**str == '=') {
Shantanuc116c942020-05-27 13:30:38 -0700939 if (p->feature_version < 8) {
940 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
941 "only supported in Python 3.8 and greater");
942 goto error;
943 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100944 *str += 1;
945
946 /* Skip over ASCII whitespace. No need to test for end of string
947 here, since we know there's at least a trailing quote somewhere
948 ahead. */
949 while (Py_ISSPACE(**str)) {
950 *str += 1;
951 }
952
953 /* Set *expr_text to the text of the expression. */
954 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
955 if (!*expr_text) {
956 goto error;
957 }
958 }
959
960 /* Check for a conversion char, if present. */
961 if (**str == '!') {
962 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100963 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100964 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100965 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100966
Pablo Galindofb61c422020-06-15 14:23:43 +0100967 conversion = (unsigned char)**str;
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100968 *str += 1;
969
970 /* Validate the conversion. */
971 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
972 RAISE_SYNTAX_ERROR(
973 "f-string: invalid conversion character: "
974 "expected 's', 'r', or 'a'");
975 goto error;
976 }
977
978 }
979
980 /* Check for the format spec, if present. */
Pablo Galindofb61c422020-06-15 14:23:43 +0100981 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100982 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100983 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100984 if (**str == ':') {
985 *str += 1;
Pablo Galindofb61c422020-06-15 14:23:43 +0100986 if (*str >= end) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100987 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +0100988 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100989
990 /* Parse the format spec. */
991 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
992 first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +0100993 if (!format_spec) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100994 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +0100995 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100996 }
997
Pablo Galindofb61c422020-06-15 14:23:43 +0100998 if (*str >= end || **str != '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +0100999 goto unexpected_end_of_string;
Pablo Galindofb61c422020-06-15 14:23:43 +01001000 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001001
1002 /* We're at a right brace. Consume it. */
1003 assert(*str < end);
1004 assert(**str == '}');
1005 *str += 1;
1006
1007 /* If we're in = mode (detected by non-NULL expr_text), and have no format
1008 spec and no explicit conversion, set the conversion to 'r'. */
1009 if (*expr_text && format_spec == NULL && conversion == -1) {
1010 conversion = 'r';
1011 }
1012
1013 /* And now create the FormattedValue node that represents this
1014 entire expression with the conversion and format spec. */
1015 //TODO: Fix this
1016 *expression = FormattedValue(simple_expression, conversion,
1017 format_spec, first_token->lineno,
1018 first_token->col_offset, last_token->end_lineno,
1019 last_token->end_col_offset, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +01001020 if (!*expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001021 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001022 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001023
1024 return 0;
1025
1026unexpected_end_of_string:
1027 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1028 /* Falls through to error. */
1029
1030error:
1031 Py_XDECREF(*expr_text);
1032 return -1;
1033
1034}
1035
1036/* Return -1 on error.
1037
1038 Return 0 if we have a literal (possible zero length) and an
1039 expression (zero length if at the end of the string.
1040
1041 Return 1 if we have a literal, but no expression, and we want the
1042 caller to call us again. This is used to deal with doubled
1043 braces.
1044
1045 When called multiple times on the string 'a{{b{0}c', this function
1046 will return:
1047
1048 1. the literal 'a{' with no expression, and a return value
1049 of 1. Despite the fact that there's no expression, the return
1050 value of 1 means we're not finished yet.
1051
1052 2. the literal 'b' and the expression '0', with a return value of
1053 0. The fact that there's an expression means we're not finished.
1054
1055 3. literal 'c' with no expression and a return value of 0. The
1056 combination of the return value of 0 with no expression means
1057 we're finished.
1058*/
1059static int
1060fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
1061 int recurse_lvl, PyObject **literal,
1062 PyObject **expr_text, expr_ty *expression,
1063 Token *first_token, Token *t, Token *last_token)
1064{
1065 int result;
1066
1067 assert(*literal == NULL && *expression == NULL);
1068
1069 /* Get any literal string. */
Lysandros Nikolaou2f37c352020-05-07 13:37:51 +03001070 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindofb61c422020-06-15 14:23:43 +01001071 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001072 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001073 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001074
1075 assert(result == 0 || result == 1);
1076
Pablo Galindofb61c422020-06-15 14:23:43 +01001077 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001078 /* We have a literal, but don't look at the expression. */
1079 return 1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001080 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001081
Pablo Galindofb61c422020-06-15 14:23:43 +01001082 if (*str >= end || **str == '}') {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001083 /* We're at the end of the string or the end of a nested
1084 f-string: no expression. The top-level error case where we
1085 expect to be at the end of the string but we're at a '}' is
1086 handled later. */
1087 return 0;
Pablo Galindofb61c422020-06-15 14:23:43 +01001088 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001089
1090 /* We must now be the start of an expression, on a '{'. */
1091 assert(**str == '{');
1092
1093 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindofb61c422020-06-15 14:23:43 +01001094 expression, first_token, t, last_token) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001095 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001096 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001097
1098 return 0;
1099
1100error:
1101 Py_CLEAR(*literal);
1102 return -1;
1103}
1104
1105#ifdef NDEBUG
1106#define ExprList_check_invariants(l)
1107#else
1108static void
1109ExprList_check_invariants(ExprList *l)
1110{
1111 /* Check our invariants. Make sure this object is "live", and
1112 hasn't been deallocated. */
1113 assert(l->size >= 0);
1114 assert(l->p != NULL);
Pablo Galindofb61c422020-06-15 14:23:43 +01001115 if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001116 assert(l->data == l->p);
Pablo Galindofb61c422020-06-15 14:23:43 +01001117 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001118}
1119#endif
1120
1121static void
1122ExprList_Init(ExprList *l)
1123{
1124 l->allocated = EXPRLIST_N_CACHED;
1125 l->size = 0;
1126
1127 /* Until we start allocating dynamically, p points to data. */
1128 l->p = l->data;
1129
1130 ExprList_check_invariants(l);
1131}
1132
1133static int
1134ExprList_Append(ExprList *l, expr_ty exp)
1135{
1136 ExprList_check_invariants(l);
1137 if (l->size >= l->allocated) {
1138 /* We need to alloc (or realloc) the memory. */
1139 Py_ssize_t new_size = l->allocated * 2;
1140
1141 /* See if we've ever allocated anything dynamically. */
1142 if (l->p == l->data) {
1143 Py_ssize_t i;
1144 /* We're still using the cached data. Switch to
1145 alloc-ing. */
1146 l->p = PyMem_RawMalloc(sizeof(expr_ty) * new_size);
Pablo Galindofb61c422020-06-15 14:23:43 +01001147 if (!l->p) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001148 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001149 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001150 /* Copy the cached data into the new buffer. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001151 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001152 l->p[i] = l->data[i];
Pablo Galindofb61c422020-06-15 14:23:43 +01001153 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001154 } else {
1155 /* Just realloc. */
1156 expr_ty *tmp = PyMem_RawRealloc(l->p, sizeof(expr_ty) * new_size);
1157 if (!tmp) {
1158 PyMem_RawFree(l->p);
1159 l->p = NULL;
1160 return -1;
1161 }
1162 l->p = tmp;
1163 }
1164
1165 l->allocated = new_size;
1166 assert(l->allocated == 2 * l->size);
1167 }
1168
1169 l->p[l->size++] = exp;
1170
1171 ExprList_check_invariants(l);
1172 return 0;
1173}
1174
1175static void
1176ExprList_Dealloc(ExprList *l)
1177{
1178 ExprList_check_invariants(l);
1179
1180 /* If there's been an error, or we've never dynamically allocated,
1181 do nothing. */
1182 if (!l->p || l->p == l->data) {
1183 /* Do nothing. */
1184 } else {
1185 /* We have dynamically allocated. Free the memory. */
1186 PyMem_RawFree(l->p);
1187 }
1188 l->p = NULL;
1189 l->size = -1;
1190}
1191
1192static asdl_seq *
1193ExprList_Finish(ExprList *l, PyArena *arena)
1194{
1195 asdl_seq *seq;
1196
1197 ExprList_check_invariants(l);
1198
1199 /* Allocate the asdl_seq and copy the expressions in to it. */
1200 seq = _Py_asdl_seq_new(l->size, arena);
1201 if (seq) {
1202 Py_ssize_t i;
Pablo Galindofb61c422020-06-15 14:23:43 +01001203 for (i = 0; i < l->size; i++) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001204 asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindofb61c422020-06-15 14:23:43 +01001205 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001206 }
1207 ExprList_Dealloc(l);
1208 return seq;
1209}
1210
1211#ifdef NDEBUG
1212#define FstringParser_check_invariants(state)
1213#else
1214static void
1215FstringParser_check_invariants(FstringParser *state)
1216{
Pablo Galindofb61c422020-06-15 14:23:43 +01001217 if (state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001218 assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindofb61c422020-06-15 14:23:43 +01001219 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001220 ExprList_check_invariants(&state->expr_list);
1221}
1222#endif
1223
1224void
1225_PyPegen_FstringParser_Init(FstringParser *state)
1226{
1227 state->last_str = NULL;
1228 state->fmode = 0;
1229 ExprList_Init(&state->expr_list);
1230 FstringParser_check_invariants(state);
1231}
1232
1233void
1234_PyPegen_FstringParser_Dealloc(FstringParser *state)
1235{
1236 FstringParser_check_invariants(state);
1237
1238 Py_XDECREF(state->last_str);
1239 ExprList_Dealloc(&state->expr_list);
1240}
1241
1242/* Make a Constant node, but decref the PyUnicode object being added. */
1243static expr_ty
1244make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1245{
1246 PyObject *s = *str;
1247 PyObject *kind = NULL;
1248 *str = NULL;
1249 assert(PyUnicode_CheckExact(s));
1250 if (PyArena_AddPyObject(p->arena, s) < 0) {
1251 Py_DECREF(s);
1252 return NULL;
1253 }
1254 const char* the_str = PyBytes_AsString(first_token->bytes);
1255 if (the_str && the_str[0] == 'u') {
1256 kind = _PyPegen_new_identifier(p, "u");
1257 }
1258
1259 if (kind == NULL && PyErr_Occurred()) {
1260 return NULL;
1261 }
1262
1263 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1264 last_token->end_lineno, last_token->end_col_offset, p->arena);
1265
1266}
1267
1268
1269/* Add a non-f-string (that is, a regular literal string). str is
1270 decref'd. */
1271int
1272_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1273{
1274 FstringParser_check_invariants(state);
1275
1276 assert(PyUnicode_CheckExact(str));
1277
1278 if (PyUnicode_GET_LENGTH(str) == 0) {
1279 Py_DECREF(str);
1280 return 0;
1281 }
1282
1283 if (!state->last_str) {
1284 /* We didn't have a string before, so just remember this one. */
1285 state->last_str = str;
1286 } else {
1287 /* Concatenate this with the previous string. */
1288 PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindofb61c422020-06-15 14:23:43 +01001289 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001290 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001291 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001292 }
1293 FstringParser_check_invariants(state);
1294 return 0;
1295}
1296
1297/* Parse an f-string. The f-string is in *str to end, with no
1298 'f' or quotes. */
1299int
1300_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1301 const char *end, int raw, int recurse_lvl,
1302 Token *first_token, Token* t, Token *last_token)
1303{
1304 FstringParser_check_invariants(state);
1305 state->fmode = 1;
1306
1307 /* Parse the f-string. */
1308 while (1) {
1309 PyObject *literal = NULL;
1310 PyObject *expr_text = NULL;
1311 expr_ty expression = NULL;
1312
1313 /* If there's a zero length literal in front of the
1314 expression, literal will be NULL. If we're at the end of
1315 the f-string, expression will be NULL (unless result == 1,
1316 see below). */
1317 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1318 &literal, &expr_text,
1319 &expression, first_token, t, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001320 if (result < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001321 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001322 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001323
1324 /* Add the literal, if any. */
1325 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1326 Py_XDECREF(expr_text);
1327 return -1;
1328 }
1329 /* Add the expr_text, if any. */
1330 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1331 return -1;
1332 }
1333
1334 /* We've dealt with the literal and expr_text, their ownership has
1335 been transferred to the state object. Don't look at them again. */
1336
1337 /* See if we should just loop around to get the next literal
1338 and expression, while ignoring the expression this
1339 time. This is used for un-doubling braces, as an
1340 optimization. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001341 if (result == 1) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001342 continue;
Pablo Galindofb61c422020-06-15 14:23:43 +01001343 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001344
Pablo Galindofb61c422020-06-15 14:23:43 +01001345 if (!expression) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001346 /* We're done with this f-string. */
1347 break;
Pablo Galindofb61c422020-06-15 14:23:43 +01001348 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001349
1350 /* We know we have an expression. Convert any existing string
1351 to a Constant node. */
1352 if (!state->last_str) {
1353 /* Do nothing. No previous literal. */
1354 } else {
1355 /* Convert the existing last_str literal to a Constant node. */
Pablo Galindofb61c422020-06-15 14:23:43 +01001356 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1357 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001358 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001359 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001360 }
1361
Pablo Galindofb61c422020-06-15 14:23:43 +01001362 if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001363 return -1;
Pablo Galindofb61c422020-06-15 14:23:43 +01001364 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001365 }
1366
1367 /* If recurse_lvl is zero, then we must be at the end of the
1368 string. Otherwise, we must be at a right brace. */
1369
1370 if (recurse_lvl == 0 && *str < end-1) {
1371 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1372 return -1;
1373 }
1374 if (recurse_lvl != 0 && **str != '}') {
1375 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1376 return -1;
1377 }
1378
1379 FstringParser_check_invariants(state);
1380 return 0;
1381}
1382
1383/* Convert the partial state reflected in last_str and expr_list to an
1384 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1385expr_ty
1386_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1387 Token *last_token)
1388{
1389 asdl_seq *seq;
1390
1391 FstringParser_check_invariants(state);
1392
1393 /* If we're just a constant string with no expressions, return
1394 that. */
1395 if (!state->fmode) {
1396 assert(!state->expr_list.size);
1397 if (!state->last_str) {
1398 /* Create a zero length string. */
1399 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindofb61c422020-06-15 14:23:43 +01001400 if (!state->last_str) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001401 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001402 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001403 }
1404 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1405 }
1406
1407 /* Create a Constant node out of last_str, if needed. It will be the
1408 last node in our expression list. */
1409 if (state->last_str) {
1410 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindofb61c422020-06-15 14:23:43 +01001411 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001412 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001413 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001414 }
1415 /* This has already been freed. */
1416 assert(state->last_str == NULL);
1417
1418 seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindofb61c422020-06-15 14:23:43 +01001419 if (!seq) {
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001420 goto error;
Pablo Galindofb61c422020-06-15 14:23:43 +01001421 }
Pablo Galindoc5fc1562020-04-22 23:29:27 +01001422
1423 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1424 last_token->end_lineno, last_token->end_col_offset, p->arena);
1425
1426error:
1427 _PyPegen_FstringParser_Dealloc(state);
1428 return NULL;
1429}
1430
1431/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1432 at end, parse it into an expr_ty. Return NULL on error. Adjust
1433 str to point past the parsed portion. */
1434static expr_ty
1435fstring_parse(Parser *p, const char **str, const char *end, int raw,
1436 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1437{
1438 FstringParser state;
1439
1440 _PyPegen_FstringParser_Init(&state);
1441 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1442 first_token, t, last_token) < 0) {
1443 _PyPegen_FstringParser_Dealloc(&state);
1444 return NULL;
1445 }
1446
1447 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1448}