Blame - Parser/pegen/parse_string.c - platform/external/python/cpython3

blob: a0ec698fa56a24d5d24f7fb40b321f6108f83ada [file] [log] [blame]

Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1	#include <Python.h>
				2
				3	#include "../tokenizer.h"
				4	#include "pegen.h"
				5	#include "parse_string.h"
				6
				7	//// STRING HANDLING FUNCTIONS ////
				8
				9	// These functions are ported directly from Python/ast.c with some modifications
				10	// to account for the use of "Parser *p", the fact that we don't have parser nodes
				11	// to pass around and the usage of some specialized APIs present only in this
				12	// file (like "_PyPegen_raise_syntax_error").
				13
				14	static int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	15	warn_invalid_escape_sequence(Parser p, unsigned char first_invalid_escape_char, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	16	{
				17	PyObject *msg =
				18	PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
				19	if (msg == NULL) {
				20	return -1;
				21	}
				22	if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	23	t->lineno, NULL, NULL) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	24	if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
				25	/* Replace the DeprecationWarning exception with a SyntaxError
				26	to get a more accurate error report */
				27	PyErr_Clear();
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	28
				29	/* This is needed, in order for the SyntaxError to point to the token t,
				30	since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
				31	error location, if p->known_err_token is not set. */
				32	p->known_err_token = t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	33	RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
				34	}
				35	Py_DECREF(msg);
				36	return -1;
				37	}
				38	Py_DECREF(msg);
				39	return 0;
				40	}
				41
				42	static PyObject *
				43	decode_utf8(const char *sPtr, const char end)
				44	{
				45	const char s, t;
				46	t = s = *sPtr;
				47	while (s < end && (*s & 0x80)) {
				48	s++;
				49	}
				50	*sPtr = s;
				51	return PyUnicode_DecodeUTF8(t, s - t, NULL);
				52	}
				53
				54	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	55	decode_unicode_with_escapes(Parser parser, const char s, size_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	56	{
				57	PyObject v, u;
				58	char *buf;
				59	char *p;
				60	const char *end;
				61
				62	/* check for integer overflow */
				63	if (len > SIZE_MAX / 6) {
				64	return NULL;
				65	}
				66	/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
				67	"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
				68	u = PyBytes_FromStringAndSize((char )NULL, len 6);
				69	if (u == NULL) {
				70	return NULL;
				71	}
				72	p = buf = PyBytes_AsString(u);
				73	end = s + len;
				74	while (s < end) {
				75	if (*s == '\\') {
				76	p++ = s++;
				77	if (s >= end \|\| *s & 0x80) {
				78	strcpy(p, "u005c");
				79	p += 5;
				80	if (s >= end) {
				81	break;
				82	}
				83	}
				84	}
				85	if (*s & 0x80) {
				86	PyObject *w;
				87	int kind;
				88	void *data;
				89	Py_ssize_t len, i;
				90	w = decode_utf8(&s, end);
				91	if (w == NULL) {
				92	Py_DECREF(u);
				93	return NULL;
				94	}
				95	kind = PyUnicode_KIND(w);
				96	data = PyUnicode_DATA(w);
				97	len = PyUnicode_GET_LENGTH(w);
				98	for (i = 0; i < len; i++) {
				99	Py_UCS4 chr = PyUnicode_READ(kind, data, i);
				100	sprintf(p, "\\U%08x", chr);
				101	p += 10;
				102	}
				103	/* Should be impossible to overflow */
				104	assert(p - buf <= PyBytes_GET_SIZE(u));
				105	Py_DECREF(w);
				106	}
				107	else {
				108	p++ = s++;
				109	}
				110	}
				111	len = p - buf;
				112	s = buf;
				113
				114	const char *first_invalid_escape;
				115	v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
				116
				117	if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	118	if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	119	/* We have not decref u before because first_invalid_escape points
				120	inside u. */
				121	Py_XDECREF(u);
				122	Py_DECREF(v);
				123	return NULL;
				124	}
				125	}
				126	Py_XDECREF(u);
				127	return v;
				128	}
				129
				130	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	131	decode_bytes_with_escapes(Parser p, const char s, Py_ssize_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	132	{
				133	const char *first_invalid_escape;
				134	PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
				135	if (result == NULL) {
				136	return NULL;
				137	}
				138
				139	if (first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	140	if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	141	Py_DECREF(result);
				142	return NULL;
				143	}
				144	}
				145	return result;
				146	}
				147
				148	/* s must include the bracketing quote characters, and r, b, u,
				149	&/or f prefixes (if any), and embedded escape sequences (if any).
				150	_PyPegen_parsestr parses it, and sets *result to decoded Python string object.
				151	If the string is an f-string, set fstr and fstrlen to the unparsed
				152	string object. Return 0 if no errors occurred. */
				153	int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	154	_PyPegen_parsestr(Parser p, int bytesmode, int rawmode, PyObject *result,
				155	const char *fstr, Py_ssize_t fstrlen, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	156	{
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	157	const char *s = PyBytes_AsString(t->bytes);
				158	if (s == NULL) {
				159	return -1;
				160	}
				161
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	162	size_t len;
				163	int quote = Py_CHARMASK(*s);
				164	int fmode = 0;
				165	*bytesmode = 0;
				166	*rawmode = 0;
				167	*result = NULL;
				168	*fstr = NULL;
				169	if (Py_ISALPHA(quote)) {
				170	while (!bytesmode \|\| !rawmode) {
				171	if (quote == 'b' \|\| quote == 'B') {
				172	quote = *++s;
				173	*bytesmode = 1;
				174	}
				175	else if (quote == 'u' \|\| quote == 'U') {
				176	quote = *++s;
				177	}
				178	else if (quote == 'r' \|\| quote == 'R') {
				179	quote = *++s;
				180	*rawmode = 1;
				181	}
				182	else if (quote == 'f' \|\| quote == 'F') {
				183	quote = *++s;
				184	fmode = 1;
				185	}
				186	else {
				187	break;
				188	}
				189	}
				190	}
				191
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	192	/* fstrings are only allowed in Python 3.6 and greater */
				193	if (fmode && p->feature_version < 6) {
				194	p->error_indicator = 1;
				195	RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
				196	return -1;
				197	}
				198
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	199	if (fmode && *bytesmode) {
				200	PyErr_BadInternalCall();
				201	return -1;
				202	}
				203	if (quote != '\'' && quote != '\"') {
				204	PyErr_BadInternalCall();
				205	return -1;
				206	}
				207	/* Skip the leading quote char. */
				208	s++;
				209	len = strlen(s);
				210	if (len > INT_MAX) {
				211	PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
				212	return -1;
				213	}
				214	if (s[--len] != quote) {
				215	/* Last quote char must match the first. */
				216	PyErr_BadInternalCall();
				217	return -1;
				218	}
				219	if (len >= 4 && s[0] == quote && s[1] == quote) {
				220	/* A triple quoted string. We've already skipped one quote at
				221	the start and one at the end of the string. Now skip the
				222	two at the start. */
				223	s += 2;
				224	len -= 2;
				225	/* And check that the last two match. */
				226	if (s[--len] != quote \|\| s[--len] != quote) {
				227	PyErr_BadInternalCall();
				228	return -1;
				229	}
				230	}
				231
				232	if (fmode) {
				233	/* Just return the bytes. The caller will parse the resulting
				234	string. */
				235	*fstr = s;
				236	*fstrlen = len;
				237	return 0;
				238	}
				239
				240	/* Not an f-string. */
				241	/* Avoid invoking escape decoding routines if possible. */
				242	rawmode = rawmode \|\| strchr(s, '\\') == NULL;
				243	if (*bytesmode) {
				244	/* Disallow non-ASCII characters. */
				245	const char *ch;
				246	for (ch = s; *ch; ch++) {
				247	if (Py_CHARMASK(*ch) >= 0x80) {
				248	RAISE_SYNTAX_ERROR(
				249	"bytes can only contain ASCII "
				250	"literal characters.");
				251	return -1;
				252	}
				253	}
				254	if (*rawmode) {
				255	*result = PyBytes_FromStringAndSize(s, len);
				256	}
				257	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	258	*result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	259	}
				260	}
				261	else {
				262	if (*rawmode) {
				263	*result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
				264	}
				265	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	266	*result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	267	}
				268	}
				269	return *result == NULL ? -1 : 0;
				270	}
				271
				272
				273
				274	// FSTRING STUFF
				275
				276	static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset);
				277	static void fstring_shift_argument(expr_ty parent, arg_ty args, int lineno, int col_offset);
				278
				279
				280	static inline void shift_expr(expr_ty parent, expr_ty n, int line, int col) {
				281	if (parent->lineno < n->lineno) {
				282	col = 0;
				283	}
				284	fstring_shift_expr_locations(n, line, col);
				285	}
				286
				287	static inline void shift_arg(expr_ty parent, arg_ty n, int line, int col) {
				288	if (parent->lineno < n->lineno) {
				289	col = 0;
				290	}
				291	fstring_shift_argument(parent, n, line, col);
				292	}
				293
				294	static void fstring_shift_seq_locations(expr_ty parent, asdl_seq *seq, int lineno, int col_offset) {
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	295	for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	296	expr_ty expr = asdl_seq_GET(seq, i);
				297	if (expr == NULL){
				298	continue;
				299	}
				300	shift_expr(parent, expr, lineno, col_offset);
				301	}
				302	}
				303
				304	static void fstring_shift_slice_locations(expr_ty parent, expr_ty slice, int lineno, int col_offset) {
				305	switch (slice->kind) {
				306	case Slice_kind:
				307	if (slice->v.Slice.lower) {
				308	shift_expr(parent, slice->v.Slice.lower, lineno, col_offset);
				309	}
				310	if (slice->v.Slice.upper) {
				311	shift_expr(parent, slice->v.Slice.upper, lineno, col_offset);
				312	}
				313	if (slice->v.Slice.step) {
				314	shift_expr(parent, slice->v.Slice.step, lineno, col_offset);
				315	}
				316	break;
				317	case Tuple_kind:
				318	fstring_shift_seq_locations(parent, slice->v.Tuple.elts, lineno, col_offset);
				319	break;
				320	default:
				321	break;
				322	}
				323	}
				324
				325	static void fstring_shift_comprehension(expr_ty parent, comprehension_ty comp, int lineno, int col_offset) {
				326	shift_expr(parent, comp->target, lineno, col_offset);
				327	shift_expr(parent, comp->iter, lineno, col_offset);
				328	fstring_shift_seq_locations(parent, comp->ifs, lineno, col_offset);
				329	}
				330
				331	static void fstring_shift_argument(expr_ty parent, arg_ty arg, int lineno, int col_offset) {
				332	if (arg->annotation != NULL){
				333	shift_expr(parent, arg->annotation, lineno, col_offset);
				334	}
				335	arg->col_offset = arg->col_offset + col_offset;
				336	arg->end_col_offset = arg->end_col_offset + col_offset;
				337	arg->lineno = arg->lineno + lineno;
				338	arg->end_lineno = arg->end_lineno + lineno;
				339	}
				340
				341	static void fstring_shift_arguments(expr_ty parent, arguments_ty args, int lineno, int col_offset) {
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	342	for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->posonlyargs); i < l; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	343	arg_ty arg = asdl_seq_GET(args->posonlyargs, i);
				344	shift_arg(parent, arg, lineno, col_offset);
				345	}
				346
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	347	for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->args); i < l; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	348	arg_ty arg = asdl_seq_GET(args->args, i);
				349	shift_arg(parent, arg, lineno, col_offset);
				350	}
				351
				352	if (args->vararg != NULL) {
				353	shift_arg(parent, args->vararg, lineno, col_offset);
				354	}
				355
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	356	for (Py_ssize_t i = 0, l = asdl_seq_LEN(args->kwonlyargs); i < l; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	357	arg_ty arg = asdl_seq_GET(args->kwonlyargs, i);
				358	shift_arg(parent, arg, lineno, col_offset);
				359	}
				360
				361	fstring_shift_seq_locations(parent, args->kw_defaults, lineno, col_offset);
				362
				363	if (args->kwarg != NULL) {
				364	shift_arg(parent, args->kwarg, lineno, col_offset);
				365	}
				366
				367	fstring_shift_seq_locations(parent, args->defaults, lineno, col_offset);
				368	}
				369
				370	static void fstring_shift_children_locations(expr_ty n, int lineno, int col_offset) {
				371	switch (n->kind) {
				372	case BoolOp_kind:
				373	fstring_shift_seq_locations(n, n->v.BoolOp.values, lineno, col_offset);
				374	break;
				375	case NamedExpr_kind:
				376	shift_expr(n, n->v.NamedExpr.target, lineno, col_offset);
				377	shift_expr(n, n->v.NamedExpr.value, lineno, col_offset);
				378	break;
				379	case BinOp_kind:
				380	shift_expr(n, n->v.BinOp.left, lineno, col_offset);
				381	shift_expr(n, n->v.BinOp.right, lineno, col_offset);
				382	break;
				383	case UnaryOp_kind:
				384	shift_expr(n, n->v.UnaryOp.operand, lineno, col_offset);
				385	break;
				386	case Lambda_kind:
				387	fstring_shift_arguments(n, n->v.Lambda.args, lineno, col_offset);
				388	shift_expr(n, n->v.Lambda.body, lineno, col_offset);
				389	break;
				390	case IfExp_kind:
				391	shift_expr(n, n->v.IfExp.test, lineno, col_offset);
				392	shift_expr(n, n->v.IfExp.body, lineno, col_offset);
				393	shift_expr(n, n->v.IfExp.orelse, lineno, col_offset);
				394	break;
				395	case Dict_kind:
				396	fstring_shift_seq_locations(n, n->v.Dict.keys, lineno, col_offset);
				397	fstring_shift_seq_locations(n, n->v.Dict.values, lineno, col_offset);
				398	break;
				399	case Set_kind:
				400	fstring_shift_seq_locations(n, n->v.Set.elts, lineno, col_offset);
				401	break;
				402	case ListComp_kind:
				403	shift_expr(n, n->v.ListComp.elt, lineno, col_offset);
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	404	for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.ListComp.generators); i < l; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	405	comprehension_ty comp = asdl_seq_GET(n->v.ListComp.generators, i);
				406	fstring_shift_comprehension(n, comp, lineno, col_offset);
				407	}
				408	break;
				409	case SetComp_kind:
				410	shift_expr(n, n->v.SetComp.elt, lineno, col_offset);
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	411	for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.SetComp.generators); i < l; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	412	comprehension_ty comp = asdl_seq_GET(n->v.SetComp.generators, i);
				413	fstring_shift_comprehension(n, comp, lineno, col_offset);
				414	}
				415	break;
				416	case DictComp_kind:
				417	shift_expr(n, n->v.DictComp.key, lineno, col_offset);
				418	shift_expr(n, n->v.DictComp.value, lineno, col_offset);
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	419	for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.DictComp.generators); i < l; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	420	comprehension_ty comp = asdl_seq_GET(n->v.DictComp.generators, i);
				421	fstring_shift_comprehension(n, comp, lineno, col_offset);
				422	}
				423	break;
				424	case GeneratorExp_kind:
				425	shift_expr(n, n->v.GeneratorExp.elt, lineno, col_offset);
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	426	for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.GeneratorExp.generators); i < l; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	427	comprehension_ty comp = asdl_seq_GET(n->v.GeneratorExp.generators, i);
				428	fstring_shift_comprehension(n, comp, lineno, col_offset);
				429	}
				430	break;
				431	case Await_kind:
				432	shift_expr(n, n->v.Await.value, lineno, col_offset);
				433	break;
				434	case Yield_kind:
				435	shift_expr(n, n->v.Yield.value, lineno, col_offset);
				436	break;
				437	case YieldFrom_kind:
				438	shift_expr(n, n->v.YieldFrom.value, lineno, col_offset);
				439	break;
				440	case Compare_kind:
				441	shift_expr(n, n->v.Compare.left, lineno, col_offset);
				442	fstring_shift_seq_locations(n, n->v.Compare.comparators, lineno, col_offset);
				443	break;
				444	case Call_kind:
				445	shift_expr(n, n->v.Call.func, lineno, col_offset);
				446	fstring_shift_seq_locations(n, n->v.Call.args, lineno, col_offset);
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	447	for (Py_ssize_t i = 0, l = asdl_seq_LEN(n->v.Call.keywords); i < l; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	448	keyword_ty keyword = asdl_seq_GET(n->v.Call.keywords, i);
				449	shift_expr(n, keyword->value, lineno, col_offset);
				450	}
				451	break;
				452	case Attribute_kind:
				453	shift_expr(n, n->v.Attribute.value, lineno, col_offset);
				454	break;
				455	case Subscript_kind:
				456	shift_expr(n, n->v.Subscript.value, lineno, col_offset);
				457	fstring_shift_slice_locations(n, n->v.Subscript.slice, lineno, col_offset);
				458	shift_expr(n, n->v.Subscript.slice, lineno, col_offset);
				459	break;
				460	case Starred_kind:
				461	shift_expr(n, n->v.Starred.value, lineno, col_offset);
				462	break;
				463	case List_kind:
				464	fstring_shift_seq_locations(n, n->v.List.elts, lineno, col_offset);
				465	break;
				466	case Tuple_kind:
				467	fstring_shift_seq_locations(n, n->v.Tuple.elts, lineno, col_offset);
				468	break;
Lysandros Nikolaou	37af21b	2020-04-29 03:43:50 +0300	[diff] [blame]	469	case JoinedStr_kind:
				470	fstring_shift_seq_locations(n, n->v.JoinedStr.values, lineno, col_offset);
				471	break;
				472	case FormattedValue_kind:
				473	shift_expr(n, n->v.FormattedValue.value, lineno, col_offset);
				474	if (n->v.FormattedValue.format_spec) {
				475	shift_expr(n, n->v.FormattedValue.format_spec, lineno, col_offset);
				476	}
				477	break;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	478	default:
				479	return;
				480	}
				481	}
				482
				483	/* Shift locations for the given node and all its children by adding `lineno`
				484	and `col_offset` to existing locations. Note that n is the already parsed
				485	expression. */
				486	static void fstring_shift_expr_locations(expr_ty n, int lineno, int col_offset)
				487	{
				488	n->col_offset = n->col_offset + col_offset;
				489
				490	// The following is needed, in order for nodes spanning across multiple lines
				491	// to be shifted correctly. An example of such a node is a Call node, the closing
				492	// parenthesis of which is not on the same line as its name.
				493	if (n->lineno == n->end_lineno) {
				494	n->end_col_offset = n->end_col_offset + col_offset;
				495	}
				496
				497	fstring_shift_children_locations(n, lineno, col_offset);
				498	n->lineno = n->lineno + lineno;
				499	n->end_lineno = n->end_lineno + lineno;
				500	}
				501
				502	/* Fix locations for the given node and its children.
				503
				504	`parent` is the enclosing node.
				505	`n` is the node which locations are going to be fixed relative to parent.
				506	`expr_str` is the child node's string representation, including braces.
				507	*/
				508	static void
				509	fstring_fix_expr_location(Token parent, expr_ty n, char expr_str)
				510	{
				511	char *substr = NULL;
				512	char *start;
				513	int lines = 0;
				514	int cols = 0;
				515
				516	if (parent && parent->bytes) {
				517	char *parent_str = PyBytes_AsString(parent->bytes);
				518	if (!parent_str) {
				519	return;
				520	}
				521	substr = strstr(parent_str, expr_str);
				522	if (substr) {
				523	// The following is needed, in order to correctly shift the column
				524	// offset, in the case that (disregarding any whitespace) a newline
				525	// immediately follows the opening curly brace of the fstring expression.
				526	int newline_after_brace = 1;
				527	start = substr + 1;
				528	while (start && start != '}' && start != '\n') {
				529	if (start != ' ' && start != '\t' && *start != '\f') {
				530	newline_after_brace = 0;
				531	break;
				532	}
				533	start++;
				534	}
				535
				536	// Account for the characters from the last newline character to our
				537	// left until the beginning of substr.
				538	if (!newline_after_brace) {
				539	start = substr;
				540	while (start > parent_str && *start != '\n') {
				541	start--;
				542	}
				543	cols += (int)(substr - start);
				544	}
				545	/* adjust the start based on the number of newlines encountered
				546	before the f-string expression */
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	547	for (char* p = parent_str; p < substr; p++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	548	if (*p == '\n') {
				549	lines++;
				550	}
				551	}
				552	}
				553	}
				554	fstring_shift_expr_locations(n, lines, cols);
				555	}
				556
				557
				558	/* Compile this expression in to an expr_ty. Add parens around the
				559	expression, in order to allow leading spaces in the expression. */
				560	static expr_ty
				561	fstring_compile_expr(Parser p, const char expr_start, const char *expr_end,
				562	Token *t)
				563	{
				564	expr_ty expr = NULL;
				565	char *str;
				566	Py_ssize_t len;
				567	const char *s;
				568	expr_ty result = NULL;
				569
				570	assert(expr_end >= expr_start);
				571	assert(*(expr_start-1) == '{');
				572	assert(expr_end == '}' \|\| expr_end == '!' \|\| *expr_end == ':' \|\|
				573	*expr_end == '=');
				574
				575	/* If the substring is all whitespace, it's an error. We need to catch this
				576	here, and not when we call PyParser_SimpleParseStringFlagsFilename,
				577	because turning the expression '' in to '()' would go from being invalid
				578	to valid. */
				579	for (s = expr_start; s != expr_end; s++) {
				580	char c = *s;
				581	/* The Python parser ignores only the following whitespace
				582	characters (\r already is converted to \n). */
				583	if (!(c == ' ' \|\| c == '\t' \|\| c == '\n' \|\| c == '\f')) {
				584	break;
				585	}
				586	}
				587	if (s == expr_end) {
				588	RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
				589	return NULL;
				590	}
				591
				592	len = expr_end - expr_start;
				593	/* Allocate 3 extra bytes: open paren, close paren, null byte. */
				594	str = PyMem_RawMalloc(len + 3);
				595	if (str == NULL) {
				596	PyErr_NoMemory();
				597	return NULL;
				598	}
				599
				600	str[0] = '(';
				601	memcpy(str+1, expr_start, len);
				602	str[len+1] = ')';
				603	str[len+2] = 0;
				604
				605	struct tok_state* tok = PyTokenizer_FromString(str, 1);
				606	if (tok == NULL) {
				607	return NULL;
				608	}
Lysandros Nikolaou	791a46e	2020-05-26 04:24:31 +0300	[diff] [blame]	609	Py_INCREF(p->tok->filename);
				610	tok->filename = p->tok->filename;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	611
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	612	Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
				613	NULL, p->arena);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	614	p2->starting_lineno = p->starting_lineno + p->tok->first_lineno - 1;
				615	p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno
				616	? p->starting_col_offset + t->col_offset : 0;
				617
				618	expr = _PyPegen_run_parser(p2);
				619
				620	if (expr == NULL) {
				621	goto exit;
				622	}
				623
				624	/* Reuse str to find the correct column offset. */
				625	str[0] = '{';
				626	str[len+1] = '}';
				627	fstring_fix_expr_location(t, expr, str);
				628
				629	result = expr;
				630
				631	exit:
				632	_PyPegen_Parser_Free(p2);
				633	PyTokenizer_Free(tok);
				634	return result;
				635	}
				636
				637	/* Return -1 on error.
				638
				639	Return 0 if we reached the end of the literal.
				640
				641	Return 1 if we haven't reached the end of the literal, but we want
				642	the caller to process the literal up to this point. Used for
				643	doubled braces.
				644	*/
				645	static int
				646	fstring_find_literal(Parser p, const char str, const char end, int raw,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	647	PyObject *literal, int recurse_lvl, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	648	{
				649	/* Get any literal string. It ends when we hit an un-doubled left
				650	brace (which isn't part of a unicode name escape such as
				651	"\N{EULER CONSTANT}"), or the end of the string. */
				652
				653	const char s = str;
				654	const char *literal_start = s;
				655	int result = 0;
				656
				657	assert(*literal == NULL);
				658	while (s < end) {
				659	char ch = *s++;
				660	if (!raw && ch == '\\' && s < end) {
				661	ch = *s++;
				662	if (ch == 'N') {
				663	if (s < end && *s++ == '{') {
				664	while (s < end && *s++ != '}') {
				665	}
				666	continue;
				667	}
				668	break;
				669	}
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	670	if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	671	return -1;
				672	}
				673	}
				674	if (ch == '{' \|\| ch == '}') {
				675	/* Check for doubled braces, but only at the top level. If
				676	we checked at every level, then f'{0:{3}}' would fail
				677	with the two closing braces. */
				678	if (recurse_lvl == 0) {
				679	if (s < end && *s == ch) {
				680	/* We're going to tell the caller that the literal ends
				681	here, but that they should continue scanning. But also
				682	skip over the second brace when we resume scanning. */
				683	*str = s + 1;
				684	result = 1;
				685	goto done;
				686	}
				687
				688	/* Where a single '{' is the start of a new expression, a
				689	single '}' is not allowed. */
				690	if (ch == '}') {
				691	*str = s - 1;
				692	RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
				693	return -1;
				694	}
				695	}
				696	/* We're either at a '{', which means we're starting another
				697	expression; or a '}', which means we're at the end of this
				698	f-string (for a nested format_spec). */
				699	s--;
				700	break;
				701	}
				702	}
				703	*str = s;
				704	assert(s <= end);
				705	assert(s == end \|\| s == '{' \|\| s == '}');
				706	done:
				707	if (literal_start != s) {
				708	if (raw)
				709	*literal = PyUnicode_DecodeUTF8Stateful(literal_start,
				710	s - literal_start,
				711	NULL, NULL);
				712	else
				713	*literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	714	s - literal_start, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	715	if (!*literal)
				716	return -1;
				717	}
				718	return result;
				719	}
				720
				721	/* Forward declaration because parsing is recursive. */
				722	static expr_ty
				723	fstring_parse(Parser p, const char str, const char end, int raw, int recurse_lvl,
				724	Token first_token, Token t, Token *last_token);
				725
				726	/* Parse the f-string at str, ending at end. We know str starts an
				727	expression (so it must be a '{'). Returns the FormattedValue node, which
				728	includes the expression, conversion character, format_spec expression, and
				729	optionally the text of the expression (if = is used).
				730
				731	Note that I don't do a perfect job here: I don't make sure that a
				732	closing brace doesn't match an opening paren, for example. It
				733	doesn't need to error on all invalid expressions, just correctly
				734	find the end of all valid ones. Any errors inside the expression
				735	will be caught when we parse it later.
				736
				737	*expression is set to the expression. For an '=' "debug" expression,
				738	*expr_text is set to the debug text (the original text of the expression,
				739	including the '=' and any whitespace around it, as a string object). If
				740	not a debug expression, expr_text set to NULL. /
				741	static int
				742	fstring_find_expr(Parser p, const char str, const char end, int raw, int recurse_lvl,
				743	PyObject *expr_text, expr_ty expression, Token *first_token,
				744	Token t, Token last_token)
				745	{
				746	/* Return -1 on error, else 0. */
				747
				748	const char *expr_start;
				749	const char *expr_end;
				750	expr_ty simple_expression;
				751	expr_ty format_spec = NULL; /* Optional format specifier. */
				752	int conversion = -1; /* The conversion char. Use default if not
				753	specified, or !r if using = and no format
				754	spec. */
				755
				756	/* 0 if we're not in a string, else the quote char we're trying to
				757	match (single or double quote). */
				758	char quote_char = 0;
				759
				760	/* If we're inside a string, 1=normal, 3=triple-quoted. */
				761	int string_type = 0;
				762
				763	/* Keep track of nesting level for braces/parens/brackets in
				764	expressions. */
				765	Py_ssize_t nested_depth = 0;
				766	char parenstack[MAXLEVEL];
				767
				768	*expr_text = NULL;
				769
				770	/* Can only nest one level deep. */
				771	if (recurse_lvl >= 2) {
				772	RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
				773	goto error;
				774	}
				775
				776	/* The first char must be a left brace, or we wouldn't have gotten
				777	here. Skip over it. */
				778	assert(**str == '{');
				779	*str += 1;
				780
				781	expr_start = *str;
				782	for (; str < end; (str)++) {
				783	char ch;
				784
				785	/* Loop invariants. */
				786	assert(nested_depth >= 0);
				787	assert(str >= expr_start && str < end);
				788	if (quote_char)
				789	assert(string_type == 1 \|\| string_type == 3);
				790	else
				791	assert(string_type == 0);
				792
				793	ch = **str;
				794	/* Nowhere inside an expression is a backslash allowed. */
				795	if (ch == '\\') {
				796	/* Error: can't include a backslash character, inside
				797	parens or strings or not. */
				798	RAISE_SYNTAX_ERROR(
				799	"f-string expression part "
				800	"cannot include a backslash");
				801	goto error;
				802	}
				803	if (quote_char) {
				804	/* We're inside a string. See if we're at the end. */
				805	/* This code needs to implement the same non-error logic
				806	as tok_get from tokenizer.c, at the letter_quote
				807	label. To actually share that code would be a
				808	nightmare. But, it's unlikely to change and is small,
				809	so duplicate it here. Note we don't need to catch all
				810	of the errors, since they'll be caught when parsing the
				811	expression. We just need to match the non-error
				812	cases. Thus we can ignore \n in single-quoted strings,
				813	for example. Or non-terminated strings. */
				814	if (ch == quote_char) {
				815	/* Does this match the string_type (single or triple
				816	quoted)? */
				817	if (string_type == 3) {
				818	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				819	/* We're at the end of a triple quoted string. */
				820	*str += 2;
				821	string_type = 0;
				822	quote_char = 0;
				823	continue;
				824	}
				825	} else {
				826	/* We're at the end of a normal string. */
				827	quote_char = 0;
				828	string_type = 0;
				829	continue;
				830	}
				831	}
				832	} else if (ch == '\'' \|\| ch == '"') {
				833	/* Is this a triple quoted string? */
				834	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				835	string_type = 3;
				836	*str += 2;
				837	} else {
				838	/* Start of a normal string. */
				839	string_type = 1;
				840	}
				841	/* Start looking for the end of the string. */
				842	quote_char = ch;
				843	} else if (ch == '[' \|\| ch == '{' \|\| ch == '(') {
				844	if (nested_depth >= MAXLEVEL) {
				845	RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
				846	goto error;
				847	}
				848	parenstack[nested_depth] = ch;
				849	nested_depth++;
				850	} else if (ch == '#') {
				851	/* Error: can't include a comment character, inside parens
				852	or not. */
				853	RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
				854	goto error;
				855	} else if (nested_depth == 0 &&
				856	(ch == '!' \|\| ch == ':' \|\| ch == '}' \|\|
				857	ch == '=' \|\| ch == '>' \|\| ch == '<')) {
				858	/* See if there's a next character. */
				859	if (*str+1 < end) {
				860	char next = (str+1);
				861
				862	/* For "!=". since '=' is not an allowed conversion character,
				863	nothing is lost in this test. */
				864	if ((ch == '!' && next == '=') \|\| /* != */
				865	(ch == '=' && next == '=') \|\| /* == */
				866	(ch == '<' && next == '=') \|\| /* <= */
				867	(ch == '>' && next == '=') /* >= */
				868	) {
				869	*str += 1;
				870	continue;
				871	}
				872	/* Don't get out of the loop for these, if they're single
				873	chars (not part of 2-char tokens). If by themselves, they
				874	don't end an expression (unlike say '!'). */
				875	if (ch == '>' \|\| ch == '<') {
				876	continue;
				877	}
				878	}
				879
				880	/* Normal way out of this loop. */
				881	break;
				882	} else if (ch == ']' \|\| ch == '}' \|\| ch == ')') {
				883	if (!nested_depth) {
				884	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
				885	goto error;
				886	}
				887	nested_depth--;
				888	int opening = parenstack[nested_depth];
				889	if (!((opening == '(' && ch == ')') \|\|
				890	(opening == '[' && ch == ']') \|\|
				891	(opening == '{' && ch == '}')))
				892	{
				893	RAISE_SYNTAX_ERROR(
				894	"f-string: closing parenthesis '%c' "
				895	"does not match opening parenthesis '%c'",
				896	ch, opening);
				897	goto error;
				898	}
				899	} else {
				900	/* Just consume this char and loop around. */
				901	}
				902	}
				903	expr_end = *str;
				904	/* If we leave this loop in a string or with mismatched parens, we
				905	don't care. We'll get a syntax error when compiling the
				906	expression. But, we can produce a better error message, so
				907	let's just do that.*/
				908	if (quote_char) {
				909	RAISE_SYNTAX_ERROR("f-string: unterminated string");
				910	goto error;
				911	}
				912	if (nested_depth) {
				913	int opening = parenstack[nested_depth - 1];
				914	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
				915	goto error;
				916	}
				917
				918	if (*str >= end)
				919	goto unexpected_end_of_string;
				920
				921	/* Compile the expression as soon as possible, so we show errors
				922	related to the expression before errors related to the
				923	conversion or format_spec. */
				924	simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
				925	if (!simple_expression)
				926	goto error;
				927
				928	/* Check for =, which puts the text value of the expression in
				929	expr_text. */
				930	if (**str == '=') {
				931	*str += 1;
				932
				933	/* Skip over ASCII whitespace. No need to test for end of string
				934	here, since we know there's at least a trailing quote somewhere
				935	ahead. */
				936	while (Py_ISSPACE(**str)) {
				937	*str += 1;
				938	}
				939
				940	/* Set expr_text to the text of the expression. /
				941	expr_text = PyUnicode_FromStringAndSize(expr_start, str-expr_start);
				942	if (!*expr_text) {
				943	goto error;
				944	}
				945	}
				946
				947	/* Check for a conversion char, if present. */
				948	if (**str == '!') {
				949	*str += 1;
				950	if (*str >= end)
				951	goto unexpected_end_of_string;
				952
				953	conversion = **str;
				954	*str += 1;
				955
				956	/* Validate the conversion. */
				957	if (!(conversion == 's' \|\| conversion == 'r' \|\| conversion == 'a')) {
				958	RAISE_SYNTAX_ERROR(
				959	"f-string: invalid conversion character: "
				960	"expected 's', 'r', or 'a'");
				961	goto error;
				962	}
				963
				964	}
				965
				966	/* Check for the format spec, if present. */
				967	if (*str >= end)
				968	goto unexpected_end_of_string;
				969	if (**str == ':') {
				970	*str += 1;
				971	if (*str >= end)
				972	goto unexpected_end_of_string;
				973
				974	/* Parse the format spec. */
				975	format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
				976	first_token, t, last_token);
				977	if (!format_spec)
				978	goto error;
				979	}
				980
				981	if (str >= end \|\| *str != '}')
				982	goto unexpected_end_of_string;
				983
				984	/* We're at a right brace. Consume it. */
				985	assert(*str < end);
				986	assert(**str == '}');
				987	*str += 1;
				988
				989	/* If we're in = mode (detected by non-NULL expr_text), and have no format
				990	spec and no explicit conversion, set the conversion to 'r'. */
				991	if (*expr_text && format_spec == NULL && conversion == -1) {
				992	conversion = 'r';
				993	}
				994
				995	/* And now create the FormattedValue node that represents this
				996	entire expression with the conversion and format spec. */
				997	//TODO: Fix this
				998	*expression = FormattedValue(simple_expression, conversion,
				999	format_spec, first_token->lineno,
				1000	first_token->col_offset, last_token->end_lineno,
				1001	last_token->end_col_offset, p->arena);
				1002	if (!*expression)
				1003	goto error;
				1004
				1005	return 0;
				1006
				1007	unexpected_end_of_string:
				1008	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				1009	/* Falls through to error. */
				1010
				1011	error:
				1012	Py_XDECREF(*expr_text);
				1013	return -1;
				1014
				1015	}
				1016
				1017	/* Return -1 on error.
				1018
				1019	Return 0 if we have a literal (possible zero length) and an
				1020	expression (zero length if at the end of the string.
				1021
				1022	Return 1 if we have a literal, but no expression, and we want the
				1023	caller to call us again. This is used to deal with doubled
				1024	braces.
				1025
				1026	When called multiple times on the string 'a{{b{0}c', this function
				1027	will return:
				1028
				1029	1. the literal 'a{' with no expression, and a return value
				1030	of 1. Despite the fact that there's no expression, the return
				1031	value of 1 means we're not finished yet.
				1032
				1033	2. the literal 'b' and the expression '0', with a return value of
				1034	0. The fact that there's an expression means we're not finished.
				1035
				1036	3. literal 'c' with no expression and a return value of 0. The
				1037	combination of the return value of 0 with no expression means
				1038	we're finished.
				1039	*/
				1040	static int
				1041	fstring_find_literal_and_expr(Parser p, const char str, const char end, int raw,
				1042	int recurse_lvl, PyObject **literal,
				1043	PyObject *expr_text, expr_ty expression,
				1044	Token first_token, Token t, Token *last_token)
				1045	{
				1046	int result;
				1047
				1048	assert(literal == NULL && expression == NULL);
				1049
				1050	/* Get any literal string. */
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	1051	result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1052	if (result < 0)
				1053	goto error;
				1054
				1055	assert(result == 0 \|\| result == 1);
				1056
				1057	if (result == 1)
				1058	/* We have a literal, but don't look at the expression. */
				1059	return 1;
				1060
				1061	if (str >= end \|\| *str == '}')
				1062	/* We're at the end of the string or the end of a nested
				1063	f-string: no expression. The top-level error case where we
				1064	expect to be at the end of the string but we're at a '}' is
				1065	handled later. */
				1066	return 0;
				1067
				1068	/* We must now be the start of an expression, on a '{'. */
				1069	assert(**str == '{');
				1070
				1071	if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
				1072	expression, first_token, t, last_token) < 0)
				1073	goto error;
				1074
				1075	return 0;
				1076
				1077	error:
				1078	Py_CLEAR(*literal);
				1079	return -1;
				1080	}
				1081
				1082	#ifdef NDEBUG
				1083	#define ExprList_check_invariants(l)
				1084	#else
				1085	static void
				1086	ExprList_check_invariants(ExprList *l)
				1087	{
				1088	/* Check our invariants. Make sure this object is "live", and
				1089	hasn't been deallocated. */
				1090	assert(l->size >= 0);
				1091	assert(l->p != NULL);
				1092	if (l->size <= EXPRLIST_N_CACHED)
				1093	assert(l->data == l->p);
				1094	}
				1095	#endif
				1096
				1097	static void
				1098	ExprList_Init(ExprList *l)
				1099	{
				1100	l->allocated = EXPRLIST_N_CACHED;
				1101	l->size = 0;
				1102
				1103	/* Until we start allocating dynamically, p points to data. */
				1104	l->p = l->data;
				1105
				1106	ExprList_check_invariants(l);
				1107	}
				1108
				1109	static int
				1110	ExprList_Append(ExprList *l, expr_ty exp)
				1111	{
				1112	ExprList_check_invariants(l);
				1113	if (l->size >= l->allocated) {
				1114	/* We need to alloc (or realloc) the memory. */
				1115	Py_ssize_t new_size = l->allocated * 2;
				1116
				1117	/* See if we've ever allocated anything dynamically. */
				1118	if (l->p == l->data) {
				1119	Py_ssize_t i;
				1120	/* We're still using the cached data. Switch to
				1121	alloc-ing. */
				1122	l->p = PyMem_RawMalloc(sizeof(expr_ty) * new_size);
				1123	if (!l->p)
				1124	return -1;
				1125	/* Copy the cached data into the new buffer. */
				1126	for (i = 0; i < l->size; i++)
				1127	l->p[i] = l->data[i];
				1128	} else {
				1129	/* Just realloc. */
				1130	expr_ty tmp = PyMem_RawRealloc(l->p, sizeof(expr_ty) new_size);
				1131	if (!tmp) {
				1132	PyMem_RawFree(l->p);
				1133	l->p = NULL;
				1134	return -1;
				1135	}
				1136	l->p = tmp;
				1137	}
				1138
				1139	l->allocated = new_size;
				1140	assert(l->allocated == 2 * l->size);
				1141	}
				1142
				1143	l->p[l->size++] = exp;
				1144
				1145	ExprList_check_invariants(l);
				1146	return 0;
				1147	}
				1148
				1149	static void
				1150	ExprList_Dealloc(ExprList *l)
				1151	{
				1152	ExprList_check_invariants(l);
				1153
				1154	/* If there's been an error, or we've never dynamically allocated,
				1155	do nothing. */
				1156	if (!l->p \|\| l->p == l->data) {
				1157	/* Do nothing. */
				1158	} else {
				1159	/* We have dynamically allocated. Free the memory. */
				1160	PyMem_RawFree(l->p);
				1161	}
				1162	l->p = NULL;
				1163	l->size = -1;
				1164	}
				1165
				1166	static asdl_seq *
				1167	ExprList_Finish(ExprList l, PyArena arena)
				1168	{
				1169	asdl_seq *seq;
				1170
				1171	ExprList_check_invariants(l);
				1172
				1173	/* Allocate the asdl_seq and copy the expressions in to it. */
				1174	seq = _Py_asdl_seq_new(l->size, arena);
				1175	if (seq) {
				1176	Py_ssize_t i;
				1177	for (i = 0; i < l->size; i++)
				1178	asdl_seq_SET(seq, i, l->p[i]);
				1179	}
				1180	ExprList_Dealloc(l);
				1181	return seq;
				1182	}
				1183
				1184	#ifdef NDEBUG
				1185	#define FstringParser_check_invariants(state)
				1186	#else
				1187	static void
				1188	FstringParser_check_invariants(FstringParser *state)
				1189	{
				1190	if (state->last_str)
				1191	assert(PyUnicode_CheckExact(state->last_str));
				1192	ExprList_check_invariants(&state->expr_list);
				1193	}
				1194	#endif
				1195
				1196	void
				1197	_PyPegen_FstringParser_Init(FstringParser *state)
				1198	{
				1199	state->last_str = NULL;
				1200	state->fmode = 0;
				1201	ExprList_Init(&state->expr_list);
				1202	FstringParser_check_invariants(state);
				1203	}
				1204
				1205	void
				1206	_PyPegen_FstringParser_Dealloc(FstringParser *state)
				1207	{
				1208	FstringParser_check_invariants(state);
				1209
				1210	Py_XDECREF(state->last_str);
				1211	ExprList_Dealloc(&state->expr_list);
				1212	}
				1213
				1214	/* Make a Constant node, but decref the PyUnicode object being added. */
				1215	static expr_ty
				1216	make_str_node_and_del(Parser p, PyObject str, Token first_token, Token *last_token)
				1217	{
				1218	PyObject s = str;
				1219	PyObject *kind = NULL;
				1220	*str = NULL;
				1221	assert(PyUnicode_CheckExact(s));
				1222	if (PyArena_AddPyObject(p->arena, s) < 0) {
				1223	Py_DECREF(s);
				1224	return NULL;
				1225	}
				1226	const char* the_str = PyBytes_AsString(first_token->bytes);
				1227	if (the_str && the_str[0] == 'u') {
				1228	kind = _PyPegen_new_identifier(p, "u");
				1229	}
				1230
				1231	if (kind == NULL && PyErr_Occurred()) {
				1232	return NULL;
				1233	}
				1234
				1235	return Constant(s, kind, first_token->lineno, first_token->col_offset,
				1236	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1237
				1238	}
				1239
				1240
				1241	/* Add a non-f-string (that is, a regular literal string). str is
				1242	decref'd. */
				1243	int
				1244	_PyPegen_FstringParser_ConcatAndDel(FstringParser state, PyObject str)
				1245	{
				1246	FstringParser_check_invariants(state);
				1247
				1248	assert(PyUnicode_CheckExact(str));
				1249
				1250	if (PyUnicode_GET_LENGTH(str) == 0) {
				1251	Py_DECREF(str);
				1252	return 0;
				1253	}
				1254
				1255	if (!state->last_str) {
				1256	/* We didn't have a string before, so just remember this one. */
				1257	state->last_str = str;
				1258	} else {
				1259	/* Concatenate this with the previous string. */
				1260	PyUnicode_AppendAndDel(&state->last_str, str);
				1261	if (!state->last_str)
				1262	return -1;
				1263	}
				1264	FstringParser_check_invariants(state);
				1265	return 0;
				1266	}
				1267
				1268	/* Parse an f-string. The f-string is in *str to end, with no
				1269	'f' or quotes. */
				1270	int
				1271	_PyPegen_FstringParser_ConcatFstring(Parser p, FstringParser state, const char **str,
				1272	const char *end, int raw, int recurse_lvl,
				1273	Token first_token, Token t, Token *last_token)
				1274	{
				1275	FstringParser_check_invariants(state);
				1276	state->fmode = 1;
				1277
				1278	/* Parse the f-string. */
				1279	while (1) {
				1280	PyObject *literal = NULL;
				1281	PyObject *expr_text = NULL;
				1282	expr_ty expression = NULL;
				1283
				1284	/* If there's a zero length literal in front of the
				1285	expression, literal will be NULL. If we're at the end of
				1286	the f-string, expression will be NULL (unless result == 1,
				1287	see below). */
				1288	int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
				1289	&literal, &expr_text,
				1290	&expression, first_token, t, last_token);
				1291	if (result < 0)
				1292	return -1;
				1293
				1294	/* Add the literal, if any. */
				1295	if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
				1296	Py_XDECREF(expr_text);
				1297	return -1;
				1298	}
				1299	/* Add the expr_text, if any. */
				1300	if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
				1301	return -1;
				1302	}
				1303
				1304	/* We've dealt with the literal and expr_text, their ownership has
				1305	been transferred to the state object. Don't look at them again. */
				1306
				1307	/* See if we should just loop around to get the next literal
				1308	and expression, while ignoring the expression this
				1309	time. This is used for un-doubling braces, as an
				1310	optimization. */
				1311	if (result == 1)
				1312	continue;
				1313
				1314	if (!expression)
				1315	/* We're done with this f-string. */
				1316	break;
				1317
				1318	/* We know we have an expression. Convert any existing string
				1319	to a Constant node. */
				1320	if (!state->last_str) {
				1321	/* Do nothing. No previous literal. */
				1322	} else {
				1323	/* Convert the existing last_str literal to a Constant node. */
				1324	expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1325	if (!str \|\| ExprList_Append(&state->expr_list, str) < 0)
				1326	return -1;
				1327	}
				1328
				1329	if (ExprList_Append(&state->expr_list, expression) < 0)
				1330	return -1;
				1331	}
				1332
				1333	/* If recurse_lvl is zero, then we must be at the end of the
				1334	string. Otherwise, we must be at a right brace. */
				1335
				1336	if (recurse_lvl == 0 && *str < end-1) {
				1337	RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
				1338	return -1;
				1339	}
				1340	if (recurse_lvl != 0 && **str != '}') {
				1341	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				1342	return -1;
				1343	}
				1344
				1345	FstringParser_check_invariants(state);
				1346	return 0;
				1347	}
				1348
				1349	/* Convert the partial state reflected in last_str and expr_list to an
				1350	expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
				1351	expr_ty
				1352	_PyPegen_FstringParser_Finish(Parser p, FstringParser state, Token* first_token,
				1353	Token *last_token)
				1354	{
				1355	asdl_seq *seq;
				1356
				1357	FstringParser_check_invariants(state);
				1358
				1359	/* If we're just a constant string with no expressions, return
				1360	that. */
				1361	if (!state->fmode) {
				1362	assert(!state->expr_list.size);
				1363	if (!state->last_str) {
				1364	/* Create a zero length string. */
				1365	state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
				1366	if (!state->last_str)
				1367	goto error;
				1368	}
				1369	return make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1370	}
				1371
				1372	/* Create a Constant node out of last_str, if needed. It will be the
				1373	last node in our expression list. */
				1374	if (state->last_str) {
				1375	expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1376	if (!str \|\| ExprList_Append(&state->expr_list, str) < 0)
				1377	goto error;
				1378	}
				1379	/* This has already been freed. */
				1380	assert(state->last_str == NULL);
				1381
				1382	seq = ExprList_Finish(&state->expr_list, p->arena);
				1383	if (!seq)
				1384	goto error;
				1385
				1386	return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
				1387	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1388
				1389	error:
				1390	_PyPegen_FstringParser_Dealloc(state);
				1391	return NULL;
				1392	}
				1393
				1394	/* Given an f-string (with no 'f' or quotes) that's in *str and ends
				1395	at end, parse it into an expr_ty. Return NULL on error. Adjust
				1396	str to point past the parsed portion. */
				1397	static expr_ty
				1398	fstring_parse(Parser p, const char str, const char end, int raw,
				1399	int recurse_lvl, Token first_token, Token t, Token *last_token)
				1400	{
				1401	FstringParser state;
				1402
				1403	_PyPegen_FstringParser_Init(&state);
				1404	if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
				1405	first_token, t, last_token) < 0) {
				1406	_PyPegen_FstringParser_Dealloc(&state);
				1407	return NULL;
				1408	}
				1409
				1410	return _PyPegen_FstringParser_Finish(p, &state, t, t);
				1411	}