Blame - Parser/pegen/parse_string.c - platform/external/python/cpython3

blob: 88b10c3f494ccee363df00c4783ade4fcf950a45 [file] [log] [blame]

Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1	#include <Python.h>
				2
				3	#include "../tokenizer.h"
				4	#include "pegen.h"
				5	#include "parse_string.h"
				6
				7	//// STRING HANDLING FUNCTIONS ////
				8
				9	// These functions are ported directly from Python/ast.c with some modifications
				10	// to account for the use of "Parser *p", the fact that we don't have parser nodes
				11	// to pass around and the usage of some specialized APIs present only in this
				12	// file (like "_PyPegen_raise_syntax_error").
				13
				14	static int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	15	warn_invalid_escape_sequence(Parser p, unsigned char first_invalid_escape_char, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	16	{
				17	PyObject *msg =
				18	PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
				19	if (msg == NULL) {
				20	return -1;
				21	}
				22	if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	23	t->lineno, NULL, NULL) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	24	if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
				25	/* Replace the DeprecationWarning exception with a SyntaxError
				26	to get a more accurate error report */
				27	PyErr_Clear();
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	28
				29	/* This is needed, in order for the SyntaxError to point to the token t,
				30	since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
				31	error location, if p->known_err_token is not set. */
				32	p->known_err_token = t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	33	RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
				34	}
				35	Py_DECREF(msg);
				36	return -1;
				37	}
				38	Py_DECREF(msg);
				39	return 0;
				40	}
				41
				42	static PyObject *
				43	decode_utf8(const char *sPtr, const char end)
				44	{
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	45	const char *s;
				46	const char *t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	47	t = s = *sPtr;
				48	while (s < end && (*s & 0x80)) {
				49	s++;
				50	}
				51	*sPtr = s;
				52	return PyUnicode_DecodeUTF8(t, s - t, NULL);
				53	}
				54
				55	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	56	decode_unicode_with_escapes(Parser parser, const char s, size_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	57	{
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	58	PyObject *v;
				59	PyObject *u;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	60	char *buf;
				61	char *p;
				62	const char *end;
				63
				64	/* check for integer overflow */
				65	if (len > SIZE_MAX / 6) {
				66	return NULL;
				67	}
				68	/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
				69	"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
				70	u = PyBytes_FromStringAndSize((char )NULL, len 6);
				71	if (u == NULL) {
				72	return NULL;
				73	}
				74	p = buf = PyBytes_AsString(u);
				75	end = s + len;
				76	while (s < end) {
				77	if (*s == '\\') {
				78	p++ = s++;
				79	if (s >= end \|\| *s & 0x80) {
				80	strcpy(p, "u005c");
				81	p += 5;
				82	if (s >= end) {
				83	break;
				84	}
				85	}
				86	}
				87	if (*s & 0x80) {
				88	PyObject *w;
				89	int kind;
				90	void *data;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	91	Py_ssize_t w_len;
				92	Py_ssize_t i;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	93	w = decode_utf8(&s, end);
				94	if (w == NULL) {
				95	Py_DECREF(u);
				96	return NULL;
				97	}
				98	kind = PyUnicode_KIND(w);
				99	data = PyUnicode_DATA(w);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	100	w_len = PyUnicode_GET_LENGTH(w);
				101	for (i = 0; i < w_len; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	102	Py_UCS4 chr = PyUnicode_READ(kind, data, i);
				103	sprintf(p, "\\U%08x", chr);
				104	p += 10;
				105	}
				106	/* Should be impossible to overflow */
				107	assert(p - buf <= PyBytes_GET_SIZE(u));
				108	Py_DECREF(w);
				109	}
				110	else {
				111	p++ = s++;
				112	}
				113	}
				114	len = p - buf;
				115	s = buf;
				116
				117	const char *first_invalid_escape;
				118	v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
				119
				120	if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	121	if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	122	/* We have not decref u before because first_invalid_escape points
				123	inside u. */
				124	Py_XDECREF(u);
				125	Py_DECREF(v);
				126	return NULL;
				127	}
				128	}
				129	Py_XDECREF(u);
				130	return v;
				131	}
				132
				133	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	134	decode_bytes_with_escapes(Parser p, const char s, Py_ssize_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	135	{
				136	const char *first_invalid_escape;
				137	PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
				138	if (result == NULL) {
				139	return NULL;
				140	}
				141
				142	if (first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	143	if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	144	Py_DECREF(result);
				145	return NULL;
				146	}
				147	}
				148	return result;
				149	}
				150
				151	/* s must include the bracketing quote characters, and r, b, u,
				152	&/or f prefixes (if any), and embedded escape sequences (if any).
				153	_PyPegen_parsestr parses it, and sets *result to decoded Python string object.
				154	If the string is an f-string, set fstr and fstrlen to the unparsed
				155	string object. Return 0 if no errors occurred. */
				156	int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	157	_PyPegen_parsestr(Parser p, int bytesmode, int rawmode, PyObject *result,
				158	const char *fstr, Py_ssize_t fstrlen, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	159	{
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	160	const char *s = PyBytes_AsString(t->bytes);
				161	if (s == NULL) {
				162	return -1;
				163	}
				164
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	165	size_t len;
				166	int quote = Py_CHARMASK(*s);
				167	int fmode = 0;
				168	*bytesmode = 0;
				169	*rawmode = 0;
				170	*result = NULL;
				171	*fstr = NULL;
				172	if (Py_ISALPHA(quote)) {
				173	while (!bytesmode \|\| !rawmode) {
				174	if (quote == 'b' \|\| quote == 'B') {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	175	quote =(unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	176	*bytesmode = 1;
				177	}
				178	else if (quote == 'u' \|\| quote == 'U') {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	179	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	180	}
				181	else if (quote == 'r' \|\| quote == 'R') {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	182	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	183	*rawmode = 1;
				184	}
				185	else if (quote == 'f' \|\| quote == 'F') {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	186	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	187	fmode = 1;
				188	}
				189	else {
				190	break;
				191	}
				192	}
				193	}
				194
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	195	/* fstrings are only allowed in Python 3.6 and greater */
				196	if (fmode && p->feature_version < 6) {
				197	p->error_indicator = 1;
				198	RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
				199	return -1;
				200	}
				201
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	202	if (fmode && *bytesmode) {
				203	PyErr_BadInternalCall();
				204	return -1;
				205	}
				206	if (quote != '\'' && quote != '\"') {
				207	PyErr_BadInternalCall();
				208	return -1;
				209	}
				210	/* Skip the leading quote char. */
				211	s++;
				212	len = strlen(s);
				213	if (len > INT_MAX) {
				214	PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
				215	return -1;
				216	}
				217	if (s[--len] != quote) {
				218	/* Last quote char must match the first. */
				219	PyErr_BadInternalCall();
				220	return -1;
				221	}
				222	if (len >= 4 && s[0] == quote && s[1] == quote) {
				223	/* A triple quoted string. We've already skipped one quote at
				224	the start and one at the end of the string. Now skip the
				225	two at the start. */
				226	s += 2;
				227	len -= 2;
				228	/* And check that the last two match. */
				229	if (s[--len] != quote \|\| s[--len] != quote) {
				230	PyErr_BadInternalCall();
				231	return -1;
				232	}
				233	}
				234
				235	if (fmode) {
				236	/* Just return the bytes. The caller will parse the resulting
				237	string. */
				238	*fstr = s;
				239	*fstrlen = len;
				240	return 0;
				241	}
				242
				243	/* Not an f-string. */
				244	/* Avoid invoking escape decoding routines if possible. */
				245	rawmode = rawmode \|\| strchr(s, '\\') == NULL;
				246	if (*bytesmode) {
				247	/* Disallow non-ASCII characters. */
				248	const char *ch;
				249	for (ch = s; *ch; ch++) {
				250	if (Py_CHARMASK(*ch) >= 0x80) {
				251	RAISE_SYNTAX_ERROR(
				252	"bytes can only contain ASCII "
				253	"literal characters.");
				254	return -1;
				255	}
				256	}
				257	if (*rawmode) {
				258	*result = PyBytes_FromStringAndSize(s, len);
				259	}
				260	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	261	*result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	262	}
				263	}
				264	else {
				265	if (*rawmode) {
				266	*result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
				267	}
				268	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	269	*result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	270	}
				271	}
				272	return *result == NULL ? -1 : 0;
				273	}
				274
				275
				276
				277	// FSTRING STUFF
				278
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	279	/* Fix locations for the given node and its children.
				280
				281	`parent` is the enclosing node.
				282	`n` is the node which locations are going to be fixed relative to parent.
				283	`expr_str` is the child node's string representation, including braces.
				284	*/
				285	static void
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame^]	286	fstring_find_expr_location(Token parent, char expr_str, int p_lines, int p_cols)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	287	{
				288	char *substr = NULL;
				289	char *start;
				290	int lines = 0;
				291	int cols = 0;
				292
				293	if (parent && parent->bytes) {
				294	char *parent_str = PyBytes_AsString(parent->bytes);
				295	if (!parent_str) {
				296	return;
				297	}
				298	substr = strstr(parent_str, expr_str);
				299	if (substr) {
				300	// The following is needed, in order to correctly shift the column
				301	// offset, in the case that (disregarding any whitespace) a newline
				302	// immediately follows the opening curly brace of the fstring expression.
				303	int newline_after_brace = 1;
				304	start = substr + 1;
				305	while (start && start != '}' && start != '\n') {
				306	if (start != ' ' && start != '\t' && *start != '\f') {
				307	newline_after_brace = 0;
				308	break;
				309	}
				310	start++;
				311	}
				312
				313	// Account for the characters from the last newline character to our
				314	// left until the beginning of substr.
				315	if (!newline_after_brace) {
				316	start = substr;
				317	while (start > parent_str && *start != '\n') {
				318	start--;
				319	}
				320	cols += (int)(substr - start);
				321	}
				322	/* adjust the start based on the number of newlines encountered
				323	before the f-string expression */
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	324	for (char* p = parent_str; p < substr; p++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	325	if (*p == '\n') {
				326	lines++;
				327	}
				328	}
				329	}
				330	}
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame^]	331	*p_lines = lines;
				332	*p_cols = cols;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	333	}
				334
				335
				336	/* Compile this expression in to an expr_ty. Add parens around the
				337	expression, in order to allow leading spaces in the expression. */
				338	static expr_ty
				339	fstring_compile_expr(Parser p, const char expr_start, const char *expr_end,
				340	Token *t)
				341	{
				342	expr_ty expr = NULL;
				343	char *str;
				344	Py_ssize_t len;
				345	const char *s;
				346	expr_ty result = NULL;
				347
				348	assert(expr_end >= expr_start);
				349	assert(*(expr_start-1) == '{');
				350	assert(expr_end == '}' \|\| expr_end == '!' \|\| *expr_end == ':' \|\|
				351	*expr_end == '=');
				352
				353	/* If the substring is all whitespace, it's an error. We need to catch this
				354	here, and not when we call PyParser_SimpleParseStringFlagsFilename,
				355	because turning the expression '' in to '()' would go from being invalid
				356	to valid. */
				357	for (s = expr_start; s != expr_end; s++) {
				358	char c = *s;
				359	/* The Python parser ignores only the following whitespace
				360	characters (\r already is converted to \n). */
				361	if (!(c == ' ' \|\| c == '\t' \|\| c == '\n' \|\| c == '\f')) {
				362	break;
				363	}
				364	}
				365	if (s == expr_end) {
				366	RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
				367	return NULL;
				368	}
				369
				370	len = expr_end - expr_start;
				371	/* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	372	str = PyMem_Malloc(len + 3);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	373	if (str == NULL) {
				374	PyErr_NoMemory();
				375	return NULL;
				376	}
				377
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame^]	378	// The call to fstring_find_expr_location is responsible for finding the column offset
				379	// the generated AST nodes need to be shifted to the right, which is equal to the number
				380	// of the f-string characters before the expression starts. In order to correctly compute
				381	// this offset, strstr gets called in fstring_find_expr_location which only succeeds
				382	// if curly braces appear before and after the f-string expression (exactly like they do
				383	// in the f-string itself), hence the following lines.
				384	str[0] = '{';
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	385	memcpy(str+1, expr_start, len);
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame^]	386	str[len+1] = '}';
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	387	str[len+2] = 0;
				388
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame^]	389	int lines, cols;
				390	fstring_find_expr_location(t, str, &lines, &cols);
				391
				392	// The parentheses are needed in order to allow for leading whitespace withing
				393	// the f-string expression. This consequently gets parsed as a group (see the
				394	// group rule in python.gram).
				395	str[0] = '(';
				396	str[len+1] = ')';
				397
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	398	struct tok_state* tok = PyTokenizer_FromString(str, 1);
				399	if (tok == NULL) {
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	400	PyMem_Free(str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	401	return NULL;
				402	}
Lysandros Nikolaou	791a46e	2020-05-26 04:24:31 +0300	[diff] [blame]	403	Py_INCREF(p->tok->filename);
				404	tok->filename = p->tok->filename;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	405
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	406	Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
				407	NULL, p->arena);
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame^]	408	p2->starting_lineno = t->lineno + lines - 1;
				409	p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	410
				411	expr = _PyPegen_run_parser(p2);
				412
				413	if (expr == NULL) {
				414	goto exit;
				415	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	416	result = expr;
				417
				418	exit:
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	419	PyMem_Free(str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	420	_PyPegen_Parser_Free(p2);
				421	PyTokenizer_Free(tok);
				422	return result;
				423	}
				424
				425	/* Return -1 on error.
				426
				427	Return 0 if we reached the end of the literal.
				428
				429	Return 1 if we haven't reached the end of the literal, but we want
				430	the caller to process the literal up to this point. Used for
				431	doubled braces.
				432	*/
				433	static int
				434	fstring_find_literal(Parser p, const char str, const char end, int raw,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	435	PyObject *literal, int recurse_lvl, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	436	{
				437	/* Get any literal string. It ends when we hit an un-doubled left
				438	brace (which isn't part of a unicode name escape such as
				439	"\N{EULER CONSTANT}"), or the end of the string. */
				440
				441	const char s = str;
				442	const char *literal_start = s;
				443	int result = 0;
				444
				445	assert(*literal == NULL);
				446	while (s < end) {
				447	char ch = *s++;
				448	if (!raw && ch == '\\' && s < end) {
				449	ch = *s++;
				450	if (ch == 'N') {
				451	if (s < end && *s++ == '{') {
				452	while (s < end && *s++ != '}') {
				453	}
				454	continue;
				455	}
				456	break;
				457	}
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	458	if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	459	return -1;
				460	}
				461	}
				462	if (ch == '{' \|\| ch == '}') {
				463	/* Check for doubled braces, but only at the top level. If
				464	we checked at every level, then f'{0:{3}}' would fail
				465	with the two closing braces. */
				466	if (recurse_lvl == 0) {
				467	if (s < end && *s == ch) {
				468	/* We're going to tell the caller that the literal ends
				469	here, but that they should continue scanning. But also
				470	skip over the second brace when we resume scanning. */
				471	*str = s + 1;
				472	result = 1;
				473	goto done;
				474	}
				475
				476	/* Where a single '{' is the start of a new expression, a
				477	single '}' is not allowed. */
				478	if (ch == '}') {
				479	*str = s - 1;
				480	RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
				481	return -1;
				482	}
				483	}
				484	/* We're either at a '{', which means we're starting another
				485	expression; or a '}', which means we're at the end of this
				486	f-string (for a nested format_spec). */
				487	s--;
				488	break;
				489	}
				490	}
				491	*str = s;
				492	assert(s <= end);
				493	assert(s == end \|\| s == '{' \|\| s == '}');
				494	done:
				495	if (literal_start != s) {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	496	if (raw) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	497	*literal = PyUnicode_DecodeUTF8Stateful(literal_start,
				498	s - literal_start,
				499	NULL, NULL);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	500	} else {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	501	*literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	502	s - literal_start, t);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	503	}
				504	if (!*literal) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	505	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	506	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	507	}
				508	return result;
				509	}
				510
				511	/* Forward declaration because parsing is recursive. */
				512	static expr_ty
				513	fstring_parse(Parser p, const char str, const char end, int raw, int recurse_lvl,
				514	Token first_token, Token t, Token *last_token);
				515
				516	/* Parse the f-string at str, ending at end. We know str starts an
				517	expression (so it must be a '{'). Returns the FormattedValue node, which
				518	includes the expression, conversion character, format_spec expression, and
				519	optionally the text of the expression (if = is used).
				520
				521	Note that I don't do a perfect job here: I don't make sure that a
				522	closing brace doesn't match an opening paren, for example. It
				523	doesn't need to error on all invalid expressions, just correctly
				524	find the end of all valid ones. Any errors inside the expression
				525	will be caught when we parse it later.
				526
				527	*expression is set to the expression. For an '=' "debug" expression,
				528	*expr_text is set to the debug text (the original text of the expression,
				529	including the '=' and any whitespace around it, as a string object). If
				530	not a debug expression, *expr_text set to NULL. */
				531	static int
				532	fstring_find_expr(Parser p, const char str, const char end, int raw, int recurse_lvl,
				533	PyObject *expr_text, expr_ty expression, Token *first_token,
				534	Token t, Token last_token)
				535	{
				536	/* Return -1 on error, else 0. */
				537
				538	const char *expr_start;
				539	const char *expr_end;
				540	expr_ty simple_expression;
				541	expr_ty format_spec = NULL; /* Optional format specifier. */
				542	int conversion = -1; /* The conversion char. Use default if not
				543	specified, or !r if using = and no format
				544	spec. */
				545
				546	/* 0 if we're not in a string, else the quote char we're trying to
				547	match (single or double quote). */
				548	char quote_char = 0;
				549
				550	/* If we're inside a string, 1=normal, 3=triple-quoted. */
				551	int string_type = 0;
				552
				553	/* Keep track of nesting level for braces/parens/brackets in
				554	expressions. */
				555	Py_ssize_t nested_depth = 0;
				556	char parenstack[MAXLEVEL];
				557
				558	*expr_text = NULL;
				559
				560	/* Can only nest one level deep. */
				561	if (recurse_lvl >= 2) {
				562	RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
				563	goto error;
				564	}
				565
				566	/* The first char must be a left brace, or we wouldn't have gotten
				567	here. Skip over it. */
				568	assert(**str == '{');
				569	*str += 1;
				570
				571	expr_start = *str;
				572	for (; str < end; (str)++) {
				573	char ch;
				574
				575	/* Loop invariants. */
				576	assert(nested_depth >= 0);
				577	assert(str >= expr_start && str < end);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	578	if (quote_char) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	579	assert(string_type == 1 \|\| string_type == 3);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	580	} else {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	581	assert(string_type == 0);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	582	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	583
				584	ch = **str;
				585	/* Nowhere inside an expression is a backslash allowed. */
				586	if (ch == '\\') {
				587	/* Error: can't include a backslash character, inside
				588	parens or strings or not. */
				589	RAISE_SYNTAX_ERROR(
				590	"f-string expression part "
				591	"cannot include a backslash");
				592	goto error;
				593	}
				594	if (quote_char) {
				595	/* We're inside a string. See if we're at the end. */
				596	/* This code needs to implement the same non-error logic
				597	as tok_get from tokenizer.c, at the letter_quote
				598	label. To actually share that code would be a
				599	nightmare. But, it's unlikely to change and is small,
				600	so duplicate it here. Note we don't need to catch all
				601	of the errors, since they'll be caught when parsing the
				602	expression. We just need to match the non-error
				603	cases. Thus we can ignore \n in single-quoted strings,
				604	for example. Or non-terminated strings. */
				605	if (ch == quote_char) {
				606	/* Does this match the string_type (single or triple
				607	quoted)? */
				608	if (string_type == 3) {
				609	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				610	/* We're at the end of a triple quoted string. */
				611	*str += 2;
				612	string_type = 0;
				613	quote_char = 0;
				614	continue;
				615	}
				616	} else {
				617	/* We're at the end of a normal string. */
				618	quote_char = 0;
				619	string_type = 0;
				620	continue;
				621	}
				622	}
				623	} else if (ch == '\'' \|\| ch == '"') {
				624	/* Is this a triple quoted string? */
				625	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				626	string_type = 3;
				627	*str += 2;
				628	} else {
				629	/* Start of a normal string. */
				630	string_type = 1;
				631	}
				632	/* Start looking for the end of the string. */
				633	quote_char = ch;
				634	} else if (ch == '[' \|\| ch == '{' \|\| ch == '(') {
				635	if (nested_depth >= MAXLEVEL) {
				636	RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
				637	goto error;
				638	}
				639	parenstack[nested_depth] = ch;
				640	nested_depth++;
				641	} else if (ch == '#') {
				642	/* Error: can't include a comment character, inside parens
				643	or not. */
				644	RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
				645	goto error;
				646	} else if (nested_depth == 0 &&
				647	(ch == '!' \|\| ch == ':' \|\| ch == '}' \|\|
				648	ch == '=' \|\| ch == '>' \|\| ch == '<')) {
				649	/* See if there's a next character. */
				650	if (*str+1 < end) {
				651	char next = (str+1);
				652
				653	/* For "!=". since '=' is not an allowed conversion character,
				654	nothing is lost in this test. */
				655	if ((ch == '!' && next == '=') \|\| /* != */
				656	(ch == '=' && next == '=') \|\| /* == */
				657	(ch == '<' && next == '=') \|\| /* <= */
				658	(ch == '>' && next == '=') /* >= */
				659	) {
				660	*str += 1;
				661	continue;
				662	}
				663	/* Don't get out of the loop for these, if they're single
				664	chars (not part of 2-char tokens). If by themselves, they
				665	don't end an expression (unlike say '!'). */
				666	if (ch == '>' \|\| ch == '<') {
				667	continue;
				668	}
				669	}
				670
				671	/* Normal way out of this loop. */
				672	break;
				673	} else if (ch == ']' \|\| ch == '}' \|\| ch == ')') {
				674	if (!nested_depth) {
				675	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
				676	goto error;
				677	}
				678	nested_depth--;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	679	int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	680	if (!((opening == '(' && ch == ')') \|\|
				681	(opening == '[' && ch == ']') \|\|
				682	(opening == '{' && ch == '}')))
				683	{
				684	RAISE_SYNTAX_ERROR(
				685	"f-string: closing parenthesis '%c' "
				686	"does not match opening parenthesis '%c'",
				687	ch, opening);
				688	goto error;
				689	}
				690	} else {
				691	/* Just consume this char and loop around. */
				692	}
				693	}
				694	expr_end = *str;
				695	/* If we leave this loop in a string or with mismatched parens, we
				696	don't care. We'll get a syntax error when compiling the
				697	expression. But, we can produce a better error message, so
				698	let's just do that.*/
				699	if (quote_char) {
				700	RAISE_SYNTAX_ERROR("f-string: unterminated string");
				701	goto error;
				702	}
				703	if (nested_depth) {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	704	int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	705	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
				706	goto error;
				707	}
				708
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	709	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	710	goto unexpected_end_of_string;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	711	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	712
				713	/* Compile the expression as soon as possible, so we show errors
				714	related to the expression before errors related to the
				715	conversion or format_spec. */
				716	simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	717	if (!simple_expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	718	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	719	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	720
				721	/* Check for =, which puts the text value of the expression in
				722	expr_text. */
				723	if (**str == '=') {
Pablo Galindo	9b83829	2020-05-27 22:01:11 +0100	[diff] [blame]	724	if (p->feature_version < 8) {
				725	RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
				726	"only supported in Python 3.8 and greater");
				727	goto error;
				728	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	729	*str += 1;
				730
				731	/* Skip over ASCII whitespace. No need to test for end of string
				732	here, since we know there's at least a trailing quote somewhere
				733	ahead. */
				734	while (Py_ISSPACE(**str)) {
				735	*str += 1;
				736	}
				737
				738	/* Set expr_text to the text of the expression. /
				739	expr_text = PyUnicode_FromStringAndSize(expr_start, str-expr_start);
				740	if (!*expr_text) {
				741	goto error;
				742	}
				743	}
				744
				745	/* Check for a conversion char, if present. */
				746	if (**str == '!') {
				747	*str += 1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	748	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	749	goto unexpected_end_of_string;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	750	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	751
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	752	conversion = (unsigned char)**str;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	753	*str += 1;
				754
				755	/* Validate the conversion. */
				756	if (!(conversion == 's' \|\| conversion == 'r' \|\| conversion == 'a')) {
				757	RAISE_SYNTAX_ERROR(
				758	"f-string: invalid conversion character: "
				759	"expected 's', 'r', or 'a'");
				760	goto error;
				761	}
				762
				763	}
				764
				765	/* Check for the format spec, if present. */
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	766	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	767	goto unexpected_end_of_string;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	768	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	769	if (**str == ':') {
				770	*str += 1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	771	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	772	goto unexpected_end_of_string;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	773	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	774
				775	/* Parse the format spec. */
				776	format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
				777	first_token, t, last_token);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	778	if (!format_spec) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	779	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	780	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	781	}
				782
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	783	if (str >= end \|\| *str != '}') {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	784	goto unexpected_end_of_string;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	785	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	786
				787	/* We're at a right brace. Consume it. */
				788	assert(*str < end);
				789	assert(**str == '}');
				790	*str += 1;
				791
				792	/* If we're in = mode (detected by non-NULL expr_text), and have no format
				793	spec and no explicit conversion, set the conversion to 'r'. */
				794	if (*expr_text && format_spec == NULL && conversion == -1) {
				795	conversion = 'r';
				796	}
				797
				798	/* And now create the FormattedValue node that represents this
				799	entire expression with the conversion and format spec. */
				800	//TODO: Fix this
				801	*expression = FormattedValue(simple_expression, conversion,
				802	format_spec, first_token->lineno,
				803	first_token->col_offset, last_token->end_lineno,
				804	last_token->end_col_offset, p->arena);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	805	if (!*expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	806	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	807	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	808
				809	return 0;
				810
				811	unexpected_end_of_string:
				812	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				813	/* Falls through to error. */
				814
				815	error:
				816	Py_XDECREF(*expr_text);
				817	return -1;
				818
				819	}
				820
				821	/* Return -1 on error.
				822
				823	Return 0 if we have a literal (possible zero length) and an
				824	expression (zero length if at the end of the string.
				825
				826	Return 1 if we have a literal, but no expression, and we want the
				827	caller to call us again. This is used to deal with doubled
				828	braces.
				829
				830	When called multiple times on the string 'a{{b{0}c', this function
				831	will return:
				832
				833	1. the literal 'a{' with no expression, and a return value
				834	of 1. Despite the fact that there's no expression, the return
				835	value of 1 means we're not finished yet.
				836
				837	2. the literal 'b' and the expression '0', with a return value of
				838	0. The fact that there's an expression means we're not finished.
				839
				840	3. literal 'c' with no expression and a return value of 0. The
				841	combination of the return value of 0 with no expression means
				842	we're finished.
				843	*/
				844	static int
				845	fstring_find_literal_and_expr(Parser p, const char str, const char end, int raw,
				846	int recurse_lvl, PyObject **literal,
				847	PyObject *expr_text, expr_ty expression,
				848	Token first_token, Token t, Token *last_token)
				849	{
				850	int result;
				851
				852	assert(literal == NULL && expression == NULL);
				853
				854	/* Get any literal string. */
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	855	result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	856	if (result < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	857	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	858	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	859
				860	assert(result == 0 \|\| result == 1);
				861
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	862	if (result == 1) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	863	/* We have a literal, but don't look at the expression. */
				864	return 1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	865	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	866
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	867	if (str >= end \|\| *str == '}') {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	868	/* We're at the end of the string or the end of a nested
				869	f-string: no expression. The top-level error case where we
				870	expect to be at the end of the string but we're at a '}' is
				871	handled later. */
				872	return 0;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	873	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	874
				875	/* We must now be the start of an expression, on a '{'. */
				876	assert(**str == '{');
				877
				878	if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	879	expression, first_token, t, last_token) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	880	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	881	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	882
				883	return 0;
				884
				885	error:
				886	Py_CLEAR(*literal);
				887	return -1;
				888	}
				889
				890	#ifdef NDEBUG
				891	#define ExprList_check_invariants(l)
				892	#else
				893	static void
				894	ExprList_check_invariants(ExprList *l)
				895	{
				896	/* Check our invariants. Make sure this object is "live", and
				897	hasn't been deallocated. */
				898	assert(l->size >= 0);
				899	assert(l->p != NULL);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	900	if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	901	assert(l->data == l->p);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	902	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	903	}
				904	#endif
				905
				906	static void
				907	ExprList_Init(ExprList *l)
				908	{
				909	l->allocated = EXPRLIST_N_CACHED;
				910	l->size = 0;
				911
				912	/* Until we start allocating dynamically, p points to data. */
				913	l->p = l->data;
				914
				915	ExprList_check_invariants(l);
				916	}
				917
				918	static int
				919	ExprList_Append(ExprList *l, expr_ty exp)
				920	{
				921	ExprList_check_invariants(l);
				922	if (l->size >= l->allocated) {
				923	/* We need to alloc (or realloc) the memory. */
				924	Py_ssize_t new_size = l->allocated * 2;
				925
				926	/* See if we've ever allocated anything dynamically. */
				927	if (l->p == l->data) {
				928	Py_ssize_t i;
				929	/* We're still using the cached data. Switch to
				930	alloc-ing. */
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	931	l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	932	if (!l->p) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	933	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	934	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	935	/* Copy the cached data into the new buffer. */
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	936	for (i = 0; i < l->size; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	937	l->p[i] = l->data[i];
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	938	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	939	} else {
				940	/* Just realloc. */
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	941	expr_ty tmp = PyMem_Realloc(l->p, sizeof(expr_ty) new_size);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	942	if (!tmp) {
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	943	PyMem_Free(l->p);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	944	l->p = NULL;
				945	return -1;
				946	}
				947	l->p = tmp;
				948	}
				949
				950	l->allocated = new_size;
				951	assert(l->allocated == 2 * l->size);
				952	}
				953
				954	l->p[l->size++] = exp;
				955
				956	ExprList_check_invariants(l);
				957	return 0;
				958	}
				959
				960	static void
				961	ExprList_Dealloc(ExprList *l)
				962	{
				963	ExprList_check_invariants(l);
				964
				965	/* If there's been an error, or we've never dynamically allocated,
				966	do nothing. */
				967	if (!l->p \|\| l->p == l->data) {
				968	/* Do nothing. */
				969	} else {
				970	/* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	971	PyMem_Free(l->p);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	972	}
				973	l->p = NULL;
				974	l->size = -1;
				975	}
				976
				977	static asdl_seq *
				978	ExprList_Finish(ExprList l, PyArena arena)
				979	{
				980	asdl_seq *seq;
				981
				982	ExprList_check_invariants(l);
				983
				984	/* Allocate the asdl_seq and copy the expressions in to it. */
				985	seq = _Py_asdl_seq_new(l->size, arena);
				986	if (seq) {
				987	Py_ssize_t i;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	988	for (i = 0; i < l->size; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	989	asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	990	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	991	}
				992	ExprList_Dealloc(l);
				993	return seq;
				994	}
				995
				996	#ifdef NDEBUG
				997	#define FstringParser_check_invariants(state)
				998	#else
				999	static void
				1000	FstringParser_check_invariants(FstringParser *state)
				1001	{
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1002	if (state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1003	assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1004	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1005	ExprList_check_invariants(&state->expr_list);
				1006	}
				1007	#endif
				1008
				1009	void
				1010	_PyPegen_FstringParser_Init(FstringParser *state)
				1011	{
				1012	state->last_str = NULL;
				1013	state->fmode = 0;
				1014	ExprList_Init(&state->expr_list);
				1015	FstringParser_check_invariants(state);
				1016	}
				1017
				1018	void
				1019	_PyPegen_FstringParser_Dealloc(FstringParser *state)
				1020	{
				1021	FstringParser_check_invariants(state);
				1022
				1023	Py_XDECREF(state->last_str);
				1024	ExprList_Dealloc(&state->expr_list);
				1025	}
				1026
				1027	/* Make a Constant node, but decref the PyUnicode object being added. */
				1028	static expr_ty
				1029	make_str_node_and_del(Parser p, PyObject str, Token first_token, Token *last_token)
				1030	{
				1031	PyObject s = str;
				1032	PyObject *kind = NULL;
				1033	*str = NULL;
				1034	assert(PyUnicode_CheckExact(s));
				1035	if (PyArena_AddPyObject(p->arena, s) < 0) {
				1036	Py_DECREF(s);
				1037	return NULL;
				1038	}
				1039	const char* the_str = PyBytes_AsString(first_token->bytes);
				1040	if (the_str && the_str[0] == 'u') {
				1041	kind = _PyPegen_new_identifier(p, "u");
				1042	}
				1043
				1044	if (kind == NULL && PyErr_Occurred()) {
				1045	return NULL;
				1046	}
				1047
				1048	return Constant(s, kind, first_token->lineno, first_token->col_offset,
				1049	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1050
				1051	}
				1052
				1053
				1054	/* Add a non-f-string (that is, a regular literal string). str is
				1055	decref'd. */
				1056	int
				1057	_PyPegen_FstringParser_ConcatAndDel(FstringParser state, PyObject str)
				1058	{
				1059	FstringParser_check_invariants(state);
				1060
				1061	assert(PyUnicode_CheckExact(str));
				1062
				1063	if (PyUnicode_GET_LENGTH(str) == 0) {
				1064	Py_DECREF(str);
				1065	return 0;
				1066	}
				1067
				1068	if (!state->last_str) {
				1069	/* We didn't have a string before, so just remember this one. */
				1070	state->last_str = str;
				1071	} else {
				1072	/* Concatenate this with the previous string. */
				1073	PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1074	if (!state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1075	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1076	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1077	}
				1078	FstringParser_check_invariants(state);
				1079	return 0;
				1080	}
				1081
				1082	/* Parse an f-string. The f-string is in *str to end, with no
				1083	'f' or quotes. */
				1084	int
				1085	_PyPegen_FstringParser_ConcatFstring(Parser p, FstringParser state, const char **str,
				1086	const char *end, int raw, int recurse_lvl,
				1087	Token first_token, Token t, Token *last_token)
				1088	{
				1089	FstringParser_check_invariants(state);
				1090	state->fmode = 1;
				1091
				1092	/* Parse the f-string. */
				1093	while (1) {
				1094	PyObject *literal = NULL;
				1095	PyObject *expr_text = NULL;
				1096	expr_ty expression = NULL;
				1097
				1098	/* If there's a zero length literal in front of the
				1099	expression, literal will be NULL. If we're at the end of
				1100	the f-string, expression will be NULL (unless result == 1,
				1101	see below). */
				1102	int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
				1103	&literal, &expr_text,
				1104	&expression, first_token, t, last_token);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1105	if (result < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1106	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1107	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1108
				1109	/* Add the literal, if any. */
				1110	if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
				1111	Py_XDECREF(expr_text);
				1112	return -1;
				1113	}
				1114	/* Add the expr_text, if any. */
				1115	if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
				1116	return -1;
				1117	}
				1118
				1119	/* We've dealt with the literal and expr_text, their ownership has
				1120	been transferred to the state object. Don't look at them again. */
				1121
				1122	/* See if we should just loop around to get the next literal
				1123	and expression, while ignoring the expression this
				1124	time. This is used for un-doubling braces, as an
				1125	optimization. */
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1126	if (result == 1) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1127	continue;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1128	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1129
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1130	if (!expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1131	/* We're done with this f-string. */
				1132	break;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1133	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1134
				1135	/* We know we have an expression. Convert any existing string
				1136	to a Constant node. */
				1137	if (!state->last_str) {
				1138	/* Do nothing. No previous literal. */
				1139	} else {
				1140	/* Convert the existing last_str literal to a Constant node. */
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1141	expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1142	if (!last_str \|\| ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1143	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1144	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1145	}
				1146
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1147	if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1148	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1149	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1150	}
				1151
				1152	/* If recurse_lvl is zero, then we must be at the end of the
				1153	string. Otherwise, we must be at a right brace. */
				1154
				1155	if (recurse_lvl == 0 && *str < end-1) {
				1156	RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
				1157	return -1;
				1158	}
				1159	if (recurse_lvl != 0 && **str != '}') {
				1160	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				1161	return -1;
				1162	}
				1163
				1164	FstringParser_check_invariants(state);
				1165	return 0;
				1166	}
				1167
				1168	/* Convert the partial state reflected in last_str and expr_list to an
				1169	expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
				1170	expr_ty
				1171	_PyPegen_FstringParser_Finish(Parser p, FstringParser state, Token* first_token,
				1172	Token *last_token)
				1173	{
				1174	asdl_seq *seq;
				1175
				1176	FstringParser_check_invariants(state);
				1177
				1178	/* If we're just a constant string with no expressions, return
				1179	that. */
				1180	if (!state->fmode) {
				1181	assert(!state->expr_list.size);
				1182	if (!state->last_str) {
				1183	/* Create a zero length string. */
				1184	state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1185	if (!state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1186	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1187	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1188	}
				1189	return make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1190	}
				1191
				1192	/* Create a Constant node out of last_str, if needed. It will be the
				1193	last node in our expression list. */
				1194	if (state->last_str) {
				1195	expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1196	if (!str \|\| ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1197	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1198	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1199	}
				1200	/* This has already been freed. */
				1201	assert(state->last_str == NULL);
				1202
				1203	seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1204	if (!seq) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1205	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1206	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1207
				1208	return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
				1209	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1210
				1211	error:
				1212	_PyPegen_FstringParser_Dealloc(state);
				1213	return NULL;
				1214	}
				1215
				1216	/* Given an f-string (with no 'f' or quotes) that's in *str and ends
				1217	at end, parse it into an expr_ty. Return NULL on error. Adjust
				1218	str to point past the parsed portion. */
				1219	static expr_ty
				1220	fstring_parse(Parser p, const char str, const char end, int raw,
				1221	int recurse_lvl, Token first_token, Token t, Token *last_token)
				1222	{
				1223	FstringParser state;
				1224
				1225	_PyPegen_FstringParser_Init(&state);
				1226	if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
				1227	first_token, t, last_token) < 0) {
				1228	_PyPegen_FstringParser_Dealloc(&state);
				1229	return NULL;
				1230	}
				1231
				1232	return _PyPegen_FstringParser_Finish(p, &state, t, t);
				1233	}