Blame - Parser/pegen/parse_string.c - platform/external/python/cpython3

blob: 7b02bdde645e80f66fe562520854b16d422f554f [file] [log] [blame]

Miss Islington (bot)	961703c	2020-07-16 06:25:31 -0700	[diff] [blame]	1	#include <stdbool.h>
				2
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	3	#include <Python.h>
				4
				5	#include "../tokenizer.h"
				6	#include "pegen.h"
				7	#include "parse_string.h"
				8
				9	//// STRING HANDLING FUNCTIONS ////
				10
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser nodes
// to pass around and the usage of some specialized APIs present only in this
// file (like "_PyPegen_raise_syntax_error").
				15
				16	static int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	17	warn_invalid_escape_sequence(Parser p, unsigned char first_invalid_escape_char, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	18	{
				19	PyObject *msg =
				20	PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
				21	if (msg == NULL) {
				22	return -1;
				23	}
				24	if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	25	t->lineno, NULL, NULL) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	26	if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
				27	/* Replace the DeprecationWarning exception with a SyntaxError
				28	to get a more accurate error report */
				29	PyErr_Clear();
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	30
				31	/* This is needed, in order for the SyntaxError to point to the token t,
				32	since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
				33	error location, if p->known_err_token is not set. */
				34	p->known_err_token = t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	35	RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
				36	}
				37	Py_DECREF(msg);
				38	return -1;
				39	}
				40	Py_DECREF(msg);
				41	return 0;
				42	}
				43
				44	static PyObject *
				45	decode_utf8(const char *sPtr, const char end)
				46	{
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	47	const char *s;
				48	const char *t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	49	t = s = *sPtr;
				50	while (s < end && (*s & 0x80)) {
				51	s++;
				52	}
				53	*sPtr = s;
				54	return PyUnicode_DecodeUTF8(t, s - t, NULL);
				55	}
				56
				57	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	58	decode_unicode_with_escapes(Parser parser, const char s, size_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	59	{
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	60	PyObject *v;
				61	PyObject *u;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	62	char *buf;
				63	char *p;
				64	const char *end;
				65
				66	/* check for integer overflow */
				67	if (len > SIZE_MAX / 6) {
				68	return NULL;
				69	}
				70	/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
				71	"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
				72	u = PyBytes_FromStringAndSize((char )NULL, len 6);
				73	if (u == NULL) {
				74	return NULL;
				75	}
				76	p = buf = PyBytes_AsString(u);
				77	end = s + len;
				78	while (s < end) {
				79	if (*s == '\\') {
				80	p++ = s++;
				81	if (s >= end \|\| *s & 0x80) {
				82	strcpy(p, "u005c");
				83	p += 5;
				84	if (s >= end) {
				85	break;
				86	}
				87	}
				88	}
				89	if (*s & 0x80) {
				90	PyObject *w;
				91	int kind;
				92	void *data;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	93	Py_ssize_t w_len;
				94	Py_ssize_t i;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	95	w = decode_utf8(&s, end);
				96	if (w == NULL) {
				97	Py_DECREF(u);
				98	return NULL;
				99	}
				100	kind = PyUnicode_KIND(w);
				101	data = PyUnicode_DATA(w);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	102	w_len = PyUnicode_GET_LENGTH(w);
				103	for (i = 0; i < w_len; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	104	Py_UCS4 chr = PyUnicode_READ(kind, data, i);
				105	sprintf(p, "\\U%08x", chr);
				106	p += 10;
				107	}
				108	/* Should be impossible to overflow */
				109	assert(p - buf <= PyBytes_GET_SIZE(u));
				110	Py_DECREF(w);
				111	}
				112	else {
				113	p++ = s++;
				114	}
				115	}
				116	len = p - buf;
				117	s = buf;
				118
				119	const char *first_invalid_escape;
				120	v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
				121
				122	if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	123	if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	124	/* We have not decref u before because first_invalid_escape points
				125	inside u. */
				126	Py_XDECREF(u);
				127	Py_DECREF(v);
				128	return NULL;
				129	}
				130	}
				131	Py_XDECREF(u);
				132	return v;
				133	}
				134
				135	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	136	decode_bytes_with_escapes(Parser p, const char s, Py_ssize_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	137	{
				138	const char *first_invalid_escape;
				139	PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
				140	if (result == NULL) {
				141	return NULL;
				142	}
				143
				144	if (first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	145	if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	146	Py_DECREF(result);
				147	return NULL;
				148	}
				149	}
				150	return result;
				151	}
				152
				153	/* s must include the bracketing quote characters, and r, b, u,
				154	&/or f prefixes (if any), and embedded escape sequences (if any).
				155	_PyPegen_parsestr parses it, and sets *result to decoded Python string object.
				156	If the string is an f-string, set fstr and fstrlen to the unparsed
				157	string object. Return 0 if no errors occurred. */
				158	int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	159	_PyPegen_parsestr(Parser p, int bytesmode, int rawmode, PyObject *result,
				160	const char *fstr, Py_ssize_t fstrlen, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	161	{
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	162	const char *s = PyBytes_AsString(t->bytes);
				163	if (s == NULL) {
				164	return -1;
				165	}
				166
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	167	size_t len;
				168	int quote = Py_CHARMASK(*s);
				169	int fmode = 0;
				170	*bytesmode = 0;
				171	*rawmode = 0;
				172	*result = NULL;
				173	*fstr = NULL;
				174	if (Py_ISALPHA(quote)) {
				175	while (!bytesmode \|\| !rawmode) {
				176	if (quote == 'b' \|\| quote == 'B') {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	177	quote =(unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	178	*bytesmode = 1;
				179	}
				180	else if (quote == 'u' \|\| quote == 'U') {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	181	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	182	}
				183	else if (quote == 'r' \|\| quote == 'R') {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	184	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	185	*rawmode = 1;
				186	}
				187	else if (quote == 'f' \|\| quote == 'F') {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	188	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	189	fmode = 1;
				190	}
				191	else {
				192	break;
				193	}
				194	}
				195	}
				196
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	197	/* fstrings are only allowed in Python 3.6 and greater */
				198	if (fmode && p->feature_version < 6) {
				199	p->error_indicator = 1;
				200	RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
				201	return -1;
				202	}
				203
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	204	if (fmode && *bytesmode) {
				205	PyErr_BadInternalCall();
				206	return -1;
				207	}
				208	if (quote != '\'' && quote != '\"') {
				209	PyErr_BadInternalCall();
				210	return -1;
				211	}
				212	/* Skip the leading quote char. */
				213	s++;
				214	len = strlen(s);
				215	if (len > INT_MAX) {
				216	PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
				217	return -1;
				218	}
				219	if (s[--len] != quote) {
				220	/* Last quote char must match the first. */
				221	PyErr_BadInternalCall();
				222	return -1;
				223	}
				224	if (len >= 4 && s[0] == quote && s[1] == quote) {
				225	/* A triple quoted string. We've already skipped one quote at
				226	the start and one at the end of the string. Now skip the
				227	two at the start. */
				228	s += 2;
				229	len -= 2;
				230	/* And check that the last two match. */
				231	if (s[--len] != quote \|\| s[--len] != quote) {
				232	PyErr_BadInternalCall();
				233	return -1;
				234	}
				235	}
				236
				237	if (fmode) {
				238	/* Just return the bytes. The caller will parse the resulting
				239	string. */
				240	*fstr = s;
				241	*fstrlen = len;
				242	return 0;
				243	}
				244
				245	/* Not an f-string. */
				246	/* Avoid invoking escape decoding routines if possible. */
				247	rawmode = rawmode \|\| strchr(s, '\\') == NULL;
				248	if (*bytesmode) {
				249	/* Disallow non-ASCII characters. */
				250	const char *ch;
				251	for (ch = s; *ch; ch++) {
				252	if (Py_CHARMASK(*ch) >= 0x80) {
				253	RAISE_SYNTAX_ERROR(
				254	"bytes can only contain ASCII "
				255	"literal characters.");
				256	return -1;
				257	}
				258	}
				259	if (*rawmode) {
				260	*result = PyBytes_FromStringAndSize(s, len);
				261	}
				262	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	263	*result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	264	}
				265	}
				266	else {
				267	if (*rawmode) {
				268	*result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
				269	}
				270	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	271	*result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	272	}
				273	}
				274	return *result == NULL ? -1 : 0;
				275	}
				276
				277
				278
				279	// FSTRING STUFF
				280
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	281	/* Fix locations for the given node and its children.
				282
				283	`parent` is the enclosing node.
				284	`n` is the node which locations are going to be fixed relative to parent.
				285	`expr_str` is the child node's string representation, including braces.
				286	*/
Miss Islington (bot)	961703c	2020-07-16 06:25:31 -0700	[diff] [blame]	287	static bool
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame]	288	fstring_find_expr_location(Token parent, char expr_str, int p_lines, int p_cols)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	289	{
Miss Islington (bot)	961703c	2020-07-16 06:25:31 -0700	[diff] [blame]	290	*p_lines = 0;
				291	*p_cols = 0;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	292	if (parent && parent->bytes) {
				293	char *parent_str = PyBytes_AsString(parent->bytes);
				294	if (!parent_str) {
Miss Islington (bot)	961703c	2020-07-16 06:25:31 -0700	[diff] [blame]	295	return false;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	296	}
Miss Islington (bot)	961703c	2020-07-16 06:25:31 -0700	[diff] [blame]	297	char *substr = strstr(parent_str, expr_str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	298	if (substr) {
				299	// The following is needed, in order to correctly shift the column
				300	// offset, in the case that (disregarding any whitespace) a newline
				301	// immediately follows the opening curly brace of the fstring expression.
Miss Islington (bot)	961703c	2020-07-16 06:25:31 -0700	[diff] [blame]	302	bool newline_after_brace = 1;
				303	char *start = substr + 1;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	304	while (start && start != '}' && start != '\n') {
				305	if (start != ' ' && start != '\t' && *start != '\f') {
				306	newline_after_brace = 0;
				307	break;
				308	}
				309	start++;
				310	}
				311
				312	// Account for the characters from the last newline character to our
				313	// left until the beginning of substr.
				314	if (!newline_after_brace) {
				315	start = substr;
				316	while (start > parent_str && *start != '\n') {
				317	start--;
				318	}
Miss Islington (bot)	961703c	2020-07-16 06:25:31 -0700	[diff] [blame]	319	*p_cols += (int)(substr - start);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	320	}
				321	/* adjust the start based on the number of newlines encountered
				322	before the f-string expression */
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	323	for (char* p = parent_str; p < substr; p++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	324	if (*p == '\n') {
Miss Islington (bot)	961703c	2020-07-16 06:25:31 -0700	[diff] [blame]	325	(*p_lines)++;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	326	}
				327	}
				328	}
				329	}
Miss Islington (bot)	961703c	2020-07-16 06:25:31 -0700	[diff] [blame]	330	return true;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	331	}
				332
				333
				334	/* Compile this expression in to an expr_ty. Add parens around the
				335	expression, in order to allow leading spaces in the expression. */
				336	static expr_ty
				337	fstring_compile_expr(Parser p, const char expr_start, const char *expr_end,
				338	Token *t)
				339	{
				340	expr_ty expr = NULL;
				341	char *str;
				342	Py_ssize_t len;
				343	const char *s;
				344	expr_ty result = NULL;
				345
				346	assert(expr_end >= expr_start);
				347	assert(*(expr_start-1) == '{');
				348	assert(expr_end == '}' \|\| expr_end == '!' \|\| *expr_end == ':' \|\|
				349	*expr_end == '=');
				350
				351	/* If the substring is all whitespace, it's an error. We need to catch this
				352	here, and not when we call PyParser_SimpleParseStringFlagsFilename,
				353	because turning the expression '' in to '()' would go from being invalid
				354	to valid. */
				355	for (s = expr_start; s != expr_end; s++) {
				356	char c = *s;
				357	/* The Python parser ignores only the following whitespace
				358	characters (\r already is converted to \n). */
				359	if (!(c == ' ' \|\| c == '\t' \|\| c == '\n' \|\| c == '\f')) {
				360	break;
				361	}
				362	}
				363	if (s == expr_end) {
				364	RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
				365	return NULL;
				366	}
				367
				368	len = expr_end - expr_start;
				369	/* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	370	str = PyMem_Malloc(len + 3);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	371	if (str == NULL) {
				372	PyErr_NoMemory();
				373	return NULL;
				374	}
				375
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame]	376	// The call to fstring_find_expr_location is responsible for finding the column offset
				377	// the generated AST nodes need to be shifted to the right, which is equal to the number
				378	// of the f-string characters before the expression starts. In order to correctly compute
				379	// this offset, strstr gets called in fstring_find_expr_location which only succeeds
				380	// if curly braces appear before and after the f-string expression (exactly like they do
				381	// in the f-string itself), hence the following lines.
				382	str[0] = '{';
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	383	memcpy(str+1, expr_start, len);
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame]	384	str[len+1] = '}';
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	385	str[len+2] = 0;
				386
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame]	387	int lines, cols;
Miss Islington (bot)	961703c	2020-07-16 06:25:31 -0700	[diff] [blame]	388	if (!fstring_find_expr_location(t, str, &lines, &cols)) {
				389	PyMem_FREE(str);
				390	return NULL;
				391	}
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame]	392
Miss Islington (bot)	9d8b8c3	2020-07-16 09:30:19 -0700	[diff] [blame^]	393	// The parentheses are needed in order to allow for leading whitespace within
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame]	394	// the f-string expression. This consequently gets parsed as a group (see the
				395	// group rule in python.gram).
				396	str[0] = '(';
				397	str[len+1] = ')';
				398
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	399	struct tok_state* tok = PyTokenizer_FromString(str, 1);
				400	if (tok == NULL) {
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	401	PyMem_Free(str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	402	return NULL;
				403	}
Lysandros Nikolaou	791a46e	2020-05-26 04:24:31 +0300	[diff] [blame]	404	Py_INCREF(p->tok->filename);
				405	tok->filename = p->tok->filename;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	406
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	407	Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
				408	NULL, p->arena);
Pablo Galindo	dab533d	2020-06-28 01:15:28 +0100	[diff] [blame]	409	p2->starting_lineno = t->lineno + lines - 1;
				410	p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	411
				412	expr = _PyPegen_run_parser(p2);
				413
				414	if (expr == NULL) {
				415	goto exit;
				416	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	417	result = expr;
				418
				419	exit:
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	420	PyMem_Free(str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	421	_PyPegen_Parser_Free(p2);
				422	PyTokenizer_Free(tok);
				423	return result;
				424	}
				425
				426	/* Return -1 on error.
				427
				428	Return 0 if we reached the end of the literal.
				429
				430	Return 1 if we haven't reached the end of the literal, but we want
				431	the caller to process the literal up to this point. Used for
				432	doubled braces.
				433	*/
				434	static int
				435	fstring_find_literal(Parser p, const char str, const char end, int raw,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	436	PyObject *literal, int recurse_lvl, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	437	{
				438	/* Get any literal string. It ends when we hit an un-doubled left
				439	brace (which isn't part of a unicode name escape such as
				440	"\N{EULER CONSTANT}"), or the end of the string. */
				441
				442	const char s = str;
				443	const char *literal_start = s;
				444	int result = 0;
				445
				446	assert(*literal == NULL);
				447	while (s < end) {
				448	char ch = *s++;
				449	if (!raw && ch == '\\' && s < end) {
				450	ch = *s++;
				451	if (ch == 'N') {
				452	if (s < end && *s++ == '{') {
				453	while (s < end && *s++ != '}') {
				454	}
				455	continue;
				456	}
				457	break;
				458	}
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	459	if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	460	return -1;
				461	}
				462	}
				463	if (ch == '{' \|\| ch == '}') {
				464	/* Check for doubled braces, but only at the top level. If
				465	we checked at every level, then f'{0:{3}}' would fail
				466	with the two closing braces. */
				467	if (recurse_lvl == 0) {
				468	if (s < end && *s == ch) {
				469	/* We're going to tell the caller that the literal ends
				470	here, but that they should continue scanning. But also
				471	skip over the second brace when we resume scanning. */
				472	*str = s + 1;
				473	result = 1;
				474	goto done;
				475	}
				476
				477	/* Where a single '{' is the start of a new expression, a
				478	single '}' is not allowed. */
				479	if (ch == '}') {
				480	*str = s - 1;
				481	RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
				482	return -1;
				483	}
				484	}
				485	/* We're either at a '{', which means we're starting another
				486	expression; or a '}', which means we're at the end of this
				487	f-string (for a nested format_spec). */
				488	s--;
				489	break;
				490	}
				491	}
				492	*str = s;
				493	assert(s <= end);
				494	assert(s == end \|\| s == '{' \|\| s == '}');
				495	done:
				496	if (literal_start != s) {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	497	if (raw) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	498	*literal = PyUnicode_DecodeUTF8Stateful(literal_start,
				499	s - literal_start,
				500	NULL, NULL);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	501	} else {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	502	*literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	503	s - literal_start, t);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	504	}
				505	if (!*literal) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	506	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	507	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	508	}
				509	return result;
				510	}
				511
				512	/* Forward declaration because parsing is recursive. */
				513	static expr_ty
				514	fstring_parse(Parser p, const char str, const char end, int raw, int recurse_lvl,
				515	Token first_token, Token t, Token *last_token);
				516
/* Parse the f-string at *str, ending at end.  We know *str starts an
   expression (so it must be a '{'). Returns the FormattedValue node, which
   includes the expression, conversion character, format_spec expression, and
   optionally the text of the expression (if = is used).

   Note that I don't do a perfect job here: I don't make sure that a
   closing brace doesn't match an opening paren, for example. It
   doesn't need to error on all invalid expressions, just correctly
   find the end of all valid ones. Any errors inside the expression
   will be caught when we parse it later.

   *expression is set to the expression.  For an '=' "debug" expression,
   *expr_text is set to the debug text (the original text of the expression,
   including the '=' and any whitespace around it, as a string object).  If
   not a debug expression, *expr_text set to NULL. */
				532	static int
				533	fstring_find_expr(Parser p, const char str, const char end, int raw, int recurse_lvl,
				534	PyObject *expr_text, expr_ty expression, Token *first_token,
				535	Token t, Token last_token)
				536	{
				537	/* Return -1 on error, else 0. */
				538
				539	const char *expr_start;
				540	const char *expr_end;
				541	expr_ty simple_expression;
				542	expr_ty format_spec = NULL; /* Optional format specifier. */
				543	int conversion = -1; /* The conversion char. Use default if not
				544	specified, or !r if using = and no format
				545	spec. */
				546
				547	/* 0 if we're not in a string, else the quote char we're trying to
				548	match (single or double quote). */
				549	char quote_char = 0;
				550
				551	/* If we're inside a string, 1=normal, 3=triple-quoted. */
				552	int string_type = 0;
				553
				554	/* Keep track of nesting level for braces/parens/brackets in
				555	expressions. */
				556	Py_ssize_t nested_depth = 0;
				557	char parenstack[MAXLEVEL];
				558
				559	*expr_text = NULL;
				560
				561	/* Can only nest one level deep. */
				562	if (recurse_lvl >= 2) {
				563	RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
				564	goto error;
				565	}
				566
				567	/* The first char must be a left brace, or we wouldn't have gotten
				568	here. Skip over it. */
				569	assert(**str == '{');
				570	*str += 1;
				571
				572	expr_start = *str;
				573	for (; str < end; (str)++) {
				574	char ch;
				575
				576	/* Loop invariants. */
				577	assert(nested_depth >= 0);
				578	assert(str >= expr_start && str < end);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	579	if (quote_char) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	580	assert(string_type == 1 \|\| string_type == 3);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	581	} else {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	582	assert(string_type == 0);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	583	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	584
				585	ch = **str;
				586	/* Nowhere inside an expression is a backslash allowed. */
				587	if (ch == '\\') {
				588	/* Error: can't include a backslash character, inside
				589	parens or strings or not. */
				590	RAISE_SYNTAX_ERROR(
				591	"f-string expression part "
				592	"cannot include a backslash");
				593	goto error;
				594	}
				595	if (quote_char) {
				596	/* We're inside a string. See if we're at the end. */
				597	/* This code needs to implement the same non-error logic
				598	as tok_get from tokenizer.c, at the letter_quote
				599	label. To actually share that code would be a
				600	nightmare. But, it's unlikely to change and is small,
				601	so duplicate it here. Note we don't need to catch all
				602	of the errors, since they'll be caught when parsing the
				603	expression. We just need to match the non-error
				604	cases. Thus we can ignore \n in single-quoted strings,
				605	for example. Or non-terminated strings. */
				606	if (ch == quote_char) {
				607	/* Does this match the string_type (single or triple
				608	quoted)? */
				609	if (string_type == 3) {
				610	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				611	/* We're at the end of a triple quoted string. */
				612	*str += 2;
				613	string_type = 0;
				614	quote_char = 0;
				615	continue;
				616	}
				617	} else {
				618	/* We're at the end of a normal string. */
				619	quote_char = 0;
				620	string_type = 0;
				621	continue;
				622	}
				623	}
				624	} else if (ch == '\'' \|\| ch == '"') {
				625	/* Is this a triple quoted string? */
				626	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				627	string_type = 3;
				628	*str += 2;
				629	} else {
				630	/* Start of a normal string. */
				631	string_type = 1;
				632	}
				633	/* Start looking for the end of the string. */
				634	quote_char = ch;
				635	} else if (ch == '[' \|\| ch == '{' \|\| ch == '(') {
				636	if (nested_depth >= MAXLEVEL) {
				637	RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
				638	goto error;
				639	}
				640	parenstack[nested_depth] = ch;
				641	nested_depth++;
				642	} else if (ch == '#') {
				643	/* Error: can't include a comment character, inside parens
				644	or not. */
				645	RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
				646	goto error;
				647	} else if (nested_depth == 0 &&
				648	(ch == '!' \|\| ch == ':' \|\| ch == '}' \|\|
				649	ch == '=' \|\| ch == '>' \|\| ch == '<')) {
				650	/* See if there's a next character. */
				651	if (*str+1 < end) {
				652	char next = (str+1);
				653
				654	/* For "!=". since '=' is not an allowed conversion character,
				655	nothing is lost in this test. */
				656	if ((ch == '!' && next == '=') \|\| /* != */
				657	(ch == '=' && next == '=') \|\| /* == */
				658	(ch == '<' && next == '=') \|\| /* <= */
				659	(ch == '>' && next == '=') /* >= */
				660	) {
				661	*str += 1;
				662	continue;
				663	}
				664	/* Don't get out of the loop for these, if they're single
				665	chars (not part of 2-char tokens). If by themselves, they
				666	don't end an expression (unlike say '!'). */
				667	if (ch == '>' \|\| ch == '<') {
				668	continue;
				669	}
				670	}
				671
				672	/* Normal way out of this loop. */
				673	break;
				674	} else if (ch == ']' \|\| ch == '}' \|\| ch == ')') {
				675	if (!nested_depth) {
				676	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
				677	goto error;
				678	}
				679	nested_depth--;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	680	int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	681	if (!((opening == '(' && ch == ')') \|\|
				682	(opening == '[' && ch == ']') \|\|
				683	(opening == '{' && ch == '}')))
				684	{
				685	RAISE_SYNTAX_ERROR(
				686	"f-string: closing parenthesis '%c' "
				687	"does not match opening parenthesis '%c'",
				688	ch, opening);
				689	goto error;
				690	}
				691	} else {
				692	/* Just consume this char and loop around. */
				693	}
				694	}
				695	expr_end = *str;
				696	/* If we leave this loop in a string or with mismatched parens, we
				697	don't care. We'll get a syntax error when compiling the
				698	expression. But, we can produce a better error message, so
				699	let's just do that.*/
				700	if (quote_char) {
				701	RAISE_SYNTAX_ERROR("f-string: unterminated string");
				702	goto error;
				703	}
				704	if (nested_depth) {
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	705	int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	706	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
				707	goto error;
				708	}
				709
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	710	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	711	goto unexpected_end_of_string;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	712	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	713
				714	/* Compile the expression as soon as possible, so we show errors
				715	related to the expression before errors related to the
				716	conversion or format_spec. */
				717	simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	718	if (!simple_expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	719	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	720	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	721
				722	/* Check for =, which puts the text value of the expression in
				723	expr_text. */
				724	if (**str == '=') {
Pablo Galindo	9b83829	2020-05-27 22:01:11 +0100	[diff] [blame]	725	if (p->feature_version < 8) {
				726	RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
				727	"only supported in Python 3.8 and greater");
				728	goto error;
				729	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	730	*str += 1;
				731
				732	/* Skip over ASCII whitespace. No need to test for end of string
				733	here, since we know there's at least a trailing quote somewhere
				734	ahead. */
				735	while (Py_ISSPACE(**str)) {
				736	*str += 1;
				737	}
				738
				739	/* Set expr_text to the text of the expression. */
				740	expr_text = PyUnicode_FromStringAndSize(expr_start, str-expr_start);
				741	if (!*expr_text) {
				742	goto error;
				743	}
				744	}
				745
				746	/* Check for a conversion char, if present. */
				747	if (**str == '!') {
				748	*str += 1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	749	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	750	goto unexpected_end_of_string;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	751	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	752
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	753	conversion = (unsigned char)**str;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	754	*str += 1;
				755
				756	/* Validate the conversion. */
				757	if (!(conversion == 's' \|\| conversion == 'r' \|\| conversion == 'a')) {
				758	RAISE_SYNTAX_ERROR(
				759	"f-string: invalid conversion character: "
				760	"expected 's', 'r', or 'a'");
				761	goto error;
				762	}
				763
				764	}
				765
				766	/* Check for the format spec, if present. */
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	767	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	768	goto unexpected_end_of_string;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	769	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	770	if (**str == ':') {
				771	*str += 1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	772	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	773	goto unexpected_end_of_string;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	774	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	775
				776	/* Parse the format spec. */
				777	format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
				778	first_token, t, last_token);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	779	if (!format_spec) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	780	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	781	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	782	}
				783
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	784	if (str >= end \|\| *str != '}') {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	785	goto unexpected_end_of_string;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	786	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	787
				788	/* We're at a right brace. Consume it. */
				789	assert(*str < end);
				790	assert(**str == '}');
				791	*str += 1;
				792
				793	/* If we're in = mode (detected by non-NULL expr_text), and have no format
				794	spec and no explicit conversion, set the conversion to 'r'. */
				795	if (*expr_text && format_spec == NULL && conversion == -1) {
				796	conversion = 'r';
				797	}
				798
				799	/* And now create the FormattedValue node that represents this
				800	entire expression with the conversion and format spec. */
				801	//TODO: Fix this
				802	*expression = FormattedValue(simple_expression, conversion,
				803	format_spec, first_token->lineno,
				804	first_token->col_offset, last_token->end_lineno,
				805	last_token->end_col_offset, p->arena);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	806	if (!*expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	807	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	808	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	809
				810	return 0;
				811
				812	unexpected_end_of_string:
				813	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				814	/* Falls through to error. */
				815
				816	error:
				817	Py_XDECREF(*expr_text);
				818	return -1;
				819
				820	}
				821
				822	/* Return -1 on error.
				823
				824	Return 0 if we have a literal (possible zero length) and an
				825	expression (zero length if at the end of the string.
				826
				827	Return 1 if we have a literal, but no expression, and we want the
				828	caller to call us again. This is used to deal with doubled
				829	braces.
				830
				831	When called multiple times on the string 'a{{b{0}c', this function
				832	will return:
				833
				834	1. the literal 'a{' with no expression, and a return value
				835	of 1. Despite the fact that there's no expression, the return
				836	value of 1 means we're not finished yet.
				837
				838	2. the literal 'b' and the expression '0', with a return value of
				839	0. The fact that there's an expression means we're not finished.
				840
				841	3. literal 'c' with no expression and a return value of 0. The
				842	combination of the return value of 0 with no expression means
				843	we're finished.
				844	*/
				845	static int
				846	fstring_find_literal_and_expr(Parser p, const char str, const char end, int raw,
				847	int recurse_lvl, PyObject **literal,
				848	PyObject *expr_text, expr_ty expression,
				849	Token first_token, Token t, Token *last_token)
				850	{
				851	int result;
				852
				853	assert(literal == NULL && expression == NULL);
				854
				855	/* Get any literal string. */
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	856	result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	857	if (result < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	858	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	859	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	860
				861	assert(result == 0 \|\| result == 1);
				862
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	863	if (result == 1) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	864	/* We have a literal, but don't look at the expression. */
				865	return 1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	866	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	867
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	868	if (str >= end \|\| *str == '}') {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	869	/* We're at the end of the string or the end of a nested
				870	f-string: no expression. The top-level error case where we
				871	expect to be at the end of the string but we're at a '}' is
				872	handled later. */
				873	return 0;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	874	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	875
				876	/* We must now be the start of an expression, on a '{'. */
				877	assert(**str == '{');
				878
				879	if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	880	expression, first_token, t, last_token) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	881	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	882	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	883
				884	return 0;
				885
				886	error:
				887	Py_CLEAR(*literal);
				888	return -1;
				889	}
				890
				891	#ifdef NDEBUG
				892	#define ExprList_check_invariants(l)
				893	#else
				894	static void
				895	ExprList_check_invariants(ExprList *l)
				896	{
				897	/* Check our invariants. Make sure this object is "live", and
				898	hasn't been deallocated. */
				899	assert(l->size >= 0);
				900	assert(l->p != NULL);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	901	if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	902	assert(l->data == l->p);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	903	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	904	}
				905	#endif
				906
				907	static void
				908	ExprList_Init(ExprList *l)
				909	{
				910	l->allocated = EXPRLIST_N_CACHED;
				911	l->size = 0;
				912
				913	/* Until we start allocating dynamically, p points to data. */
				914	l->p = l->data;
				915
				916	ExprList_check_invariants(l);
				917	}
				918
				919	static int
				920	ExprList_Append(ExprList *l, expr_ty exp)
				921	{
				922	ExprList_check_invariants(l);
				923	if (l->size >= l->allocated) {
				924	/* We need to alloc (or realloc) the memory. */
				925	Py_ssize_t new_size = l->allocated * 2;
				926
				927	/* See if we've ever allocated anything dynamically. */
				928	if (l->p == l->data) {
				929	Py_ssize_t i;
				930	/* We're still using the cached data. Switch to
				931	alloc-ing. */
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	932	l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	933	if (!l->p) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	934	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	935	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	936	/* Copy the cached data into the new buffer. */
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	937	for (i = 0; i < l->size; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	938	l->p[i] = l->data[i];
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	939	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	940	} else {
				941	/* Just realloc. */
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	942	expr_ty tmp = PyMem_Realloc(l->p, sizeof(expr_ty) new_size);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	943	if (!tmp) {
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	944	PyMem_Free(l->p);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	945	l->p = NULL;
				946	return -1;
				947	}
				948	l->p = tmp;
				949	}
				950
				951	l->allocated = new_size;
				952	assert(l->allocated == 2 * l->size);
				953	}
				954
				955	l->p[l->size++] = exp;
				956
				957	ExprList_check_invariants(l);
				958	return 0;
				959	}
				960
				961	static void
				962	ExprList_Dealloc(ExprList *l)
				963	{
				964	ExprList_check_invariants(l);
				965
				966	/* If there's been an error, or we've never dynamically allocated,
				967	do nothing. */
				968	if (!l->p \|\| l->p == l->data) {
				969	/* Do nothing. */
				970	} else {
				971	/* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou	5193d0a	2020-06-27 21:35:18 +0300	[diff] [blame]	972	PyMem_Free(l->p);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	973	}
				974	l->p = NULL;
				975	l->size = -1;
				976	}
				977
				978	static asdl_seq *
				979	ExprList_Finish(ExprList l, PyArena arena)
				980	{
				981	asdl_seq *seq;
				982
				983	ExprList_check_invariants(l);
				984
				985	/* Allocate the asdl_seq and copy the expressions in to it. */
				986	seq = _Py_asdl_seq_new(l->size, arena);
				987	if (seq) {
				988	Py_ssize_t i;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	989	for (i = 0; i < l->size; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	990	asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	991	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	992	}
				993	ExprList_Dealloc(l);
				994	return seq;
				995	}
				996
				997	#ifdef NDEBUG
				998	#define FstringParser_check_invariants(state)
				999	#else
				1000	static void
				1001	FstringParser_check_invariants(FstringParser *state)
				1002	{
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1003	if (state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1004	assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1005	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1006	ExprList_check_invariants(&state->expr_list);
				1007	}
				1008	#endif
				1009
				1010	void
				1011	_PyPegen_FstringParser_Init(FstringParser *state)
				1012	{
				1013	state->last_str = NULL;
				1014	state->fmode = 0;
				1015	ExprList_Init(&state->expr_list);
				1016	FstringParser_check_invariants(state);
				1017	}
				1018
				1019	void
				1020	_PyPegen_FstringParser_Dealloc(FstringParser *state)
				1021	{
				1022	FstringParser_check_invariants(state);
				1023
				1024	Py_XDECREF(state->last_str);
				1025	ExprList_Dealloc(&state->expr_list);
				1026	}
				1027
				1028	/* Make a Constant node, but decref the PyUnicode object being added. */
				1029	static expr_ty
				1030	make_str_node_and_del(Parser p, PyObject str, Token first_token, Token *last_token)
				1031	{
				1032	PyObject s = str;
				1033	PyObject *kind = NULL;
				1034	*str = NULL;
				1035	assert(PyUnicode_CheckExact(s));
				1036	if (PyArena_AddPyObject(p->arena, s) < 0) {
				1037	Py_DECREF(s);
				1038	return NULL;
				1039	}
				1040	const char* the_str = PyBytes_AsString(first_token->bytes);
				1041	if (the_str && the_str[0] == 'u') {
				1042	kind = _PyPegen_new_identifier(p, "u");
				1043	}
				1044
				1045	if (kind == NULL && PyErr_Occurred()) {
				1046	return NULL;
				1047	}
				1048
				1049	return Constant(s, kind, first_token->lineno, first_token->col_offset,
				1050	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1051
				1052	}
				1053
				1054
				1055	/* Add a non-f-string (that is, a regular literal string). str is
				1056	decref'd. */
				1057	int
				1058	_PyPegen_FstringParser_ConcatAndDel(FstringParser state, PyObject str)
				1059	{
				1060	FstringParser_check_invariants(state);
				1061
				1062	assert(PyUnicode_CheckExact(str));
				1063
				1064	if (PyUnicode_GET_LENGTH(str) == 0) {
				1065	Py_DECREF(str);
				1066	return 0;
				1067	}
				1068
				1069	if (!state->last_str) {
				1070	/* We didn't have a string before, so just remember this one. */
				1071	state->last_str = str;
				1072	} else {
				1073	/* Concatenate this with the previous string. */
				1074	PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1075	if (!state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1076	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1077	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1078	}
				1079	FstringParser_check_invariants(state);
				1080	return 0;
				1081	}
				1082
				1083	/* Parse an f-string. The f-string is in *str to end, with no
				1084	'f' or quotes. */
				1085	int
				1086	_PyPegen_FstringParser_ConcatFstring(Parser p, FstringParser state, const char **str,
				1087	const char *end, int raw, int recurse_lvl,
				1088	Token first_token, Token t, Token *last_token)
				1089	{
				1090	FstringParser_check_invariants(state);
				1091	state->fmode = 1;
				1092
				1093	/* Parse the f-string. */
				1094	while (1) {
				1095	PyObject *literal = NULL;
				1096	PyObject *expr_text = NULL;
				1097	expr_ty expression = NULL;
				1098
				1099	/* If there's a zero length literal in front of the
				1100	expression, literal will be NULL. If we're at the end of
				1101	the f-string, expression will be NULL (unless result == 1,
				1102	see below). */
				1103	int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
				1104	&literal, &expr_text,
				1105	&expression, first_token, t, last_token);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1106	if (result < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1107	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1108	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1109
				1110	/* Add the literal, if any. */
				1111	if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
				1112	Py_XDECREF(expr_text);
				1113	return -1;
				1114	}
				1115	/* Add the expr_text, if any. */
				1116	if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
				1117	return -1;
				1118	}
				1119
				1120	/* We've dealt with the literal and expr_text, their ownership has
				1121	been transferred to the state object. Don't look at them again. */
				1122
				1123	/* See if we should just loop around to get the next literal
				1124	and expression, while ignoring the expression this
				1125	time. This is used for un-doubling braces, as an
				1126	optimization. */
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1127	if (result == 1) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1128	continue;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1129	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1130
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1131	if (!expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1132	/* We're done with this f-string. */
				1133	break;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1134	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1135
				1136	/* We know we have an expression. Convert any existing string
				1137	to a Constant node. */
				1138	if (!state->last_str) {
				1139	/* Do nothing. No previous literal. */
				1140	} else {
				1141	/* Convert the existing last_str literal to a Constant node. */
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1142	expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1143	if (!last_str \|\| ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1144	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1145	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1146	}
				1147
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1148	if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1149	return -1;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1150	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1151	}
				1152
				1153	/* If recurse_lvl is zero, then we must be at the end of the
				1154	string. Otherwise, we must be at a right brace. */
				1155
				1156	if (recurse_lvl == 0 && *str < end-1) {
				1157	RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
				1158	return -1;
				1159	}
				1160	if (recurse_lvl != 0 && **str != '}') {
				1161	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				1162	return -1;
				1163	}
				1164
				1165	FstringParser_check_invariants(state);
				1166	return 0;
				1167	}
				1168
				1169	/* Convert the partial state reflected in last_str and expr_list to an
				1170	expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
				1171	expr_ty
				1172	_PyPegen_FstringParser_Finish(Parser p, FstringParser state, Token* first_token,
				1173	Token *last_token)
				1174	{
				1175	asdl_seq *seq;
				1176
				1177	FstringParser_check_invariants(state);
				1178
				1179	/* If we're just a constant string with no expressions, return
				1180	that. */
				1181	if (!state->fmode) {
				1182	assert(!state->expr_list.size);
				1183	if (!state->last_str) {
				1184	/* Create a zero length string. */
				1185	state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1186	if (!state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1187	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1188	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1189	}
				1190	return make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1191	}
				1192
				1193	/* Create a Constant node out of last_str, if needed. It will be the
				1194	last node in our expression list. */
				1195	if (state->last_str) {
				1196	expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1197	if (!str \|\| ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1198	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1199	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1200	}
				1201	/* This has already been freed. */
				1202	assert(state->last_str == NULL);
				1203
				1204	seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1205	if (!seq) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1206	goto error;
Pablo Galindo	30b59fd	2020-06-15 15:08:00 +0100	[diff] [blame]	1207	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1208
				1209	return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
				1210	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1211
				1212	error:
				1213	_PyPegen_FstringParser_Dealloc(state);
				1214	return NULL;
				1215	}
				1216
				1217	/* Given an f-string (with no 'f' or quotes) that's in *str and ends
				1218	at end, parse it into an expr_ty. Return NULL on error. Adjust
				1219	str to point past the parsed portion. */
				1220	static expr_ty
				1221	fstring_parse(Parser p, const char str, const char end, int raw,
				1222	int recurse_lvl, Token first_token, Token t, Token *last_token)
				1223	{
				1224	FstringParser state;
				1225
				1226	_PyPegen_FstringParser_Init(&state);
				1227	if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
				1228	first_token, t, last_token) < 0) {
				1229	_PyPegen_FstringParser_Dealloc(&state);
				1230	return NULL;
				1231	}
				1232
				1233	return _PyPegen_FstringParser_Finish(p, &state, t, t);
				1234	}