Blame - Parser/string_parser.c - platform/external/python/cpython3

blob: 9f56ce21d0f206912d9d1fc123d4433c34851336 [file] [log] [blame]

Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1	#include <Python.h>
				2
Pablo Galindo	1ed83ad	2020-06-11 17:30:46 +0100	[diff] [blame]	3	#include "tokenizer.h"
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	4	#include "pegen.h"
Pablo Galindo	1ed83ad	2020-06-11 17:30:46 +0100	[diff] [blame]	5	#include "string_parser.h"
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	6
				7	//// STRING HANDLING FUNCTIONS ////
				8
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	9	static int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	10	warn_invalid_escape_sequence(Parser p, unsigned char first_invalid_escape_char, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	11	{
				12	PyObject *msg =
				13	PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
				14	if (msg == NULL) {
				15	return -1;
				16	}
				17	if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	18	t->lineno, NULL, NULL) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	19	if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
				20	/* Replace the DeprecationWarning exception with a SyntaxError
				21	to get a more accurate error report */
				22	PyErr_Clear();
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	23
				24	/* This is needed, in order for the SyntaxError to point to the token t,
				25	since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
				26	error location, if p->known_err_token is not set. */
				27	p->known_err_token = t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	28	RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
				29	}
				30	Py_DECREF(msg);
				31	return -1;
				32	}
				33	Py_DECREF(msg);
				34	return 0;
				35	}
				36
				37	static PyObject *
				38	decode_utf8(const char *sPtr, const char end)
				39	{
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	40	const char *s;
				41	const char *t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	42	t = s = *sPtr;
				43	while (s < end && (*s & 0x80)) {
				44	s++;
				45	}
				46	*sPtr = s;
				47	return PyUnicode_DecodeUTF8(t, s - t, NULL);
				48	}
				49
				50	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	51	decode_unicode_with_escapes(Parser parser, const char s, size_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	52	{
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	53	PyObject *v;
				54	PyObject *u;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	55	char *buf;
				56	char *p;
				57	const char *end;
				58
				59	/* check for integer overflow */
				60	if (len > SIZE_MAX / 6) {
				61	return NULL;
				62	}
				63	/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
				64	"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
				65	u = PyBytes_FromStringAndSize((char )NULL, len 6);
				66	if (u == NULL) {
				67	return NULL;
				68	}
				69	p = buf = PyBytes_AsString(u);
				70	end = s + len;
				71	while (s < end) {
				72	if (*s == '\\') {
				73	p++ = s++;
				74	if (s >= end \|\| *s & 0x80) {
				75	strcpy(p, "u005c");
				76	p += 5;
				77	if (s >= end) {
				78	break;
				79	}
				80	}
				81	}
				82	if (*s & 0x80) {
				83	PyObject *w;
				84	int kind;
				85	void *data;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	86	Py_ssize_t w_len;
				87	Py_ssize_t i;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	88	w = decode_utf8(&s, end);
				89	if (w == NULL) {
				90	Py_DECREF(u);
				91	return NULL;
				92	}
				93	kind = PyUnicode_KIND(w);
				94	data = PyUnicode_DATA(w);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	95	w_len = PyUnicode_GET_LENGTH(w);
				96	for (i = 0; i < w_len; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	97	Py_UCS4 chr = PyUnicode_READ(kind, data, i);
				98	sprintf(p, "\\U%08x", chr);
				99	p += 10;
				100	}
				101	/* Should be impossible to overflow */
				102	assert(p - buf <= PyBytes_GET_SIZE(u));
				103	Py_DECREF(w);
				104	}
				105	else {
				106	p++ = s++;
				107	}
				108	}
				109	len = p - buf;
				110	s = buf;
				111
				112	const char *first_invalid_escape;
				113	v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
				114
				115	if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	116	if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	117	/* We have not decref u before because first_invalid_escape points
				118	inside u. */
				119	Py_XDECREF(u);
				120	Py_DECREF(v);
				121	return NULL;
				122	}
				123	}
				124	Py_XDECREF(u);
				125	return v;
				126	}
				127
				128	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	129	decode_bytes_with_escapes(Parser p, const char s, Py_ssize_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	130	{
				131	const char *first_invalid_escape;
				132	PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
				133	if (result == NULL) {
				134	return NULL;
				135	}
				136
				137	if (first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	138	if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	139	Py_DECREF(result);
				140	return NULL;
				141	}
				142	}
				143	return result;
				144	}
				145
				146	/* s must include the bracketing quote characters, and r, b, u,
				147	&/or f prefixes (if any), and embedded escape sequences (if any).
				148	_PyPegen_parsestr parses it, and sets *result to decoded Python string object.
				149	If the string is an f-string, set fstr and fstrlen to the unparsed
				150	string object. Return 0 if no errors occurred. */
				151	int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	152	_PyPegen_parsestr(Parser p, int bytesmode, int rawmode, PyObject *result,
				153	const char *fstr, Py_ssize_t fstrlen, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	154	{
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	155	const char *s = PyBytes_AsString(t->bytes);
				156	if (s == NULL) {
				157	return -1;
				158	}
				159
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	160	size_t len;
				161	int quote = Py_CHARMASK(*s);
				162	int fmode = 0;
				163	*bytesmode = 0;
				164	*rawmode = 0;
				165	*result = NULL;
				166	*fstr = NULL;
				167	if (Py_ISALPHA(quote)) {
				168	while (!bytesmode \|\| !rawmode) {
				169	if (quote == 'b' \|\| quote == 'B') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	170	quote =(unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	171	*bytesmode = 1;
				172	}
				173	else if (quote == 'u' \|\| quote == 'U') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	174	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	175	}
				176	else if (quote == 'r' \|\| quote == 'R') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	177	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	178	*rawmode = 1;
				179	}
				180	else if (quote == 'f' \|\| quote == 'F') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	181	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	182	fmode = 1;
				183	}
				184	else {
				185	break;
				186	}
				187	}
				188	}
				189
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	190	/* fstrings are only allowed in Python 3.6 and greater */
				191	if (fmode && p->feature_version < 6) {
				192	p->error_indicator = 1;
				193	RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
				194	return -1;
				195	}
				196
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	197	if (fmode && *bytesmode) {
				198	PyErr_BadInternalCall();
				199	return -1;
				200	}
				201	if (quote != '\'' && quote != '\"') {
				202	PyErr_BadInternalCall();
				203	return -1;
				204	}
				205	/* Skip the leading quote char. */
				206	s++;
				207	len = strlen(s);
				208	if (len > INT_MAX) {
				209	PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
				210	return -1;
				211	}
				212	if (s[--len] != quote) {
				213	/* Last quote char must match the first. */
				214	PyErr_BadInternalCall();
				215	return -1;
				216	}
				217	if (len >= 4 && s[0] == quote && s[1] == quote) {
				218	/* A triple quoted string. We've already skipped one quote at
				219	the start and one at the end of the string. Now skip the
				220	two at the start. */
				221	s += 2;
				222	len -= 2;
				223	/* And check that the last two match. */
				224	if (s[--len] != quote \|\| s[--len] != quote) {
				225	PyErr_BadInternalCall();
				226	return -1;
				227	}
				228	}
				229
				230	if (fmode) {
				231	/* Just return the bytes. The caller will parse the resulting
				232	string. */
				233	*fstr = s;
				234	*fstrlen = len;
				235	return 0;
				236	}
				237
				238	/* Not an f-string. */
				239	/* Avoid invoking escape decoding routines if possible. */
				240	rawmode = rawmode \|\| strchr(s, '\\') == NULL;
				241	if (*bytesmode) {
				242	/* Disallow non-ASCII characters. */
				243	const char *ch;
				244	for (ch = s; *ch; ch++) {
				245	if (Py_CHARMASK(*ch) >= 0x80) {
				246	RAISE_SYNTAX_ERROR(
				247	"bytes can only contain ASCII "
				248	"literal characters.");
				249	return -1;
				250	}
				251	}
				252	if (*rawmode) {
				253	*result = PyBytes_FromStringAndSize(s, len);
				254	}
				255	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	256	*result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	257	}
				258	}
				259	else {
				260	if (*rawmode) {
				261	*result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
				262	}
				263	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	264	*result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	265	}
				266	}
				267	return *result == NULL ? -1 : 0;
				268	}
				269
				270
				271
				272	// FSTRING STUFF
				273
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	274	/* Fix locations for the given node and its children.
				275
				276	`parent` is the enclosing node.
				277	`n` is the node which locations are going to be fixed relative to parent.
				278	`expr_str` is the child node's string representation, including braces.
				279	*/
				280	static void
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	281	fstring_find_expr_location(Token parent, char expr_str, int p_lines, int p_cols)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	282	{
				283	char *substr = NULL;
				284	char *start;
				285	int lines = 0;
				286	int cols = 0;
				287
				288	if (parent && parent->bytes) {
				289	char *parent_str = PyBytes_AsString(parent->bytes);
				290	if (!parent_str) {
				291	return;
				292	}
				293	substr = strstr(parent_str, expr_str);
				294	if (substr) {
				295	// The following is needed, in order to correctly shift the column
				296	// offset, in the case that (disregarding any whitespace) a newline
				297	// immediately follows the opening curly brace of the fstring expression.
				298	int newline_after_brace = 1;
				299	start = substr + 1;
				300	while (start && start != '}' && start != '\n') {
				301	if (start != ' ' && start != '\t' && *start != '\f') {
				302	newline_after_brace = 0;
				303	break;
				304	}
				305	start++;
				306	}
				307
				308	// Account for the characters from the last newline character to our
				309	// left until the beginning of substr.
				310	if (!newline_after_brace) {
				311	start = substr;
				312	while (start > parent_str && *start != '\n') {
				313	start--;
				314	}
				315	cols += (int)(substr - start);
				316	}
				317	/* adjust the start based on the number of newlines encountered
				318	before the f-string expression */
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	319	for (char* p = parent_str; p < substr; p++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	320	if (*p == '\n') {
				321	lines++;
				322	}
				323	}
				324	}
				325	}
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	326	*p_lines = lines;
				327	*p_cols = cols;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	328	}
				329
				330
				331	/* Compile this expression in to an expr_ty. Add parens around the
				332	expression, in order to allow leading spaces in the expression. */
				333	static expr_ty
				334	fstring_compile_expr(Parser p, const char expr_start, const char *expr_end,
				335	Token *t)
				336	{
				337	expr_ty expr = NULL;
				338	char *str;
				339	Py_ssize_t len;
				340	const char *s;
				341	expr_ty result = NULL;
				342
				343	assert(expr_end >= expr_start);
				344	assert(*(expr_start-1) == '{');
				345	assert(expr_end == '}' \|\| expr_end == '!' \|\| *expr_end == ':' \|\|
				346	*expr_end == '=');
				347
				348	/* If the substring is all whitespace, it's an error. We need to catch this
				349	here, and not when we call PyParser_SimpleParseStringFlagsFilename,
				350	because turning the expression '' in to '()' would go from being invalid
				351	to valid. */
				352	for (s = expr_start; s != expr_end; s++) {
				353	char c = *s;
				354	/* The Python parser ignores only the following whitespace
				355	characters (\r already is converted to \n). */
				356	if (!(c == ' ' \|\| c == '\t' \|\| c == '\n' \|\| c == '\f')) {
				357	break;
				358	}
				359	}
				360	if (s == expr_end) {
				361	RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
				362	return NULL;
				363	}
				364
				365	len = expr_end - expr_start;
				366	/* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	367	str = PyMem_Malloc(len + 3);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	368	if (str == NULL) {
				369	PyErr_NoMemory();
				370	return NULL;
				371	}
				372
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	373	// The call to fstring_find_expr_location is responsible for finding the column offset
				374	// the generated AST nodes need to be shifted to the right, which is equal to the number
				375	// of the f-string characters before the expression starts. In order to correctly compute
				376	// this offset, strstr gets called in fstring_find_expr_location which only succeeds
				377	// if curly braces appear before and after the f-string expression (exactly like they do
				378	// in the f-string itself), hence the following lines.
				379	str[0] = '{';
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	380	memcpy(str+1, expr_start, len);
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	381	str[len+1] = '}';
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	382	str[len+2] = 0;
				383
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	384	int lines, cols;
				385	fstring_find_expr_location(t, str, &lines, &cols);
				386
				387	// The parentheses are needed in order to allow for leading whitespace withing
				388	// the f-string expression. This consequently gets parsed as a group (see the
				389	// group rule in python.gram).
				390	str[0] = '(';
				391	str[len+1] = ')';
				392
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	393	struct tok_state* tok = PyTokenizer_FromString(str, 1);
				394	if (tok == NULL) {
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	395	PyMem_Free(str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	396	return NULL;
				397	}
Lysandros Nikolaou	f7b1e46	2020-05-26 03:32:18 +0300	[diff] [blame]	398	Py_INCREF(p->tok->filename);
				399	tok->filename = p->tok->filename;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	400
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	401	Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
				402	NULL, p->arena);
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	403	p2->starting_lineno = t->lineno + lines - 1;
				404	p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	405
				406	expr = _PyPegen_run_parser(p2);
				407
				408	if (expr == NULL) {
				409	goto exit;
				410	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	411	result = expr;
				412
				413	exit:
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	414	PyMem_Free(str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	415	_PyPegen_Parser_Free(p2);
				416	PyTokenizer_Free(tok);
				417	return result;
				418	}
				419
				420	/* Return -1 on error.
				421
				422	Return 0 if we reached the end of the literal.
				423
				424	Return 1 if we haven't reached the end of the literal, but we want
				425	the caller to process the literal up to this point. Used for
				426	doubled braces.
				427	*/
				428	static int
				429	fstring_find_literal(Parser p, const char str, const char end, int raw,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	430	PyObject *literal, int recurse_lvl, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	431	{
				432	/* Get any literal string. It ends when we hit an un-doubled left
				433	brace (which isn't part of a unicode name escape such as
				434	"\N{EULER CONSTANT}"), or the end of the string. */
				435
				436	const char s = str;
				437	const char *literal_start = s;
				438	int result = 0;
				439
				440	assert(*literal == NULL);
				441	while (s < end) {
				442	char ch = *s++;
				443	if (!raw && ch == '\\' && s < end) {
				444	ch = *s++;
				445	if (ch == 'N') {
				446	if (s < end && *s++ == '{') {
				447	while (s < end && *s++ != '}') {
				448	}
				449	continue;
				450	}
				451	break;
				452	}
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	453	if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	454	return -1;
				455	}
				456	}
				457	if (ch == '{' \|\| ch == '}') {
				458	/* Check for doubled braces, but only at the top level. If
				459	we checked at every level, then f'{0:{3}}' would fail
				460	with the two closing braces. */
				461	if (recurse_lvl == 0) {
				462	if (s < end && *s == ch) {
				463	/* We're going to tell the caller that the literal ends
				464	here, but that they should continue scanning. But also
				465	skip over the second brace when we resume scanning. */
				466	*str = s + 1;
				467	result = 1;
				468	goto done;
				469	}
				470
				471	/* Where a single '{' is the start of a new expression, a
				472	single '}' is not allowed. */
				473	if (ch == '}') {
				474	*str = s - 1;
				475	RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
				476	return -1;
				477	}
				478	}
				479	/* We're either at a '{', which means we're starting another
				480	expression; or a '}', which means we're at the end of this
				481	f-string (for a nested format_spec). */
				482	s--;
				483	break;
				484	}
				485	}
				486	*str = s;
				487	assert(s <= end);
				488	assert(s == end \|\| s == '{' \|\| s == '}');
				489	done:
				490	if (literal_start != s) {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	491	if (raw) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	492	*literal = PyUnicode_DecodeUTF8Stateful(literal_start,
				493	s - literal_start,
				494	NULL, NULL);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	495	} else {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	496	*literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	497	s - literal_start, t);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	498	}
				499	if (!*literal) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	500	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	501	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	502	}
				503	return result;
				504	}
				505
				506	/* Forward declaration because parsing is recursive. */
				507	static expr_ty
				508	fstring_parse(Parser p, const char str, const char end, int raw, int recurse_lvl,
				509	Token first_token, Token t, Token *last_token);
				510
				511	/* Parse the f-string at str, ending at end. We know str starts an
				512	expression (so it must be a '{'). Returns the FormattedValue node, which
				513	includes the expression, conversion character, format_spec expression, and
				514	optionally the text of the expression (if = is used).
				515
				516	Note that I don't do a perfect job here: I don't make sure that a
				517	closing brace doesn't match an opening paren, for example. It
				518	doesn't need to error on all invalid expressions, just correctly
				519	find the end of all valid ones. Any errors inside the expression
				520	will be caught when we parse it later.
				521
				522	*expression is set to the expression. For an '=' "debug" expression,
				523	*expr_text is set to the debug text (the original text of the expression,
				524	including the '=' and any whitespace around it, as a string object). If
				525	not a debug expression, expr_text set to NULL. /
				526	static int
				527	fstring_find_expr(Parser p, const char str, const char end, int raw, int recurse_lvl,
				528	PyObject *expr_text, expr_ty expression, Token *first_token,
				529	Token t, Token last_token)
				530	{
				531	/* Return -1 on error, else 0. */
				532
				533	const char *expr_start;
				534	const char *expr_end;
				535	expr_ty simple_expression;
				536	expr_ty format_spec = NULL; /* Optional format specifier. */
				537	int conversion = -1; /* The conversion char. Use default if not
				538	specified, or !r if using = and no format
				539	spec. */
				540
				541	/* 0 if we're not in a string, else the quote char we're trying to
				542	match (single or double quote). */
				543	char quote_char = 0;
				544
				545	/* If we're inside a string, 1=normal, 3=triple-quoted. */
				546	int string_type = 0;
				547
				548	/* Keep track of nesting level for braces/parens/brackets in
				549	expressions. */
				550	Py_ssize_t nested_depth = 0;
				551	char parenstack[MAXLEVEL];
				552
				553	*expr_text = NULL;
				554
				555	/* Can only nest one level deep. */
				556	if (recurse_lvl >= 2) {
				557	RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
				558	goto error;
				559	}
				560
				561	/* The first char must be a left brace, or we wouldn't have gotten
				562	here. Skip over it. */
				563	assert(**str == '{');
				564	*str += 1;
				565
				566	expr_start = *str;
				567	for (; str < end; (str)++) {
				568	char ch;
				569
				570	/* Loop invariants. */
				571	assert(nested_depth >= 0);
				572	assert(str >= expr_start && str < end);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	573	if (quote_char) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	574	assert(string_type == 1 \|\| string_type == 3);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	575	} else {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	576	assert(string_type == 0);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	577	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	578
				579	ch = **str;
				580	/* Nowhere inside an expression is a backslash allowed. */
				581	if (ch == '\\') {
				582	/* Error: can't include a backslash character, inside
				583	parens or strings or not. */
				584	RAISE_SYNTAX_ERROR(
				585	"f-string expression part "
				586	"cannot include a backslash");
				587	goto error;
				588	}
				589	if (quote_char) {
				590	/* We're inside a string. See if we're at the end. */
				591	/* This code needs to implement the same non-error logic
				592	as tok_get from tokenizer.c, at the letter_quote
				593	label. To actually share that code would be a
				594	nightmare. But, it's unlikely to change and is small,
				595	so duplicate it here. Note we don't need to catch all
				596	of the errors, since they'll be caught when parsing the
				597	expression. We just need to match the non-error
				598	cases. Thus we can ignore \n in single-quoted strings,
				599	for example. Or non-terminated strings. */
				600	if (ch == quote_char) {
				601	/* Does this match the string_type (single or triple
				602	quoted)? */
				603	if (string_type == 3) {
				604	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				605	/* We're at the end of a triple quoted string. */
				606	*str += 2;
				607	string_type = 0;
				608	quote_char = 0;
				609	continue;
				610	}
				611	} else {
				612	/* We're at the end of a normal string. */
				613	quote_char = 0;
				614	string_type = 0;
				615	continue;
				616	}
				617	}
				618	} else if (ch == '\'' \|\| ch == '"') {
				619	/* Is this a triple quoted string? */
				620	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				621	string_type = 3;
				622	*str += 2;
				623	} else {
				624	/* Start of a normal string. */
				625	string_type = 1;
				626	}
				627	/* Start looking for the end of the string. */
				628	quote_char = ch;
				629	} else if (ch == '[' \|\| ch == '{' \|\| ch == '(') {
				630	if (nested_depth >= MAXLEVEL) {
				631	RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
				632	goto error;
				633	}
				634	parenstack[nested_depth] = ch;
				635	nested_depth++;
				636	} else if (ch == '#') {
				637	/* Error: can't include a comment character, inside parens
				638	or not. */
				639	RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
				640	goto error;
				641	} else if (nested_depth == 0 &&
				642	(ch == '!' \|\| ch == ':' \|\| ch == '}' \|\|
				643	ch == '=' \|\| ch == '>' \|\| ch == '<')) {
				644	/* See if there's a next character. */
				645	if (*str+1 < end) {
				646	char next = (str+1);
				647
				648	/* For "!=". since '=' is not an allowed conversion character,
				649	nothing is lost in this test. */
				650	if ((ch == '!' && next == '=') \|\| /* != */
				651	(ch == '=' && next == '=') \|\| /* == */
				652	(ch == '<' && next == '=') \|\| /* <= */
				653	(ch == '>' && next == '=') /* >= */
				654	) {
				655	*str += 1;
				656	continue;
				657	}
				658	/* Don't get out of the loop for these, if they're single
				659	chars (not part of 2-char tokens). If by themselves, they
				660	don't end an expression (unlike say '!'). */
				661	if (ch == '>' \|\| ch == '<') {
				662	continue;
				663	}
				664	}
				665
				666	/* Normal way out of this loop. */
				667	break;
				668	} else if (ch == ']' \|\| ch == '}' \|\| ch == ')') {
				669	if (!nested_depth) {
				670	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
				671	goto error;
				672	}
				673	nested_depth--;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	674	int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	675	if (!((opening == '(' && ch == ')') \|\|
				676	(opening == '[' && ch == ']') \|\|
				677	(opening == '{' && ch == '}')))
				678	{
				679	RAISE_SYNTAX_ERROR(
				680	"f-string: closing parenthesis '%c' "
				681	"does not match opening parenthesis '%c'",
				682	ch, opening);
				683	goto error;
				684	}
				685	} else {
				686	/* Just consume this char and loop around. */
				687	}
				688	}
				689	expr_end = *str;
				690	/* If we leave this loop in a string or with mismatched parens, we
				691	don't care. We'll get a syntax error when compiling the
				692	expression. But, we can produce a better error message, so
				693	let's just do that.*/
				694	if (quote_char) {
				695	RAISE_SYNTAX_ERROR("f-string: unterminated string");
				696	goto error;
				697	}
				698	if (nested_depth) {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	699	int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	700	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
				701	goto error;
				702	}
				703
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	704	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	705	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	706	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	707
				708	/* Compile the expression as soon as possible, so we show errors
				709	related to the expression before errors related to the
				710	conversion or format_spec. */
				711	simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	712	if (!simple_expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	713	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	714	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	715
				716	/* Check for =, which puts the text value of the expression in
				717	expr_text. */
				718	if (**str == '=') {
Shantanu	c116c94	2020-05-27 13:30:38 -0700	[diff] [blame]	719	if (p->feature_version < 8) {
				720	RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
				721	"only supported in Python 3.8 and greater");
				722	goto error;
				723	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	724	*str += 1;
				725
				726	/* Skip over ASCII whitespace. No need to test for end of string
				727	here, since we know there's at least a trailing quote somewhere
				728	ahead. */
				729	while (Py_ISSPACE(**str)) {
				730	*str += 1;
				731	}
				732
				733	/* Set expr_text to the text of the expression. /
				734	expr_text = PyUnicode_FromStringAndSize(expr_start, str-expr_start);
				735	if (!*expr_text) {
				736	goto error;
				737	}
				738	}
				739
				740	/* Check for a conversion char, if present. */
				741	if (**str == '!') {
				742	*str += 1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	743	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	744	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	745	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	746
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	747	conversion = (unsigned char)**str;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	748	*str += 1;
				749
				750	/* Validate the conversion. */
				751	if (!(conversion == 's' \|\| conversion == 'r' \|\| conversion == 'a')) {
				752	RAISE_SYNTAX_ERROR(
				753	"f-string: invalid conversion character: "
				754	"expected 's', 'r', or 'a'");
				755	goto error;
				756	}
				757
				758	}
				759
				760	/* Check for the format spec, if present. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	761	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	762	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	763	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	764	if (**str == ':') {
				765	*str += 1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	766	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	767	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	768	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	769
				770	/* Parse the format spec. */
				771	format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
				772	first_token, t, last_token);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	773	if (!format_spec) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	774	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	775	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	776	}
				777
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	778	if (str >= end \|\| *str != '}') {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	779	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	780	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	781
				782	/* We're at a right brace. Consume it. */
				783	assert(*str < end);
				784	assert(**str == '}');
				785	*str += 1;
				786
				787	/* If we're in = mode (detected by non-NULL expr_text), and have no format
				788	spec and no explicit conversion, set the conversion to 'r'. */
				789	if (*expr_text && format_spec == NULL && conversion == -1) {
				790	conversion = 'r';
				791	}
				792
				793	/* And now create the FormattedValue node that represents this
				794	entire expression with the conversion and format spec. */
				795	//TODO: Fix this
				796	*expression = FormattedValue(simple_expression, conversion,
				797	format_spec, first_token->lineno,
				798	first_token->col_offset, last_token->end_lineno,
				799	last_token->end_col_offset, p->arena);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	800	if (!*expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	801	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	802	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	803
				804	return 0;
				805
				806	unexpected_end_of_string:
				807	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				808	/* Falls through to error. */
				809
				810	error:
				811	Py_XDECREF(*expr_text);
				812	return -1;
				813
				814	}
				815
				816	/* Return -1 on error.
				817
				818	Return 0 if we have a literal (possible zero length) and an
				819	expression (zero length if at the end of the string.
				820
				821	Return 1 if we have a literal, but no expression, and we want the
				822	caller to call us again. This is used to deal with doubled
				823	braces.
				824
				825	When called multiple times on the string 'a{{b{0}c', this function
				826	will return:
				827
				828	1. the literal 'a{' with no expression, and a return value
				829	of 1. Despite the fact that there's no expression, the return
				830	value of 1 means we're not finished yet.
				831
				832	2. the literal 'b' and the expression '0', with a return value of
				833	0. The fact that there's an expression means we're not finished.
				834
				835	3. literal 'c' with no expression and a return value of 0. The
				836	combination of the return value of 0 with no expression means
				837	we're finished.
				838	*/
				839	static int
				840	fstring_find_literal_and_expr(Parser p, const char str, const char end, int raw,
				841	int recurse_lvl, PyObject **literal,
				842	PyObject *expr_text, expr_ty expression,
				843	Token first_token, Token t, Token *last_token)
				844	{
				845	int result;
				846
				847	assert(literal == NULL && expression == NULL);
				848
				849	/* Get any literal string. */
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	850	result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	851	if (result < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	852	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	853	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	854
				855	assert(result == 0 \|\| result == 1);
				856
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	857	if (result == 1) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	858	/* We have a literal, but don't look at the expression. */
				859	return 1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	860	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	861
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	862	if (str >= end \|\| *str == '}') {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	863	/* We're at the end of the string or the end of a nested
				864	f-string: no expression. The top-level error case where we
				865	expect to be at the end of the string but we're at a '}' is
				866	handled later. */
				867	return 0;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	868	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	869
				870	/* We must now be the start of an expression, on a '{'. */
				871	assert(**str == '{');
				872
				873	if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	874	expression, first_token, t, last_token) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	875	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	876	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	877
				878	return 0;
				879
				880	error:
				881	Py_CLEAR(*literal);
				882	return -1;
				883	}
				884
				885	#ifdef NDEBUG
				886	#define ExprList_check_invariants(l)
				887	#else
				888	static void
				889	ExprList_check_invariants(ExprList *l)
				890	{
				891	/* Check our invariants. Make sure this object is "live", and
				892	hasn't been deallocated. */
				893	assert(l->size >= 0);
				894	assert(l->p != NULL);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	895	if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	896	assert(l->data == l->p);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	897	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	898	}
				899	#endif
				900
				901	static void
				902	ExprList_Init(ExprList *l)
				903	{
				904	l->allocated = EXPRLIST_N_CACHED;
				905	l->size = 0;
				906
				907	/* Until we start allocating dynamically, p points to data. */
				908	l->p = l->data;
				909
				910	ExprList_check_invariants(l);
				911	}
				912
				913	static int
				914	ExprList_Append(ExprList *l, expr_ty exp)
				915	{
				916	ExprList_check_invariants(l);
				917	if (l->size >= l->allocated) {
				918	/* We need to alloc (or realloc) the memory. */
				919	Py_ssize_t new_size = l->allocated * 2;
				920
				921	/* See if we've ever allocated anything dynamically. */
				922	if (l->p == l->data) {
				923	Py_ssize_t i;
				924	/* We're still using the cached data. Switch to
				925	alloc-ing. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	926	l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	927	if (!l->p) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	928	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	929	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	930	/* Copy the cached data into the new buffer. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	931	for (i = 0; i < l->size; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	932	l->p[i] = l->data[i];
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	933	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	934	} else {
				935	/* Just realloc. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	936	expr_ty tmp = PyMem_Realloc(l->p, sizeof(expr_ty) new_size);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	937	if (!tmp) {
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	938	PyMem_Free(l->p);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	939	l->p = NULL;
				940	return -1;
				941	}
				942	l->p = tmp;
				943	}
				944
				945	l->allocated = new_size;
				946	assert(l->allocated == 2 * l->size);
				947	}
				948
				949	l->p[l->size++] = exp;
				950
				951	ExprList_check_invariants(l);
				952	return 0;
				953	}
				954
				955	static void
				956	ExprList_Dealloc(ExprList *l)
				957	{
				958	ExprList_check_invariants(l);
				959
				960	/* If there's been an error, or we've never dynamically allocated,
				961	do nothing. */
				962	if (!l->p \|\| l->p == l->data) {
				963	/* Do nothing. */
				964	} else {
				965	/* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	966	PyMem_Free(l->p);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	967	}
				968	l->p = NULL;
				969	l->size = -1;
				970	}
				971
				972	static asdl_seq *
				973	ExprList_Finish(ExprList l, PyArena arena)
				974	{
				975	asdl_seq *seq;
				976
				977	ExprList_check_invariants(l);
				978
				979	/* Allocate the asdl_seq and copy the expressions in to it. */
				980	seq = _Py_asdl_seq_new(l->size, arena);
				981	if (seq) {
				982	Py_ssize_t i;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	983	for (i = 0; i < l->size; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	984	asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	985	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	986	}
				987	ExprList_Dealloc(l);
				988	return seq;
				989	}
				990
				991	#ifdef NDEBUG
				992	#define FstringParser_check_invariants(state)
				993	#else
				994	static void
				995	FstringParser_check_invariants(FstringParser *state)
				996	{
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	997	if (state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	998	assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	999	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1000	ExprList_check_invariants(&state->expr_list);
				1001	}
				1002	#endif
				1003
				1004	void
				1005	_PyPegen_FstringParser_Init(FstringParser *state)
				1006	{
				1007	state->last_str = NULL;
				1008	state->fmode = 0;
				1009	ExprList_Init(&state->expr_list);
				1010	FstringParser_check_invariants(state);
				1011	}
				1012
				1013	void
				1014	_PyPegen_FstringParser_Dealloc(FstringParser *state)
				1015	{
				1016	FstringParser_check_invariants(state);
				1017
				1018	Py_XDECREF(state->last_str);
				1019	ExprList_Dealloc(&state->expr_list);
				1020	}
				1021
				1022	/* Make a Constant node, but decref the PyUnicode object being added. */
				1023	static expr_ty
				1024	make_str_node_and_del(Parser p, PyObject str, Token first_token, Token *last_token)
				1025	{
				1026	PyObject s = str;
				1027	PyObject *kind = NULL;
				1028	*str = NULL;
				1029	assert(PyUnicode_CheckExact(s));
				1030	if (PyArena_AddPyObject(p->arena, s) < 0) {
				1031	Py_DECREF(s);
				1032	return NULL;
				1033	}
				1034	const char* the_str = PyBytes_AsString(first_token->bytes);
				1035	if (the_str && the_str[0] == 'u') {
				1036	kind = _PyPegen_new_identifier(p, "u");
				1037	}
				1038
				1039	if (kind == NULL && PyErr_Occurred()) {
				1040	return NULL;
				1041	}
				1042
				1043	return Constant(s, kind, first_token->lineno, first_token->col_offset,
				1044	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1045
				1046	}
				1047
				1048
				1049	/* Add a non-f-string (that is, a regular literal string). str is
				1050	decref'd. */
				1051	int
				1052	_PyPegen_FstringParser_ConcatAndDel(FstringParser state, PyObject str)
				1053	{
				1054	FstringParser_check_invariants(state);
				1055
				1056	assert(PyUnicode_CheckExact(str));
				1057
				1058	if (PyUnicode_GET_LENGTH(str) == 0) {
				1059	Py_DECREF(str);
				1060	return 0;
				1061	}
				1062
				1063	if (!state->last_str) {
				1064	/* We didn't have a string before, so just remember this one. */
				1065	state->last_str = str;
				1066	} else {
				1067	/* Concatenate this with the previous string. */
				1068	PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1069	if (!state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1070	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1071	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1072	}
				1073	FstringParser_check_invariants(state);
				1074	return 0;
				1075	}
				1076
				1077	/* Parse an f-string. The f-string is in *str to end, with no
				1078	'f' or quotes. */
				1079	int
				1080	_PyPegen_FstringParser_ConcatFstring(Parser p, FstringParser state, const char **str,
				1081	const char *end, int raw, int recurse_lvl,
				1082	Token first_token, Token t, Token *last_token)
				1083	{
				1084	FstringParser_check_invariants(state);
				1085	state->fmode = 1;
				1086
				1087	/* Parse the f-string. */
				1088	while (1) {
				1089	PyObject *literal = NULL;
				1090	PyObject *expr_text = NULL;
				1091	expr_ty expression = NULL;
				1092
				1093	/* If there's a zero length literal in front of the
				1094	expression, literal will be NULL. If we're at the end of
				1095	the f-string, expression will be NULL (unless result == 1,
				1096	see below). */
				1097	int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
				1098	&literal, &expr_text,
				1099	&expression, first_token, t, last_token);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1100	if (result < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1101	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1102	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1103
				1104	/* Add the literal, if any. */
				1105	if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
				1106	Py_XDECREF(expr_text);
				1107	return -1;
				1108	}
				1109	/* Add the expr_text, if any. */
				1110	if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
				1111	return -1;
				1112	}
				1113
				1114	/* We've dealt with the literal and expr_text, their ownership has
				1115	been transferred to the state object. Don't look at them again. */
				1116
				1117	/* See if we should just loop around to get the next literal
				1118	and expression, while ignoring the expression this
				1119	time. This is used for un-doubling braces, as an
				1120	optimization. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1121	if (result == 1) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1122	continue;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1123	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1124
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1125	if (!expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1126	/* We're done with this f-string. */
				1127	break;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1128	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1129
				1130	/* We know we have an expression. Convert any existing string
				1131	to a Constant node. */
				1132	if (!state->last_str) {
				1133	/* Do nothing. No previous literal. */
				1134	} else {
				1135	/* Convert the existing last_str literal to a Constant node. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1136	expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1137	if (!last_str \|\| ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1138	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1139	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1140	}
				1141
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1142	if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1143	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1144	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1145	}
				1146
				1147	/* If recurse_lvl is zero, then we must be at the end of the
				1148	string. Otherwise, we must be at a right brace. */
				1149
				1150	if (recurse_lvl == 0 && *str < end-1) {
				1151	RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
				1152	return -1;
				1153	}
				1154	if (recurse_lvl != 0 && **str != '}') {
				1155	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				1156	return -1;
				1157	}
				1158
				1159	FstringParser_check_invariants(state);
				1160	return 0;
				1161	}
				1162
				1163	/* Convert the partial state reflected in last_str and expr_list to an
				1164	expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
				1165	expr_ty
				1166	_PyPegen_FstringParser_Finish(Parser p, FstringParser state, Token* first_token,
				1167	Token *last_token)
				1168	{
				1169	asdl_seq *seq;
				1170
				1171	FstringParser_check_invariants(state);
				1172
				1173	/* If we're just a constant string with no expressions, return
				1174	that. */
				1175	if (!state->fmode) {
				1176	assert(!state->expr_list.size);
				1177	if (!state->last_str) {
				1178	/* Create a zero length string. */
				1179	state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1180	if (!state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1181	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1182	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1183	}
				1184	return make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1185	}
				1186
				1187	/* Create a Constant node out of last_str, if needed. It will be the
				1188	last node in our expression list. */
				1189	if (state->last_str) {
				1190	expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1191	if (!str \|\| ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1192	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1193	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1194	}
				1195	/* This has already been freed. */
				1196	assert(state->last_str == NULL);
				1197
				1198	seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1199	if (!seq) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1200	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1201	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1202
				1203	return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
				1204	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1205
				1206	error:
				1207	_PyPegen_FstringParser_Dealloc(state);
				1208	return NULL;
				1209	}
				1210
				1211	/* Given an f-string (with no 'f' or quotes) that's in *str and ends
				1212	at end, parse it into an expr_ty. Return NULL on error. Adjust
				1213	str to point past the parsed portion. */
				1214	static expr_ty
				1215	fstring_parse(Parser p, const char str, const char end, int raw,
				1216	int recurse_lvl, Token first_token, Token t, Token *last_token)
				1217	{
				1218	FstringParser state;
				1219
				1220	_PyPegen_FstringParser_Init(&state);
				1221	if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
				1222	first_token, t, last_token) < 0) {
				1223	_PyPegen_FstringParser_Dealloc(&state);
				1224	return NULL;
				1225	}
				1226
				1227	return _PyPegen_FstringParser_Finish(p, &state, t, t);
				1228	}