Blame - Parser/string_parser.c - platform/external/python/cpython3

blob: 2c35da590defbbc47df152be00b769999f32b00f [file] [log] [blame]

Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	1	#include <stdbool.h>
				2
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	3	#include <Python.h>
				4
Pablo Galindo	1ed83ad	2020-06-11 17:30:46 +0100	[diff] [blame]	5	#include "tokenizer.h"
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	6	#include "pegen.h"
Pablo Galindo	1ed83ad	2020-06-11 17:30:46 +0100	[diff] [blame]	7	#include "string_parser.h"
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	8
				9	//// STRING HANDLING FUNCTIONS ////
				10
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	11	static int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	12	warn_invalid_escape_sequence(Parser p, unsigned char first_invalid_escape_char, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	13	{
				14	PyObject *msg =
				15	PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
				16	if (msg == NULL) {
				17	return -1;
				18	}
				19	if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	20	t->lineno, NULL, NULL) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	21	if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
				22	/* Replace the DeprecationWarning exception with a SyntaxError
				23	to get a more accurate error report */
				24	PyErr_Clear();
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	25
				26	/* This is needed, in order for the SyntaxError to point to the token t,
				27	since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
				28	error location, if p->known_err_token is not set. */
				29	p->known_err_token = t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	30	RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
				31	}
				32	Py_DECREF(msg);
				33	return -1;
				34	}
				35	Py_DECREF(msg);
				36	return 0;
				37	}
				38
				39	static PyObject *
				40	decode_utf8(const char *sPtr, const char end)
				41	{
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	42	const char *s;
				43	const char *t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	44	t = s = *sPtr;
				45	while (s < end && (*s & 0x80)) {
				46	s++;
				47	}
				48	*sPtr = s;
				49	return PyUnicode_DecodeUTF8(t, s - t, NULL);
				50	}
				51
				52	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	53	decode_unicode_with_escapes(Parser parser, const char s, size_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	54	{
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	55	PyObject *v;
				56	PyObject *u;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	57	char *buf;
				58	char *p;
				59	const char *end;
				60
				61	/* check for integer overflow */
				62	if (len > SIZE_MAX / 6) {
				63	return NULL;
				64	}
				65	/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
				66	"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
				67	u = PyBytes_FromStringAndSize((char )NULL, len 6);
				68	if (u == NULL) {
				69	return NULL;
				70	}
				71	p = buf = PyBytes_AsString(u);
				72	end = s + len;
				73	while (s < end) {
				74	if (*s == '\\') {
				75	p++ = s++;
				76	if (s >= end \|\| *s & 0x80) {
				77	strcpy(p, "u005c");
				78	p += 5;
				79	if (s >= end) {
				80	break;
				81	}
				82	}
				83	}
				84	if (*s & 0x80) {
				85	PyObject *w;
				86	int kind;
				87	void *data;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	88	Py_ssize_t w_len;
				89	Py_ssize_t i;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	90	w = decode_utf8(&s, end);
				91	if (w == NULL) {
				92	Py_DECREF(u);
				93	return NULL;
				94	}
				95	kind = PyUnicode_KIND(w);
				96	data = PyUnicode_DATA(w);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	97	w_len = PyUnicode_GET_LENGTH(w);
				98	for (i = 0; i < w_len; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	99	Py_UCS4 chr = PyUnicode_READ(kind, data, i);
				100	sprintf(p, "\\U%08x", chr);
				101	p += 10;
				102	}
				103	/* Should be impossible to overflow */
				104	assert(p - buf <= PyBytes_GET_SIZE(u));
				105	Py_DECREF(w);
				106	}
				107	else {
				108	p++ = s++;
				109	}
				110	}
				111	len = p - buf;
				112	s = buf;
				113
				114	const char *first_invalid_escape;
				115	v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
				116
				117	if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	118	if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	119	/* We have not decref u before because first_invalid_escape points
				120	inside u. */
				121	Py_XDECREF(u);
				122	Py_DECREF(v);
				123	return NULL;
				124	}
				125	}
				126	Py_XDECREF(u);
				127	return v;
				128	}
				129
				130	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	131	decode_bytes_with_escapes(Parser p, const char s, Py_ssize_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	132	{
				133	const char *first_invalid_escape;
				134	PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
				135	if (result == NULL) {
				136	return NULL;
				137	}
				138
				139	if (first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	140	if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	141	Py_DECREF(result);
				142	return NULL;
				143	}
				144	}
				145	return result;
				146	}
				147
				148	/* s must include the bracketing quote characters, and r, b, u,
				149	&/or f prefixes (if any), and embedded escape sequences (if any).
				150	_PyPegen_parsestr parses it, and sets *result to decoded Python string object.
				151	If the string is an f-string, set fstr and fstrlen to the unparsed
				152	string object. Return 0 if no errors occurred. */
				153	int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	154	_PyPegen_parsestr(Parser p, int bytesmode, int rawmode, PyObject *result,
				155	const char *fstr, Py_ssize_t fstrlen, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	156	{
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	157	const char *s = PyBytes_AsString(t->bytes);
				158	if (s == NULL) {
				159	return -1;
				160	}
				161
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	162	size_t len;
				163	int quote = Py_CHARMASK(*s);
				164	int fmode = 0;
				165	*bytesmode = 0;
				166	*rawmode = 0;
				167	*result = NULL;
				168	*fstr = NULL;
				169	if (Py_ISALPHA(quote)) {
				170	while (!bytesmode \|\| !rawmode) {
				171	if (quote == 'b' \|\| quote == 'B') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	172	quote =(unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	173	*bytesmode = 1;
				174	}
				175	else if (quote == 'u' \|\| quote == 'U') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	176	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	177	}
				178	else if (quote == 'r' \|\| quote == 'R') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	179	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	180	*rawmode = 1;
				181	}
				182	else if (quote == 'f' \|\| quote == 'F') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	183	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	184	fmode = 1;
				185	}
				186	else {
				187	break;
				188	}
				189	}
				190	}
				191
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	192	/* fstrings are only allowed in Python 3.6 and greater */
				193	if (fmode && p->feature_version < 6) {
				194	p->error_indicator = 1;
				195	RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
				196	return -1;
				197	}
				198
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	199	if (fmode && *bytesmode) {
				200	PyErr_BadInternalCall();
				201	return -1;
				202	}
				203	if (quote != '\'' && quote != '\"') {
				204	PyErr_BadInternalCall();
				205	return -1;
				206	}
				207	/* Skip the leading quote char. */
				208	s++;
				209	len = strlen(s);
				210	if (len > INT_MAX) {
				211	PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
				212	return -1;
				213	}
				214	if (s[--len] != quote) {
				215	/* Last quote char must match the first. */
				216	PyErr_BadInternalCall();
				217	return -1;
				218	}
				219	if (len >= 4 && s[0] == quote && s[1] == quote) {
				220	/* A triple quoted string. We've already skipped one quote at
				221	the start and one at the end of the string. Now skip the
				222	two at the start. */
				223	s += 2;
				224	len -= 2;
				225	/* And check that the last two match. */
				226	if (s[--len] != quote \|\| s[--len] != quote) {
				227	PyErr_BadInternalCall();
				228	return -1;
				229	}
				230	}
				231
				232	if (fmode) {
				233	/* Just return the bytes. The caller will parse the resulting
				234	string. */
				235	*fstr = s;
				236	*fstrlen = len;
				237	return 0;
				238	}
				239
				240	/* Not an f-string. */
				241	/* Avoid invoking escape decoding routines if possible. */
				242	rawmode = rawmode \|\| strchr(s, '\\') == NULL;
				243	if (*bytesmode) {
				244	/* Disallow non-ASCII characters. */
				245	const char *ch;
				246	for (ch = s; *ch; ch++) {
				247	if (Py_CHARMASK(*ch) >= 0x80) {
				248	RAISE_SYNTAX_ERROR(
				249	"bytes can only contain ASCII "
				250	"literal characters.");
				251	return -1;
				252	}
				253	}
				254	if (*rawmode) {
				255	*result = PyBytes_FromStringAndSize(s, len);
				256	}
				257	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	258	*result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	259	}
				260	}
				261	else {
				262	if (*rawmode) {
				263	*result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
				264	}
				265	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	266	*result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	267	}
				268	}
				269	return *result == NULL ? -1 : 0;
				270	}
				271
				272
				273
				274	// FSTRING STUFF
				275
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	276	/* Fix locations for the given node and its children.
				277
				278	`parent` is the enclosing node.
				279	`n` is the node which locations are going to be fixed relative to parent.
				280	`expr_str` is the child node's string representation, including braces.
				281	*/
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	282	static bool
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	283	fstring_find_expr_location(Token parent, char expr_str, int p_lines, int p_cols)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	284	{
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	285	*p_lines = 0;
				286	*p_cols = 0;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	287	if (parent && parent->bytes) {
				288	char *parent_str = PyBytes_AsString(parent->bytes);
				289	if (!parent_str) {
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	290	return false;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	291	}
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	292	char *substr = strstr(parent_str, expr_str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	293	if (substr) {
				294	// The following is needed, in order to correctly shift the column
				295	// offset, in the case that (disregarding any whitespace) a newline
				296	// immediately follows the opening curly brace of the fstring expression.
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	297	bool newline_after_brace = 1;
				298	char *start = substr + 1;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	299	while (start && start != '}' && start != '\n') {
				300	if (start != ' ' && start != '\t' && *start != '\f') {
				301	newline_after_brace = 0;
				302	break;
				303	}
				304	start++;
				305	}
				306
				307	// Account for the characters from the last newline character to our
				308	// left until the beginning of substr.
				309	if (!newline_after_brace) {
				310	start = substr;
				311	while (start > parent_str && *start != '\n') {
				312	start--;
				313	}
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	314	*p_cols += (int)(substr - start);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	315	}
				316	/* adjust the start based on the number of newlines encountered
				317	before the f-string expression */
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	318	for (char* p = parent_str; p < substr; p++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	319	if (*p == '\n') {
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	320	(*p_lines)++;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	321	}
				322	}
				323	}
				324	}
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	325	return true;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	326	}
				327
				328
				329	/* Compile this expression in to an expr_ty. Add parens around the
				330	expression, in order to allow leading spaces in the expression. */
				331	static expr_ty
				332	fstring_compile_expr(Parser p, const char expr_start, const char *expr_end,
				333	Token *t)
				334	{
				335	expr_ty expr = NULL;
				336	char *str;
				337	Py_ssize_t len;
				338	const char *s;
				339	expr_ty result = NULL;
				340
				341	assert(expr_end >= expr_start);
				342	assert(*(expr_start-1) == '{');
				343	assert(expr_end == '}' \|\| expr_end == '!' \|\| *expr_end == ':' \|\|
				344	*expr_end == '=');
				345
				346	/* If the substring is all whitespace, it's an error. We need to catch this
				347	here, and not when we call PyParser_SimpleParseStringFlagsFilename,
				348	because turning the expression '' in to '()' would go from being invalid
				349	to valid. */
				350	for (s = expr_start; s != expr_end; s++) {
				351	char c = *s;
				352	/* The Python parser ignores only the following whitespace
				353	characters (\r already is converted to \n). */
				354	if (!(c == ' ' \|\| c == '\t' \|\| c == '\n' \|\| c == '\f')) {
				355	break;
				356	}
				357	}
				358	if (s == expr_end) {
				359	RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
				360	return NULL;
				361	}
				362
				363	len = expr_end - expr_start;
				364	/* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	365	str = PyMem_Malloc(len + 3);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	366	if (str == NULL) {
				367	PyErr_NoMemory();
				368	return NULL;
				369	}
				370
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	371	// The call to fstring_find_expr_location is responsible for finding the column offset
				372	// the generated AST nodes need to be shifted to the right, which is equal to the number
				373	// of the f-string characters before the expression starts. In order to correctly compute
				374	// this offset, strstr gets called in fstring_find_expr_location which only succeeds
				375	// if curly braces appear before and after the f-string expression (exactly like they do
				376	// in the f-string itself), hence the following lines.
				377	str[0] = '{';
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	378	memcpy(str+1, expr_start, len);
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	379	str[len+1] = '}';
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	380	str[len+2] = 0;
				381
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	382	int lines, cols;
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	383	if (!fstring_find_expr_location(t, str, &lines, &cols)) {
				384	PyMem_FREE(str);
				385	return NULL;
				386	}
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	387
Eric V. Smith	0275e04	2020-07-16 12:10:23 -0400	[diff] [blame^]	388	// The parentheses are needed in order to allow for leading whitespace within
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	389	// the f-string expression. This consequently gets parsed as a group (see the
				390	// group rule in python.gram).
				391	str[0] = '(';
				392	str[len+1] = ')';
				393
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	394	struct tok_state* tok = PyTokenizer_FromString(str, 1);
				395	if (tok == NULL) {
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	396	PyMem_Free(str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	397	return NULL;
				398	}
Lysandros Nikolaou	f7b1e46	2020-05-26 03:32:18 +0300	[diff] [blame]	399	Py_INCREF(p->tok->filename);
				400	tok->filename = p->tok->filename;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	401
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	402	Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
				403	NULL, p->arena);
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	404	p2->starting_lineno = t->lineno + lines - 1;
				405	p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	406
				407	expr = _PyPegen_run_parser(p2);
				408
				409	if (expr == NULL) {
				410	goto exit;
				411	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	412	result = expr;
				413
				414	exit:
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	415	PyMem_Free(str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	416	_PyPegen_Parser_Free(p2);
				417	PyTokenizer_Free(tok);
				418	return result;
				419	}
				420
				421	/* Return -1 on error.
				422
				423	Return 0 if we reached the end of the literal.
				424
				425	Return 1 if we haven't reached the end of the literal, but we want
				426	the caller to process the literal up to this point. Used for
				427	doubled braces.
				428	*/
				429	static int
				430	fstring_find_literal(Parser p, const char str, const char end, int raw,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	431	PyObject *literal, int recurse_lvl, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	432	{
				433	/* Get any literal string. It ends when we hit an un-doubled left
				434	brace (which isn't part of a unicode name escape such as
				435	"\N{EULER CONSTANT}"), or the end of the string. */
				436
				437	const char s = str;
				438	const char *literal_start = s;
				439	int result = 0;
				440
				441	assert(*literal == NULL);
				442	while (s < end) {
				443	char ch = *s++;
				444	if (!raw && ch == '\\' && s < end) {
				445	ch = *s++;
				446	if (ch == 'N') {
				447	if (s < end && *s++ == '{') {
				448	while (s < end && *s++ != '}') {
				449	}
				450	continue;
				451	}
				452	break;
				453	}
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	454	if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	455	return -1;
				456	}
				457	}
				458	if (ch == '{' \|\| ch == '}') {
				459	/* Check for doubled braces, but only at the top level. If
				460	we checked at every level, then f'{0:{3}}' would fail
				461	with the two closing braces. */
				462	if (recurse_lvl == 0) {
				463	if (s < end && *s == ch) {
				464	/* We're going to tell the caller that the literal ends
				465	here, but that they should continue scanning. But also
				466	skip over the second brace when we resume scanning. */
				467	*str = s + 1;
				468	result = 1;
				469	goto done;
				470	}
				471
				472	/* Where a single '{' is the start of a new expression, a
				473	single '}' is not allowed. */
				474	if (ch == '}') {
				475	*str = s - 1;
				476	RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
				477	return -1;
				478	}
				479	}
				480	/* We're either at a '{', which means we're starting another
				481	expression; or a '}', which means we're at the end of this
				482	f-string (for a nested format_spec). */
				483	s--;
				484	break;
				485	}
				486	}
				487	*str = s;
				488	assert(s <= end);
				489	assert(s == end \|\| s == '{' \|\| s == '}');
				490	done:
				491	if (literal_start != s) {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	492	if (raw) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	493	*literal = PyUnicode_DecodeUTF8Stateful(literal_start,
				494	s - literal_start,
				495	NULL, NULL);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	496	} else {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	497	*literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	498	s - literal_start, t);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	499	}
				500	if (!*literal) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	501	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	502	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	503	}
				504	return result;
				505	}
				506
				507	/* Forward declaration because parsing is recursive. */
				508	static expr_ty
				509	fstring_parse(Parser p, const char str, const char end, int raw, int recurse_lvl,
				510	Token first_token, Token t, Token *last_token);
				511
/* Parse the f-string at *str, ending at end.  We know *str starts an
   expression (so it must be a '{'). Returns the FormattedValue node, which
   includes the expression, conversion character, format_spec expression, and
   optionally the text of the expression (if = is used).

   Note that I don't do a perfect job here: I don't make sure that a
   closing brace doesn't match an opening paren, for example. It
   doesn't need to error on all invalid expressions, just correctly
   find the end of all valid ones. Any errors inside the expression
   will be caught when we parse it later.

   *expression is set to the expression.  For an '=' "debug" expression,
   *expr_text is set to the debug text (the original text of the expression,
   including the '=' and any whitespace around it, as a string object).  If
   not a debug expression, *expr_text is set to NULL. */
				527	static int
				528	fstring_find_expr(Parser p, const char str, const char end, int raw, int recurse_lvl,
				529	PyObject *expr_text, expr_ty expression, Token *first_token,
				530	Token t, Token last_token)
				531	{
				532	/* Return -1 on error, else 0. */
				533
				534	const char *expr_start;
				535	const char *expr_end;
				536	expr_ty simple_expression;
				537	expr_ty format_spec = NULL; /* Optional format specifier. */
				538	int conversion = -1; /* The conversion char. Use default if not
				539	specified, or !r if using = and no format
				540	spec. */
				541
				542	/* 0 if we're not in a string, else the quote char we're trying to
				543	match (single or double quote). */
				544	char quote_char = 0;
				545
				546	/* If we're inside a string, 1=normal, 3=triple-quoted. */
				547	int string_type = 0;
				548
				549	/* Keep track of nesting level for braces/parens/brackets in
				550	expressions. */
				551	Py_ssize_t nested_depth = 0;
				552	char parenstack[MAXLEVEL];
				553
				554	*expr_text = NULL;
				555
				556	/* Can only nest one level deep. */
				557	if (recurse_lvl >= 2) {
				558	RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
				559	goto error;
				560	}
				561
				562	/* The first char must be a left brace, or we wouldn't have gotten
				563	here. Skip over it. */
				564	assert(**str == '{');
				565	*str += 1;
				566
				567	expr_start = *str;
				568	for (; str < end; (str)++) {
				569	char ch;
				570
				571	/* Loop invariants. */
				572	assert(nested_depth >= 0);
				573	assert(str >= expr_start && str < end);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	574	if (quote_char) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	575	assert(string_type == 1 \|\| string_type == 3);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	576	} else {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	577	assert(string_type == 0);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	578	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	579
				580	ch = **str;
				581	/* Nowhere inside an expression is a backslash allowed. */
				582	if (ch == '\\') {
				583	/* Error: can't include a backslash character, inside
				584	parens or strings or not. */
				585	RAISE_SYNTAX_ERROR(
				586	"f-string expression part "
				587	"cannot include a backslash");
				588	goto error;
				589	}
				590	if (quote_char) {
				591	/* We're inside a string. See if we're at the end. */
				592	/* This code needs to implement the same non-error logic
				593	as tok_get from tokenizer.c, at the letter_quote
				594	label. To actually share that code would be a
				595	nightmare. But, it's unlikely to change and is small,
				596	so duplicate it here. Note we don't need to catch all
				597	of the errors, since they'll be caught when parsing the
				598	expression. We just need to match the non-error
				599	cases. Thus we can ignore \n in single-quoted strings,
				600	for example. Or non-terminated strings. */
				601	if (ch == quote_char) {
				602	/* Does this match the string_type (single or triple
				603	quoted)? */
				604	if (string_type == 3) {
				605	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				606	/* We're at the end of a triple quoted string. */
				607	*str += 2;
				608	string_type = 0;
				609	quote_char = 0;
				610	continue;
				611	}
				612	} else {
				613	/* We're at the end of a normal string. */
				614	quote_char = 0;
				615	string_type = 0;
				616	continue;
				617	}
				618	}
				619	} else if (ch == '\'' \|\| ch == '"') {
				620	/* Is this a triple quoted string? */
				621	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				622	string_type = 3;
				623	*str += 2;
				624	} else {
				625	/* Start of a normal string. */
				626	string_type = 1;
				627	}
				628	/* Start looking for the end of the string. */
				629	quote_char = ch;
				630	} else if (ch == '[' \|\| ch == '{' \|\| ch == '(') {
				631	if (nested_depth >= MAXLEVEL) {
				632	RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
				633	goto error;
				634	}
				635	parenstack[nested_depth] = ch;
				636	nested_depth++;
				637	} else if (ch == '#') {
				638	/* Error: can't include a comment character, inside parens
				639	or not. */
				640	RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
				641	goto error;
				642	} else if (nested_depth == 0 &&
				643	(ch == '!' \|\| ch == ':' \|\| ch == '}' \|\|
				644	ch == '=' \|\| ch == '>' \|\| ch == '<')) {
				645	/* See if there's a next character. */
				646	if (*str+1 < end) {
				647	char next = (str+1);
				648
				649	/* For "!=". since '=' is not an allowed conversion character,
				650	nothing is lost in this test. */
				651	if ((ch == '!' && next == '=') \|\| /* != */
				652	(ch == '=' && next == '=') \|\| /* == */
				653	(ch == '<' && next == '=') \|\| /* <= */
				654	(ch == '>' && next == '=') /* >= */
				655	) {
				656	*str += 1;
				657	continue;
				658	}
				659	/* Don't get out of the loop for these, if they're single
				660	chars (not part of 2-char tokens). If by themselves, they
				661	don't end an expression (unlike say '!'). */
				662	if (ch == '>' \|\| ch == '<') {
				663	continue;
				664	}
				665	}
				666
				667	/* Normal way out of this loop. */
				668	break;
				669	} else if (ch == ']' \|\| ch == '}' \|\| ch == ')') {
				670	if (!nested_depth) {
				671	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
				672	goto error;
				673	}
				674	nested_depth--;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	675	int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	676	if (!((opening == '(' && ch == ')') \|\|
				677	(opening == '[' && ch == ']') \|\|
				678	(opening == '{' && ch == '}')))
				679	{
				680	RAISE_SYNTAX_ERROR(
				681	"f-string: closing parenthesis '%c' "
				682	"does not match opening parenthesis '%c'",
				683	ch, opening);
				684	goto error;
				685	}
				686	} else {
				687	/* Just consume this char and loop around. */
				688	}
				689	}
				690	expr_end = *str;
				691	/* If we leave this loop in a string or with mismatched parens, we
				692	don't care. We'll get a syntax error when compiling the
				693	expression. But, we can produce a better error message, so
				694	let's just do that.*/
				695	if (quote_char) {
				696	RAISE_SYNTAX_ERROR("f-string: unterminated string");
				697	goto error;
				698	}
				699	if (nested_depth) {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	700	int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	701	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
				702	goto error;
				703	}
				704
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	705	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	706	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	707	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	708
				709	/* Compile the expression as soon as possible, so we show errors
				710	related to the expression before errors related to the
				711	conversion or format_spec. */
				712	simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	713	if (!simple_expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	714	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	715	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	716
				717	/* Check for =, which puts the text value of the expression in
				718	expr_text. */
				719	if (**str == '=') {
Shantanu	c116c94	2020-05-27 13:30:38 -0700	[diff] [blame]	720	if (p->feature_version < 8) {
				721	RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
				722	"only supported in Python 3.8 and greater");
				723	goto error;
				724	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	725	*str += 1;
				726
				727	/* Skip over ASCII whitespace. No need to test for end of string
				728	here, since we know there's at least a trailing quote somewhere
				729	ahead. */
				730	while (Py_ISSPACE(**str)) {
				731	*str += 1;
				732	}
				733
				734	/* Set expr_text to the text of the expression. /
				735	expr_text = PyUnicode_FromStringAndSize(expr_start, str-expr_start);
				736	if (!*expr_text) {
				737	goto error;
				738	}
				739	}
				740
				741	/* Check for a conversion char, if present. */
				742	if (**str == '!') {
				743	*str += 1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	744	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	745	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	746	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	747
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	748	conversion = (unsigned char)**str;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	749	*str += 1;
				750
				751	/* Validate the conversion. */
				752	if (!(conversion == 's' \|\| conversion == 'r' \|\| conversion == 'a')) {
				753	RAISE_SYNTAX_ERROR(
				754	"f-string: invalid conversion character: "
				755	"expected 's', 'r', or 'a'");
				756	goto error;
				757	}
				758
				759	}
				760
				761	/* Check for the format spec, if present. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	762	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	763	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	764	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	765	if (**str == ':') {
				766	*str += 1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	767	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	768	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	769	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	770
				771	/* Parse the format spec. */
				772	format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
				773	first_token, t, last_token);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	774	if (!format_spec) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	775	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	776	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	777	}
				778
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	779	if (str >= end \|\| *str != '}') {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	780	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	781	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	782
				783	/* We're at a right brace. Consume it. */
				784	assert(*str < end);
				785	assert(**str == '}');
				786	*str += 1;
				787
				788	/* If we're in = mode (detected by non-NULL expr_text), and have no format
				789	spec and no explicit conversion, set the conversion to 'r'. */
				790	if (*expr_text && format_spec == NULL && conversion == -1) {
				791	conversion = 'r';
				792	}
				793
				794	/* And now create the FormattedValue node that represents this
				795	entire expression with the conversion and format spec. */
				796	//TODO: Fix this
				797	*expression = FormattedValue(simple_expression, conversion,
				798	format_spec, first_token->lineno,
				799	first_token->col_offset, last_token->end_lineno,
				800	last_token->end_col_offset, p->arena);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	801	if (!*expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	802	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	803	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	804
				805	return 0;
				806
				807	unexpected_end_of_string:
				808	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				809	/* Falls through to error. */
				810
				811	error:
				812	Py_XDECREF(*expr_text);
				813	return -1;
				814
				815	}
				816
				817	/* Return -1 on error.
				818
				819	Return 0 if we have a literal (possible zero length) and an
				820	expression (zero length if at the end of the string.
				821
				822	Return 1 if we have a literal, but no expression, and we want the
				823	caller to call us again. This is used to deal with doubled
				824	braces.
				825
				826	When called multiple times on the string 'a{{b{0}c', this function
				827	will return:
				828
				829	1. the literal 'a{' with no expression, and a return value
				830	of 1. Despite the fact that there's no expression, the return
				831	value of 1 means we're not finished yet.
				832
				833	2. the literal 'b' and the expression '0', with a return value of
				834	0. The fact that there's an expression means we're not finished.
				835
				836	3. literal 'c' with no expression and a return value of 0. The
				837	combination of the return value of 0 with no expression means
				838	we're finished.
				839	*/
				840	static int
				841	fstring_find_literal_and_expr(Parser p, const char str, const char end, int raw,
				842	int recurse_lvl, PyObject **literal,
				843	PyObject *expr_text, expr_ty expression,
				844	Token first_token, Token t, Token *last_token)
				845	{
				846	int result;
				847
				848	assert(literal == NULL && expression == NULL);
				849
				850	/* Get any literal string. */
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	851	result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	852	if (result < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	853	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	854	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	855
				856	assert(result == 0 \|\| result == 1);
				857
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	858	if (result == 1) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	859	/* We have a literal, but don't look at the expression. */
				860	return 1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	861	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	862
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	863	if (str >= end \|\| *str == '}') {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	864	/* We're at the end of the string or the end of a nested
				865	f-string: no expression. The top-level error case where we
				866	expect to be at the end of the string but we're at a '}' is
				867	handled later. */
				868	return 0;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	869	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	870
				871	/* We must now be the start of an expression, on a '{'. */
				872	assert(**str == '{');
				873
				874	if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	875	expression, first_token, t, last_token) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	876	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	877	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	878
				879	return 0;
				880
				881	error:
				882	Py_CLEAR(*literal);
				883	return -1;
				884	}
				885
				886	#ifdef NDEBUG
				887	#define ExprList_check_invariants(l)
				888	#else
				889	static void
				890	ExprList_check_invariants(ExprList *l)
				891	{
				892	/* Check our invariants. Make sure this object is "live", and
				893	hasn't been deallocated. */
				894	assert(l->size >= 0);
				895	assert(l->p != NULL);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	896	if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	897	assert(l->data == l->p);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	898	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	899	}
				900	#endif
				901
				902	static void
				903	ExprList_Init(ExprList *l)
				904	{
				905	l->allocated = EXPRLIST_N_CACHED;
				906	l->size = 0;
				907
				908	/* Until we start allocating dynamically, p points to data. */
				909	l->p = l->data;
				910
				911	ExprList_check_invariants(l);
				912	}
				913
				914	static int
				915	ExprList_Append(ExprList *l, expr_ty exp)
				916	{
				917	ExprList_check_invariants(l);
				918	if (l->size >= l->allocated) {
				919	/* We need to alloc (or realloc) the memory. */
				920	Py_ssize_t new_size = l->allocated * 2;
				921
				922	/* See if we've ever allocated anything dynamically. */
				923	if (l->p == l->data) {
				924	Py_ssize_t i;
				925	/* We're still using the cached data. Switch to
				926	alloc-ing. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	927	l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	928	if (!l->p) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	929	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	930	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	931	/* Copy the cached data into the new buffer. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	932	for (i = 0; i < l->size; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	933	l->p[i] = l->data[i];
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	934	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	935	} else {
				936	/* Just realloc. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	937	expr_ty tmp = PyMem_Realloc(l->p, sizeof(expr_ty) new_size);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	938	if (!tmp) {
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	939	PyMem_Free(l->p);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	940	l->p = NULL;
				941	return -1;
				942	}
				943	l->p = tmp;
				944	}
				945
				946	l->allocated = new_size;
				947	assert(l->allocated == 2 * l->size);
				948	}
				949
				950	l->p[l->size++] = exp;
				951
				952	ExprList_check_invariants(l);
				953	return 0;
				954	}
				955
				956	static void
				957	ExprList_Dealloc(ExprList *l)
				958	{
				959	ExprList_check_invariants(l);
				960
				961	/* If there's been an error, or we've never dynamically allocated,
				962	do nothing. */
				963	if (!l->p \|\| l->p == l->data) {
				964	/* Do nothing. */
				965	} else {
				966	/* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	967	PyMem_Free(l->p);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	968	}
				969	l->p = NULL;
				970	l->size = -1;
				971	}
				972
				973	static asdl_seq *
				974	ExprList_Finish(ExprList l, PyArena arena)
				975	{
				976	asdl_seq *seq;
				977
				978	ExprList_check_invariants(l);
				979
				980	/* Allocate the asdl_seq and copy the expressions in to it. */
				981	seq = _Py_asdl_seq_new(l->size, arena);
				982	if (seq) {
				983	Py_ssize_t i;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	984	for (i = 0; i < l->size; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	985	asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	986	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	987	}
				988	ExprList_Dealloc(l);
				989	return seq;
				990	}
				991
				992	#ifdef NDEBUG
				993	#define FstringParser_check_invariants(state)
				994	#else
				995	static void
				996	FstringParser_check_invariants(FstringParser *state)
				997	{
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	998	if (state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	999	assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1000	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1001	ExprList_check_invariants(&state->expr_list);
				1002	}
				1003	#endif
				1004
				1005	void
				1006	_PyPegen_FstringParser_Init(FstringParser *state)
				1007	{
				1008	state->last_str = NULL;
				1009	state->fmode = 0;
				1010	ExprList_Init(&state->expr_list);
				1011	FstringParser_check_invariants(state);
				1012	}
				1013
				1014	void
				1015	_PyPegen_FstringParser_Dealloc(FstringParser *state)
				1016	{
				1017	FstringParser_check_invariants(state);
				1018
				1019	Py_XDECREF(state->last_str);
				1020	ExprList_Dealloc(&state->expr_list);
				1021	}
				1022
				1023	/* Make a Constant node, but decref the PyUnicode object being added. */
				1024	static expr_ty
				1025	make_str_node_and_del(Parser p, PyObject str, Token first_token, Token *last_token)
				1026	{
				1027	PyObject s = str;
				1028	PyObject *kind = NULL;
				1029	*str = NULL;
				1030	assert(PyUnicode_CheckExact(s));
				1031	if (PyArena_AddPyObject(p->arena, s) < 0) {
				1032	Py_DECREF(s);
				1033	return NULL;
				1034	}
				1035	const char* the_str = PyBytes_AsString(first_token->bytes);
				1036	if (the_str && the_str[0] == 'u') {
				1037	kind = _PyPegen_new_identifier(p, "u");
				1038	}
				1039
				1040	if (kind == NULL && PyErr_Occurred()) {
				1041	return NULL;
				1042	}
				1043
				1044	return Constant(s, kind, first_token->lineno, first_token->col_offset,
				1045	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1046
				1047	}
				1048
				1049
				1050	/* Add a non-f-string (that is, a regular literal string). str is
				1051	decref'd. */
				1052	int
				1053	_PyPegen_FstringParser_ConcatAndDel(FstringParser state, PyObject str)
				1054	{
				1055	FstringParser_check_invariants(state);
				1056
				1057	assert(PyUnicode_CheckExact(str));
				1058
				1059	if (PyUnicode_GET_LENGTH(str) == 0) {
				1060	Py_DECREF(str);
				1061	return 0;
				1062	}
				1063
				1064	if (!state->last_str) {
				1065	/* We didn't have a string before, so just remember this one. */
				1066	state->last_str = str;
				1067	} else {
				1068	/* Concatenate this with the previous string. */
				1069	PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1070	if (!state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1071	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1072	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1073	}
				1074	FstringParser_check_invariants(state);
				1075	return 0;
				1076	}
				1077
				1078	/* Parse an f-string. The f-string is in *str to end, with no
				1079	'f' or quotes. */
				1080	int
				1081	_PyPegen_FstringParser_ConcatFstring(Parser p, FstringParser state, const char **str,
				1082	const char *end, int raw, int recurse_lvl,
				1083	Token first_token, Token t, Token *last_token)
				1084	{
				1085	FstringParser_check_invariants(state);
				1086	state->fmode = 1;
				1087
				1088	/* Parse the f-string. */
				1089	while (1) {
				1090	PyObject *literal = NULL;
				1091	PyObject *expr_text = NULL;
				1092	expr_ty expression = NULL;
				1093
				1094	/* If there's a zero length literal in front of the
				1095	expression, literal will be NULL. If we're at the end of
				1096	the f-string, expression will be NULL (unless result == 1,
				1097	see below). */
				1098	int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
				1099	&literal, &expr_text,
				1100	&expression, first_token, t, last_token);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1101	if (result < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1102	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1103	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1104
				1105	/* Add the literal, if any. */
				1106	if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
				1107	Py_XDECREF(expr_text);
				1108	return -1;
				1109	}
				1110	/* Add the expr_text, if any. */
				1111	if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
				1112	return -1;
				1113	}
				1114
				1115	/* We've dealt with the literal and expr_text, their ownership has
				1116	been transferred to the state object. Don't look at them again. */
				1117
				1118	/* See if we should just loop around to get the next literal
				1119	and expression, while ignoring the expression this
				1120	time. This is used for un-doubling braces, as an
				1121	optimization. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1122	if (result == 1) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1123	continue;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1124	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1125
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1126	if (!expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1127	/* We're done with this f-string. */
				1128	break;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1129	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1130
				1131	/* We know we have an expression. Convert any existing string
				1132	to a Constant node. */
				1133	if (!state->last_str) {
				1134	/* Do nothing. No previous literal. */
				1135	} else {
				1136	/* Convert the existing last_str literal to a Constant node. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1137	expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1138	if (!last_str \|\| ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1139	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1140	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1141	}
				1142
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1143	if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1144	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1145	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1146	}
				1147
				1148	/* If recurse_lvl is zero, then we must be at the end of the
				1149	string. Otherwise, we must be at a right brace. */
				1150
				1151	if (recurse_lvl == 0 && *str < end-1) {
				1152	RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
				1153	return -1;
				1154	}
				1155	if (recurse_lvl != 0 && **str != '}') {
				1156	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				1157	return -1;
				1158	}
				1159
				1160	FstringParser_check_invariants(state);
				1161	return 0;
				1162	}
				1163
				1164	/* Convert the partial state reflected in last_str and expr_list to an
				1165	expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
				1166	expr_ty
				1167	_PyPegen_FstringParser_Finish(Parser p, FstringParser state, Token* first_token,
				1168	Token *last_token)
				1169	{
				1170	asdl_seq *seq;
				1171
				1172	FstringParser_check_invariants(state);
				1173
				1174	/* If we're just a constant string with no expressions, return
				1175	that. */
				1176	if (!state->fmode) {
				1177	assert(!state->expr_list.size);
				1178	if (!state->last_str) {
				1179	/* Create a zero length string. */
				1180	state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1181	if (!state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1182	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1183	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1184	}
				1185	return make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1186	}
				1187
				1188	/* Create a Constant node out of last_str, if needed. It will be the
				1189	last node in our expression list. */
				1190	if (state->last_str) {
				1191	expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1192	if (!str \|\| ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1193	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1194	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1195	}
				1196	/* This has already been freed. */
				1197	assert(state->last_str == NULL);
				1198
				1199	seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1200	if (!seq) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1201	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1202	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1203
				1204	return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
				1205	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1206
				1207	error:
				1208	_PyPegen_FstringParser_Dealloc(state);
				1209	return NULL;
				1210	}
				1211
				1212	/* Given an f-string (with no 'f' or quotes) that's in *str and ends
				1213	at end, parse it into an expr_ty. Return NULL on error. Adjust
				1214	str to point past the parsed portion. */
				1215	static expr_ty
				1216	fstring_parse(Parser p, const char str, const char end, int raw,
				1217	int recurse_lvl, Token first_token, Token t, Token *last_token)
				1218	{
				1219	FstringParser state;
				1220
				1221	_PyPegen_FstringParser_Init(&state);
				1222	if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
				1223	first_token, t, last_token) < 0) {
				1224	_PyPegen_FstringParser_Dealloc(&state);
				1225	return NULL;
				1226	}
				1227
				1228	return _PyPegen_FstringParser_Finish(p, &state, t, t);
				1229	}