Blame - Parser/string_parser.c - platform/external/python/cpython3

blob: 8f6433dbcec1313745b6b0a52ee25d9dc5f499d3 [file] [log] [blame]

Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	1	#include <stdbool.h>
				2
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	3	#include <Python.h>
				4
Pablo Galindo	1ed83ad	2020-06-11 17:30:46 +0100	[diff] [blame]	5	#include "tokenizer.h"
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	6	#include "pegen.h"
Pablo Galindo	1ed83ad	2020-06-11 17:30:46 +0100	[diff] [blame]	7	#include "string_parser.h"
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	8
				9	//// STRING HANDLING FUNCTIONS ////
				10
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	11	static int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	12	warn_invalid_escape_sequence(Parser p, unsigned char first_invalid_escape_char, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	13	{
				14	PyObject *msg =
				15	PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
				16	if (msg == NULL) {
				17	return -1;
				18	}
				19	if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	20	t->lineno, NULL, NULL) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	21	if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
				22	/* Replace the DeprecationWarning exception with a SyntaxError
				23	to get a more accurate error report */
				24	PyErr_Clear();
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	25
				26	/* This is needed, in order for the SyntaxError to point to the token t,
				27	since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
				28	error location, if p->known_err_token is not set. */
				29	p->known_err_token = t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	30	RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
				31	}
				32	Py_DECREF(msg);
				33	return -1;
				34	}
				35	Py_DECREF(msg);
				36	return 0;
				37	}
				38
				39	static PyObject *
				40	decode_utf8(const char *sPtr, const char end)
				41	{
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	42	const char *s;
				43	const char *t;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	44	t = s = *sPtr;
				45	while (s < end && (*s & 0x80)) {
				46	s++;
				47	}
				48	*sPtr = s;
				49	return PyUnicode_DecodeUTF8(t, s - t, NULL);
				50	}
				51
				52	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	53	decode_unicode_with_escapes(Parser parser, const char s, size_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	54	{
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	55	PyObject *v;
				56	PyObject *u;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	57	char *buf;
				58	char *p;
				59	const char *end;
				60
				61	/* check for integer overflow */
				62	if (len > SIZE_MAX / 6) {
				63	return NULL;
				64	}
				65	/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
				66	"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
				67	u = PyBytes_FromStringAndSize((char )NULL, len 6);
				68	if (u == NULL) {
				69	return NULL;
				70	}
				71	p = buf = PyBytes_AsString(u);
Christian Heimes	07f2ade	2020-11-18 16:38:53 +0100	[diff] [blame^]	72	if (p == NULL) {
				73	return NULL;
				74	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	75	end = s + len;
				76	while (s < end) {
				77	if (*s == '\\') {
				78	p++ = s++;
				79	if (s >= end \|\| *s & 0x80) {
				80	strcpy(p, "u005c");
				81	p += 5;
				82	if (s >= end) {
				83	break;
				84	}
				85	}
				86	}
				87	if (*s & 0x80) {
				88	PyObject *w;
				89	int kind;
				90	void *data;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	91	Py_ssize_t w_len;
				92	Py_ssize_t i;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	93	w = decode_utf8(&s, end);
				94	if (w == NULL) {
				95	Py_DECREF(u);
				96	return NULL;
				97	}
				98	kind = PyUnicode_KIND(w);
				99	data = PyUnicode_DATA(w);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	100	w_len = PyUnicode_GET_LENGTH(w);
				101	for (i = 0; i < w_len; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	102	Py_UCS4 chr = PyUnicode_READ(kind, data, i);
				103	sprintf(p, "\\U%08x", chr);
				104	p += 10;
				105	}
				106	/* Should be impossible to overflow */
				107	assert(p - buf <= PyBytes_GET_SIZE(u));
				108	Py_DECREF(w);
				109	}
				110	else {
				111	p++ = s++;
				112	}
				113	}
				114	len = p - buf;
				115	s = buf;
				116
				117	const char *first_invalid_escape;
				118	v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
				119
				120	if (v != NULL && first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	121	if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	122	/* We have not decref u before because first_invalid_escape points
				123	inside u. */
				124	Py_XDECREF(u);
				125	Py_DECREF(v);
				126	return NULL;
				127	}
				128	}
				129	Py_XDECREF(u);
				130	return v;
				131	}
				132
				133	static PyObject *
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	134	decode_bytes_with_escapes(Parser p, const char s, Py_ssize_t len, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	135	{
				136	const char *first_invalid_escape;
				137	PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
				138	if (result == NULL) {
				139	return NULL;
				140	}
				141
				142	if (first_invalid_escape != NULL) {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	143	if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	144	Py_DECREF(result);
				145	return NULL;
				146	}
				147	}
				148	return result;
				149	}
				150
				151	/* s must include the bracketing quote characters, and r, b, u,
				152	&/or f prefixes (if any), and embedded escape sequences (if any).
				153	_PyPegen_parsestr parses it, and sets *result to decoded Python string object.
				154	If the string is an f-string, set fstr and fstrlen to the unparsed
				155	string object. Return 0 if no errors occurred. */
				156	int
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	157	_PyPegen_parsestr(Parser p, int bytesmode, int rawmode, PyObject *result,
				158	const char *fstr, Py_ssize_t fstrlen, Token *t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	159	{
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	160	const char *s = PyBytes_AsString(t->bytes);
				161	if (s == NULL) {
				162	return -1;
				163	}
				164
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	165	size_t len;
				166	int quote = Py_CHARMASK(*s);
				167	int fmode = 0;
				168	*bytesmode = 0;
				169	*rawmode = 0;
				170	*result = NULL;
				171	*fstr = NULL;
				172	if (Py_ISALPHA(quote)) {
				173	while (!bytesmode \|\| !rawmode) {
				174	if (quote == 'b' \|\| quote == 'B') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	175	quote =(unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	176	*bytesmode = 1;
				177	}
				178	else if (quote == 'u' \|\| quote == 'U') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	179	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	180	}
				181	else if (quote == 'r' \|\| quote == 'R') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	182	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	183	*rawmode = 1;
				184	}
				185	else if (quote == 'f' \|\| quote == 'F') {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	186	quote = (unsigned char)*++s;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	187	fmode = 1;
				188	}
				189	else {
				190	break;
				191	}
				192	}
				193	}
				194
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	195	/* fstrings are only allowed in Python 3.6 and greater */
				196	if (fmode && p->feature_version < 6) {
				197	p->error_indicator = 1;
				198	RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
				199	return -1;
				200	}
				201
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	202	if (fmode && *bytesmode) {
				203	PyErr_BadInternalCall();
				204	return -1;
				205	}
				206	if (quote != '\'' && quote != '\"') {
				207	PyErr_BadInternalCall();
				208	return -1;
				209	}
				210	/* Skip the leading quote char. */
				211	s++;
				212	len = strlen(s);
				213	if (len > INT_MAX) {
				214	PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
				215	return -1;
				216	}
				217	if (s[--len] != quote) {
				218	/* Last quote char must match the first. */
				219	PyErr_BadInternalCall();
				220	return -1;
				221	}
				222	if (len >= 4 && s[0] == quote && s[1] == quote) {
				223	/* A triple quoted string. We've already skipped one quote at
				224	the start and one at the end of the string. Now skip the
				225	two at the start. */
				226	s += 2;
				227	len -= 2;
				228	/* And check that the last two match. */
				229	if (s[--len] != quote \|\| s[--len] != quote) {
				230	PyErr_BadInternalCall();
				231	return -1;
				232	}
				233	}
				234
				235	if (fmode) {
				236	/* Just return the bytes. The caller will parse the resulting
				237	string. */
				238	*fstr = s;
				239	*fstrlen = len;
				240	return 0;
				241	}
				242
				243	/* Not an f-string. */
				244	/* Avoid invoking escape decoding routines if possible. */
				245	rawmode = rawmode \|\| strchr(s, '\\') == NULL;
				246	if (*bytesmode) {
				247	/* Disallow non-ASCII characters. */
				248	const char *ch;
				249	for (ch = s; *ch; ch++) {
				250	if (Py_CHARMASK(*ch) >= 0x80) {
				251	RAISE_SYNTAX_ERROR(
				252	"bytes can only contain ASCII "
				253	"literal characters.");
				254	return -1;
				255	}
				256	}
				257	if (*rawmode) {
				258	*result = PyBytes_FromStringAndSize(s, len);
				259	}
				260	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	261	*result = decode_bytes_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	262	}
				263	}
				264	else {
				265	if (*rawmode) {
				266	*result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
				267	}
				268	else {
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	269	*result = decode_unicode_with_escapes(p, s, len, t);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	270	}
				271	}
				272	return *result == NULL ? -1 : 0;
				273	}
				274
				275
				276
				277	// FSTRING STUFF
				278
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	279	/* Fix locations for the given node and its children.
				280
				281	`parent` is the enclosing node.
				282	`n` is the node which locations are going to be fixed relative to parent.
				283	`expr_str` is the child node's string representation, including braces.
				284	*/
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	285	static bool
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	286	fstring_find_expr_location(Token parent, char expr_str, int p_lines, int p_cols)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	287	{
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	288	*p_lines = 0;
				289	*p_cols = 0;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	290	if (parent && parent->bytes) {
				291	char *parent_str = PyBytes_AsString(parent->bytes);
				292	if (!parent_str) {
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	293	return false;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	294	}
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	295	char *substr = strstr(parent_str, expr_str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	296	if (substr) {
				297	// The following is needed, in order to correctly shift the column
				298	// offset, in the case that (disregarding any whitespace) a newline
				299	// immediately follows the opening curly brace of the fstring expression.
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	300	bool newline_after_brace = 1;
				301	char *start = substr + 1;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	302	while (start && start != '}' && start != '\n') {
				303	if (start != ' ' && start != '\t' && *start != '\f') {
				304	newline_after_brace = 0;
				305	break;
				306	}
				307	start++;
				308	}
				309
				310	// Account for the characters from the last newline character to our
				311	// left until the beginning of substr.
				312	if (!newline_after_brace) {
				313	start = substr;
				314	while (start > parent_str && *start != '\n') {
				315	start--;
				316	}
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	317	*p_cols += (int)(substr - start);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	318	}
				319	/* adjust the start based on the number of newlines encountered
				320	before the f-string expression */
Pablo Galindo	0b7829e	2020-04-23 03:24:25 +0100	[diff] [blame]	321	for (char* p = parent_str; p < substr; p++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	322	if (*p == '\n') {
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	323	(*p_lines)++;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	324	}
				325	}
				326	}
				327	}
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	328	return true;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	329	}
				330
				331
				332	/* Compile this expression in to an expr_ty. Add parens around the
				333	expression, in order to allow leading spaces in the expression. */
				334	static expr_ty
				335	fstring_compile_expr(Parser p, const char expr_start, const char *expr_end,
				336	Token *t)
				337	{
				338	expr_ty expr = NULL;
				339	char *str;
				340	Py_ssize_t len;
				341	const char *s;
				342	expr_ty result = NULL;
				343
				344	assert(expr_end >= expr_start);
				345	assert(*(expr_start-1) == '{');
				346	assert(expr_end == '}' \|\| expr_end == '!' \|\| *expr_end == ':' \|\|
				347	*expr_end == '=');
				348
				349	/* If the substring is all whitespace, it's an error. We need to catch this
				350	here, and not when we call PyParser_SimpleParseStringFlagsFilename,
				351	because turning the expression '' in to '()' would go from being invalid
				352	to valid. */
				353	for (s = expr_start; s != expr_end; s++) {
				354	char c = *s;
				355	/* The Python parser ignores only the following whitespace
				356	characters (\r already is converted to \n). */
				357	if (!(c == ' ' \|\| c == '\t' \|\| c == '\n' \|\| c == '\f')) {
				358	break;
				359	}
				360	}
				361	if (s == expr_end) {
				362	RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
				363	return NULL;
				364	}
				365
				366	len = expr_end - expr_start;
				367	/* Allocate 3 extra bytes: open paren, close paren, null byte. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	368	str = PyMem_Malloc(len + 3);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	369	if (str == NULL) {
				370	PyErr_NoMemory();
				371	return NULL;
				372	}
				373
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	374	// The call to fstring_find_expr_location is responsible for finding the column offset
				375	// the generated AST nodes need to be shifted to the right, which is equal to the number
				376	// of the f-string characters before the expression starts. In order to correctly compute
				377	// this offset, strstr gets called in fstring_find_expr_location which only succeeds
				378	// if curly braces appear before and after the f-string expression (exactly like they do
				379	// in the f-string itself), hence the following lines.
				380	str[0] = '{';
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	381	memcpy(str+1, expr_start, len);
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	382	str[len+1] = '}';
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	383	str[len+2] = 0;
				384
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	385	int lines, cols;
Benjamin Peterson	2ad7e9c	2020-07-16 06:07:29 -0700	[diff] [blame]	386	if (!fstring_find_expr_location(t, str, &lines, &cols)) {
				387	PyMem_FREE(str);
				388	return NULL;
				389	}
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	390
Eric V. Smith	0275e04	2020-07-16 12:10:23 -0400	[diff] [blame]	391	// The parentheses are needed in order to allow for leading whitespace within
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	392	// the f-string expression. This consequently gets parsed as a group (see the
				393	// group rule in python.gram).
				394	str[0] = '(';
				395	str[len+1] = ')';
				396
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	397	struct tok_state* tok = PyTokenizer_FromString(str, 1);
				398	if (tok == NULL) {
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	399	PyMem_Free(str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	400	return NULL;
				401	}
Lysandros Nikolaou	f7b1e46	2020-05-26 03:32:18 +0300	[diff] [blame]	402	Py_INCREF(p->tok->filename);
				403	tok->filename = p->tok->filename;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	404
Lysandros Nikolaou	3e0a6f3	2020-05-01 06:27:52 +0300	[diff] [blame]	405	Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
				406	NULL, p->arena);
Lysandros Nikolaou	1f0f4ab	2020-06-28 02:41:48 +0300	[diff] [blame]	407	p2->starting_lineno = t->lineno + lines - 1;
				408	p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	409
				410	expr = _PyPegen_run_parser(p2);
				411
				412	if (expr == NULL) {
				413	goto exit;
				414	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	415	result = expr;
				416
				417	exit:
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	418	PyMem_Free(str);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	419	_PyPegen_Parser_Free(p2);
				420	PyTokenizer_Free(tok);
				421	return result;
				422	}
				423
				424	/* Return -1 on error.
				425
				426	Return 0 if we reached the end of the literal.
				427
				428	Return 1 if we haven't reached the end of the literal, but we want
				429	the caller to process the literal up to this point. Used for
				430	doubled braces.
				431	*/
				432	static int
				433	fstring_find_literal(Parser p, const char str, const char end, int raw,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	434	PyObject *literal, int recurse_lvl, Token t)
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	435	{
				436	/* Get any literal string. It ends when we hit an un-doubled left
				437	brace (which isn't part of a unicode name escape such as
				438	"\N{EULER CONSTANT}"), or the end of the string. */
				439
				440	const char s = str;
				441	const char *literal_start = s;
				442	int result = 0;
				443
				444	assert(*literal == NULL);
				445	while (s < end) {
				446	char ch = *s++;
				447	if (!raw && ch == '\\' && s < end) {
				448	ch = *s++;
				449	if (ch == 'N') {
				450	if (s < end && *s++ == '{') {
				451	while (s < end && *s++ != '}') {
				452	}
				453	continue;
				454	}
				455	break;
				456	}
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	457	if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	458	return -1;
				459	}
				460	}
				461	if (ch == '{' \|\| ch == '}') {
				462	/* Check for doubled braces, but only at the top level. If
				463	we checked at every level, then f'{0:{3}}' would fail
				464	with the two closing braces. */
				465	if (recurse_lvl == 0) {
				466	if (s < end && *s == ch) {
				467	/* We're going to tell the caller that the literal ends
				468	here, but that they should continue scanning. But also
				469	skip over the second brace when we resume scanning. */
				470	*str = s + 1;
				471	result = 1;
				472	goto done;
				473	}
				474
				475	/* Where a single '{' is the start of a new expression, a
				476	single '}' is not allowed. */
				477	if (ch == '}') {
				478	*str = s - 1;
				479	RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
				480	return -1;
				481	}
				482	}
				483	/* We're either at a '{', which means we're starting another
				484	expression; or a '}', which means we're at the end of this
				485	f-string (for a nested format_spec). */
				486	s--;
				487	break;
				488	}
				489	}
				490	*str = s;
				491	assert(s <= end);
				492	assert(s == end \|\| s == '{' \|\| s == '}');
				493	done:
				494	if (literal_start != s) {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	495	if (raw) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	496	*literal = PyUnicode_DecodeUTF8Stateful(literal_start,
				497	s - literal_start,
				498	NULL, NULL);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	499	} else {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	500	*literal = decode_unicode_with_escapes(p, literal_start,
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	501	s - literal_start, t);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	502	}
				503	if (!*literal) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	504	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	505	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	506	}
				507	return result;
				508	}
				509
				510	/* Forward declaration because parsing is recursive. */
				511	static expr_ty
				512	fstring_parse(Parser p, const char str, const char end, int raw, int recurse_lvl,
				513	Token first_token, Token t, Token *last_token);
				514
				515	/* Parse the f-string at str, ending at end. We know str starts an
				516	expression (so it must be a '{'). Returns the FormattedValue node, which
				517	includes the expression, conversion character, format_spec expression, and
				518	optionally the text of the expression (if = is used).
				519
				520	Note that I don't do a perfect job here: I don't make sure that a
				521	closing brace doesn't match an opening paren, for example. It
				522	doesn't need to error on all invalid expressions, just correctly
				523	find the end of all valid ones. Any errors inside the expression
				524	will be caught when we parse it later.
				525
				526	*expression is set to the expression. For an '=' "debug" expression,
				527	*expr_text is set to the debug text (the original text of the expression,
				528	including the '=' and any whitespace around it, as a string object). If
				529	not a debug expression, expr_text set to NULL. /
				530	static int
				531	fstring_find_expr(Parser p, const char str, const char end, int raw, int recurse_lvl,
				532	PyObject *expr_text, expr_ty expression, Token *first_token,
				533	Token t, Token last_token)
				534	{
				535	/* Return -1 on error, else 0. */
				536
				537	const char *expr_start;
				538	const char *expr_end;
				539	expr_ty simple_expression;
				540	expr_ty format_spec = NULL; /* Optional format specifier. */
				541	int conversion = -1; /* The conversion char. Use default if not
				542	specified, or !r if using = and no format
				543	spec. */
				544
				545	/* 0 if we're not in a string, else the quote char we're trying to
				546	match (single or double quote). */
				547	char quote_char = 0;
				548
				549	/* If we're inside a string, 1=normal, 3=triple-quoted. */
				550	int string_type = 0;
				551
				552	/* Keep track of nesting level for braces/parens/brackets in
				553	expressions. */
				554	Py_ssize_t nested_depth = 0;
				555	char parenstack[MAXLEVEL];
				556
				557	*expr_text = NULL;
				558
				559	/* Can only nest one level deep. */
				560	if (recurse_lvl >= 2) {
				561	RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
				562	goto error;
				563	}
				564
				565	/* The first char must be a left brace, or we wouldn't have gotten
				566	here. Skip over it. */
				567	assert(**str == '{');
				568	*str += 1;
				569
				570	expr_start = *str;
				571	for (; str < end; (str)++) {
				572	char ch;
				573
				574	/* Loop invariants. */
				575	assert(nested_depth >= 0);
				576	assert(str >= expr_start && str < end);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	577	if (quote_char) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	578	assert(string_type == 1 \|\| string_type == 3);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	579	} else {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	580	assert(string_type == 0);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	581	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	582
				583	ch = **str;
				584	/* Nowhere inside an expression is a backslash allowed. */
				585	if (ch == '\\') {
				586	/* Error: can't include a backslash character, inside
				587	parens or strings or not. */
				588	RAISE_SYNTAX_ERROR(
				589	"f-string expression part "
				590	"cannot include a backslash");
				591	goto error;
				592	}
				593	if (quote_char) {
				594	/* We're inside a string. See if we're at the end. */
				595	/* This code needs to implement the same non-error logic
				596	as tok_get from tokenizer.c, at the letter_quote
				597	label. To actually share that code would be a
				598	nightmare. But, it's unlikely to change and is small,
				599	so duplicate it here. Note we don't need to catch all
				600	of the errors, since they'll be caught when parsing the
				601	expression. We just need to match the non-error
				602	cases. Thus we can ignore \n in single-quoted strings,
				603	for example. Or non-terminated strings. */
				604	if (ch == quote_char) {
				605	/* Does this match the string_type (single or triple
				606	quoted)? */
				607	if (string_type == 3) {
				608	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				609	/* We're at the end of a triple quoted string. */
				610	*str += 2;
				611	string_type = 0;
				612	quote_char = 0;
				613	continue;
				614	}
				615	} else {
				616	/* We're at the end of a normal string. */
				617	quote_char = 0;
				618	string_type = 0;
				619	continue;
				620	}
				621	}
				622	} else if (ch == '\'' \|\| ch == '"') {
				623	/* Is this a triple quoted string? */
				624	if (str+2 < end && (str+1) == ch && (*str+2) == ch) {
				625	string_type = 3;
				626	*str += 2;
				627	} else {
				628	/* Start of a normal string. */
				629	string_type = 1;
				630	}
				631	/* Start looking for the end of the string. */
				632	quote_char = ch;
				633	} else if (ch == '[' \|\| ch == '{' \|\| ch == '(') {
				634	if (nested_depth >= MAXLEVEL) {
				635	RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
				636	goto error;
				637	}
				638	parenstack[nested_depth] = ch;
				639	nested_depth++;
				640	} else if (ch == '#') {
				641	/* Error: can't include a comment character, inside parens
				642	or not. */
				643	RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
				644	goto error;
				645	} else if (nested_depth == 0 &&
				646	(ch == '!' \|\| ch == ':' \|\| ch == '}' \|\|
				647	ch == '=' \|\| ch == '>' \|\| ch == '<')) {
				648	/* See if there's a next character. */
				649	if (*str+1 < end) {
				650	char next = (str+1);
				651
				652	/* For "!=". since '=' is not an allowed conversion character,
				653	nothing is lost in this test. */
				654	if ((ch == '!' && next == '=') \|\| /* != */
				655	(ch == '=' && next == '=') \|\| /* == */
				656	(ch == '<' && next == '=') \|\| /* <= */
				657	(ch == '>' && next == '=') /* >= */
				658	) {
				659	*str += 1;
				660	continue;
				661	}
				662	/* Don't get out of the loop for these, if they're single
				663	chars (not part of 2-char tokens). If by themselves, they
				664	don't end an expression (unlike say '!'). */
				665	if (ch == '>' \|\| ch == '<') {
				666	continue;
				667	}
				668	}
				669
				670	/* Normal way out of this loop. */
				671	break;
				672	} else if (ch == ']' \|\| ch == '}' \|\| ch == ')') {
				673	if (!nested_depth) {
				674	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
				675	goto error;
				676	}
				677	nested_depth--;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	678	int opening = (unsigned char)parenstack[nested_depth];
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	679	if (!((opening == '(' && ch == ')') \|\|
				680	(opening == '[' && ch == ']') \|\|
				681	(opening == '{' && ch == '}')))
				682	{
				683	RAISE_SYNTAX_ERROR(
				684	"f-string: closing parenthesis '%c' "
				685	"does not match opening parenthesis '%c'",
				686	ch, opening);
				687	goto error;
				688	}
				689	} else {
				690	/* Just consume this char and loop around. */
				691	}
				692	}
				693	expr_end = *str;
				694	/* If we leave this loop in a string or with mismatched parens, we
				695	don't care. We'll get a syntax error when compiling the
				696	expression. But, we can produce a better error message, so
				697	let's just do that.*/
				698	if (quote_char) {
				699	RAISE_SYNTAX_ERROR("f-string: unterminated string");
				700	goto error;
				701	}
				702	if (nested_depth) {
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	703	int opening = (unsigned char)parenstack[nested_depth - 1];
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	704	RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
				705	goto error;
				706	}
				707
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	708	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	709	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	710	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	711
				712	/* Compile the expression as soon as possible, so we show errors
				713	related to the expression before errors related to the
				714	conversion or format_spec. */
				715	simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	716	if (!simple_expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	717	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	718	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	719
				720	/* Check for =, which puts the text value of the expression in
				721	expr_text. */
				722	if (**str == '=') {
Shantanu	c116c94	2020-05-27 13:30:38 -0700	[diff] [blame]	723	if (p->feature_version < 8) {
				724	RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
				725	"only supported in Python 3.8 and greater");
				726	goto error;
				727	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	728	*str += 1;
				729
				730	/* Skip over ASCII whitespace. No need to test for end of string
				731	here, since we know there's at least a trailing quote somewhere
				732	ahead. */
				733	while (Py_ISSPACE(**str)) {
				734	*str += 1;
				735	}
				736
				737	/* Set expr_text to the text of the expression. /
				738	expr_text = PyUnicode_FromStringAndSize(expr_start, str-expr_start);
				739	if (!*expr_text) {
				740	goto error;
				741	}
				742	}
				743
				744	/* Check for a conversion char, if present. */
				745	if (**str == '!') {
				746	*str += 1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	747	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	748	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	749	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	750
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	751	conversion = (unsigned char)**str;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	752	*str += 1;
				753
				754	/* Validate the conversion. */
				755	if (!(conversion == 's' \|\| conversion == 'r' \|\| conversion == 'a')) {
				756	RAISE_SYNTAX_ERROR(
				757	"f-string: invalid conversion character: "
				758	"expected 's', 'r', or 'a'");
				759	goto error;
				760	}
				761
				762	}
				763
				764	/* Check for the format spec, if present. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	765	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	766	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	767	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	768	if (**str == ':') {
				769	*str += 1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	770	if (*str >= end) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	771	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	772	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	773
				774	/* Parse the format spec. */
				775	format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
				776	first_token, t, last_token);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	777	if (!format_spec) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	778	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	779	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	780	}
				781
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	782	if (str >= end \|\| *str != '}') {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	783	goto unexpected_end_of_string;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	784	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	785
				786	/* We're at a right brace. Consume it. */
				787	assert(*str < end);
				788	assert(**str == '}');
				789	*str += 1;
				790
				791	/* If we're in = mode (detected by non-NULL expr_text), and have no format
				792	spec and no explicit conversion, set the conversion to 'r'. */
				793	if (*expr_text && format_spec == NULL && conversion == -1) {
				794	conversion = 'r';
				795	}
				796
				797	/* And now create the FormattedValue node that represents this
				798	entire expression with the conversion and format spec. */
				799	//TODO: Fix this
				800	*expression = FormattedValue(simple_expression, conversion,
				801	format_spec, first_token->lineno,
				802	first_token->col_offset, last_token->end_lineno,
				803	last_token->end_col_offset, p->arena);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	804	if (!*expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	805	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	806	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	807
				808	return 0;
				809
				810	unexpected_end_of_string:
				811	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				812	/* Falls through to error. */
				813
				814	error:
				815	Py_XDECREF(*expr_text);
				816	return -1;
				817
				818	}
				819
				820	/* Return -1 on error.
				821
				822	Return 0 if we have a literal (possible zero length) and an
				823	expression (zero length if at the end of the string.
				824
				825	Return 1 if we have a literal, but no expression, and we want the
				826	caller to call us again. This is used to deal with doubled
				827	braces.
				828
				829	When called multiple times on the string 'a{{b{0}c', this function
				830	will return:
				831
				832	1. the literal 'a{' with no expression, and a return value
				833	of 1. Despite the fact that there's no expression, the return
				834	value of 1 means we're not finished yet.
				835
				836	2. the literal 'b' and the expression '0', with a return value of
				837	0. The fact that there's an expression means we're not finished.
				838
				839	3. literal 'c' with no expression and a return value of 0. The
				840	combination of the return value of 0 with no expression means
				841	we're finished.
				842	*/
				843	static int
				844	fstring_find_literal_and_expr(Parser p, const char str, const char end, int raw,
				845	int recurse_lvl, PyObject **literal,
				846	PyObject *expr_text, expr_ty expression,
				847	Token first_token, Token t, Token *last_token)
				848	{
				849	int result;
				850
				851	assert(literal == NULL && expression == NULL);
				852
				853	/* Get any literal string. */
Lysandros Nikolaou	2f37c35	2020-05-07 13:37:51 +0300	[diff] [blame]	854	result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	855	if (result < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	856	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	857	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	858
				859	assert(result == 0 \|\| result == 1);
				860
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	861	if (result == 1) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	862	/* We have a literal, but don't look at the expression. */
				863	return 1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	864	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	865
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	866	if (str >= end \|\| *str == '}') {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	867	/* We're at the end of the string or the end of a nested
				868	f-string: no expression. The top-level error case where we
				869	expect to be at the end of the string but we're at a '}' is
				870	handled later. */
				871	return 0;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	872	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	873
				874	/* We must now be the start of an expression, on a '{'. */
				875	assert(**str == '{');
				876
				877	if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	878	expression, first_token, t, last_token) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	879	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	880	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	881
				882	return 0;
				883
				884	error:
				885	Py_CLEAR(*literal);
				886	return -1;
				887	}
				888
				889	#ifdef NDEBUG
				890	#define ExprList_check_invariants(l)
				891	#else
				892	static void
				893	ExprList_check_invariants(ExprList *l)
				894	{
				895	/* Check our invariants. Make sure this object is "live", and
				896	hasn't been deallocated. */
				897	assert(l->size >= 0);
				898	assert(l->p != NULL);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	899	if (l->size <= EXPRLIST_N_CACHED) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	900	assert(l->data == l->p);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	901	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	902	}
				903	#endif
				904
				905	static void
				906	ExprList_Init(ExprList *l)
				907	{
				908	l->allocated = EXPRLIST_N_CACHED;
				909	l->size = 0;
				910
				911	/* Until we start allocating dynamically, p points to data. */
				912	l->p = l->data;
				913
				914	ExprList_check_invariants(l);
				915	}
				916
				917	static int
				918	ExprList_Append(ExprList *l, expr_ty exp)
				919	{
				920	ExprList_check_invariants(l);
				921	if (l->size >= l->allocated) {
				922	/* We need to alloc (or realloc) the memory. */
				923	Py_ssize_t new_size = l->allocated * 2;
				924
				925	/* See if we've ever allocated anything dynamically. */
				926	if (l->p == l->data) {
				927	Py_ssize_t i;
				928	/* We're still using the cached data. Switch to
				929	alloc-ing. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	930	l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	931	if (!l->p) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	932	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	933	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	934	/* Copy the cached data into the new buffer. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	935	for (i = 0; i < l->size; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	936	l->p[i] = l->data[i];
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	937	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	938	} else {
				939	/* Just realloc. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	940	expr_ty tmp = PyMem_Realloc(l->p, sizeof(expr_ty) new_size);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	941	if (!tmp) {
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	942	PyMem_Free(l->p);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	943	l->p = NULL;
				944	return -1;
				945	}
				946	l->p = tmp;
				947	}
				948
				949	l->allocated = new_size;
				950	assert(l->allocated == 2 * l->size);
				951	}
				952
				953	l->p[l->size++] = exp;
				954
				955	ExprList_check_invariants(l);
				956	return 0;
				957	}
				958
				959	static void
				960	ExprList_Dealloc(ExprList *l)
				961	{
				962	ExprList_check_invariants(l);
				963
				964	/* If there's been an error, or we've never dynamically allocated,
				965	do nothing. */
				966	if (!l->p \|\| l->p == l->data) {
				967	/* Do nothing. */
				968	} else {
				969	/* We have dynamically allocated. Free the memory. */
Lysandros Nikolaou	6dcbc24	2020-06-27 20:47:00 +0300	[diff] [blame]	970	PyMem_Free(l->p);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	971	}
				972	l->p = NULL;
				973	l->size = -1;
				974	}
				975
Pablo Galindo	a5634c4	2020-09-16 19:42:00 +0100	[diff] [blame]	976	static asdl_expr_seq *
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	977	ExprList_Finish(ExprList l, PyArena arena)
				978	{
Pablo Galindo	a5634c4	2020-09-16 19:42:00 +0100	[diff] [blame]	979	asdl_expr_seq *seq;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	980
				981	ExprList_check_invariants(l);
				982
				983	/* Allocate the asdl_seq and copy the expressions in to it. */
Pablo Galindo	a5634c4	2020-09-16 19:42:00 +0100	[diff] [blame]	984	seq = _Py_asdl_expr_seq_new(l->size, arena);
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	985	if (seq) {
				986	Py_ssize_t i;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	987	for (i = 0; i < l->size; i++) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	988	asdl_seq_SET(seq, i, l->p[i]);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	989	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	990	}
				991	ExprList_Dealloc(l);
				992	return seq;
				993	}
				994
				995	#ifdef NDEBUG
				996	#define FstringParser_check_invariants(state)
				997	#else
				998	static void
				999	FstringParser_check_invariants(FstringParser *state)
				1000	{
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1001	if (state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1002	assert(PyUnicode_CheckExact(state->last_str));
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1003	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1004	ExprList_check_invariants(&state->expr_list);
				1005	}
				1006	#endif
				1007
				1008	void
				1009	_PyPegen_FstringParser_Init(FstringParser *state)
				1010	{
				1011	state->last_str = NULL;
				1012	state->fmode = 0;
				1013	ExprList_Init(&state->expr_list);
				1014	FstringParser_check_invariants(state);
				1015	}
				1016
				1017	void
				1018	_PyPegen_FstringParser_Dealloc(FstringParser *state)
				1019	{
				1020	FstringParser_check_invariants(state);
				1021
				1022	Py_XDECREF(state->last_str);
				1023	ExprList_Dealloc(&state->expr_list);
				1024	}
				1025
				1026	/* Make a Constant node, but decref the PyUnicode object being added. */
				1027	static expr_ty
				1028	make_str_node_and_del(Parser p, PyObject str, Token first_token, Token *last_token)
				1029	{
				1030	PyObject s = str;
				1031	PyObject *kind = NULL;
				1032	*str = NULL;
				1033	assert(PyUnicode_CheckExact(s));
				1034	if (PyArena_AddPyObject(p->arena, s) < 0) {
				1035	Py_DECREF(s);
				1036	return NULL;
				1037	}
				1038	const char* the_str = PyBytes_AsString(first_token->bytes);
				1039	if (the_str && the_str[0] == 'u') {
				1040	kind = _PyPegen_new_identifier(p, "u");
				1041	}
				1042
				1043	if (kind == NULL && PyErr_Occurred()) {
				1044	return NULL;
				1045	}
				1046
				1047	return Constant(s, kind, first_token->lineno, first_token->col_offset,
				1048	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1049
				1050	}
				1051
				1052
				1053	/* Add a non-f-string (that is, a regular literal string). str is
				1054	decref'd. */
				1055	int
				1056	_PyPegen_FstringParser_ConcatAndDel(FstringParser state, PyObject str)
				1057	{
				1058	FstringParser_check_invariants(state);
				1059
				1060	assert(PyUnicode_CheckExact(str));
				1061
				1062	if (PyUnicode_GET_LENGTH(str) == 0) {
				1063	Py_DECREF(str);
				1064	return 0;
				1065	}
				1066
				1067	if (!state->last_str) {
				1068	/* We didn't have a string before, so just remember this one. */
				1069	state->last_str = str;
				1070	} else {
				1071	/* Concatenate this with the previous string. */
				1072	PyUnicode_AppendAndDel(&state->last_str, str);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1073	if (!state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1074	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1075	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1076	}
				1077	FstringParser_check_invariants(state);
				1078	return 0;
				1079	}
				1080
				1081	/* Parse an f-string. The f-string is in *str to end, with no
				1082	'f' or quotes. */
				1083	int
				1084	_PyPegen_FstringParser_ConcatFstring(Parser p, FstringParser state, const char **str,
				1085	const char *end, int raw, int recurse_lvl,
				1086	Token first_token, Token t, Token *last_token)
				1087	{
				1088	FstringParser_check_invariants(state);
				1089	state->fmode = 1;
				1090
				1091	/* Parse the f-string. */
				1092	while (1) {
				1093	PyObject *literal = NULL;
				1094	PyObject *expr_text = NULL;
				1095	expr_ty expression = NULL;
				1096
				1097	/* If there's a zero length literal in front of the
				1098	expression, literal will be NULL. If we're at the end of
				1099	the f-string, expression will be NULL (unless result == 1,
				1100	see below). */
				1101	int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
				1102	&literal, &expr_text,
				1103	&expression, first_token, t, last_token);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1104	if (result < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1105	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1106	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1107
				1108	/* Add the literal, if any. */
				1109	if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
				1110	Py_XDECREF(expr_text);
				1111	return -1;
				1112	}
				1113	/* Add the expr_text, if any. */
				1114	if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
				1115	return -1;
				1116	}
				1117
				1118	/* We've dealt with the literal and expr_text, their ownership has
				1119	been transferred to the state object. Don't look at them again. */
				1120
				1121	/* See if we should just loop around to get the next literal
				1122	and expression, while ignoring the expression this
				1123	time. This is used for un-doubling braces, as an
				1124	optimization. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1125	if (result == 1) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1126	continue;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1127	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1128
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1129	if (!expression) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1130	/* We're done with this f-string. */
				1131	break;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1132	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1133
				1134	/* We know we have an expression. Convert any existing string
				1135	to a Constant node. */
				1136	if (!state->last_str) {
				1137	/* Do nothing. No previous literal. */
				1138	} else {
				1139	/* Convert the existing last_str literal to a Constant node. */
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1140	expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1141	if (!last_str \|\| ExprList_Append(&state->expr_list, last_str) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1142	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1143	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1144	}
				1145
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1146	if (ExprList_Append(&state->expr_list, expression) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1147	return -1;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1148	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1149	}
				1150
				1151	/* If recurse_lvl is zero, then we must be at the end of the
				1152	string. Otherwise, we must be at a right brace. */
				1153
				1154	if (recurse_lvl == 0 && *str < end-1) {
				1155	RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
				1156	return -1;
				1157	}
				1158	if (recurse_lvl != 0 && **str != '}') {
				1159	RAISE_SYNTAX_ERROR("f-string: expecting '}'");
				1160	return -1;
				1161	}
				1162
				1163	FstringParser_check_invariants(state);
				1164	return 0;
				1165	}
				1166
				1167	/* Convert the partial state reflected in last_str and expr_list to an
				1168	expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
				1169	expr_ty
				1170	_PyPegen_FstringParser_Finish(Parser p, FstringParser state, Token* first_token,
				1171	Token *last_token)
				1172	{
Pablo Galindo	a5634c4	2020-09-16 19:42:00 +0100	[diff] [blame]	1173	asdl_expr_seq *seq;
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1174
				1175	FstringParser_check_invariants(state);
				1176
				1177	/* If we're just a constant string with no expressions, return
				1178	that. */
				1179	if (!state->fmode) {
				1180	assert(!state->expr_list.size);
				1181	if (!state->last_str) {
				1182	/* Create a zero length string. */
				1183	state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1184	if (!state->last_str) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1185	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1186	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1187	}
				1188	return make_str_node_and_del(p, &state->last_str, first_token, last_token);
				1189	}
				1190
				1191	/* Create a Constant node out of last_str, if needed. It will be the
				1192	last node in our expression list. */
				1193	if (state->last_str) {
				1194	expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1195	if (!str \|\| ExprList_Append(&state->expr_list, str) < 0) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1196	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1197	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1198	}
				1199	/* This has already been freed. */
				1200	assert(state->last_str == NULL);
				1201
				1202	seq = ExprList_Finish(&state->expr_list, p->arena);
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1203	if (!seq) {
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1204	goto error;
Pablo Galindo	fb61c42	2020-06-15 14:23:43 +0100	[diff] [blame]	1205	}
Pablo Galindo	c5fc156	2020-04-22 23:29:27 +0100	[diff] [blame]	1206
				1207	return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
				1208	last_token->end_lineno, last_token->end_col_offset, p->arena);
				1209
				1210	error:
				1211	_PyPegen_FstringParser_Dealloc(state);
				1212	return NULL;
				1213	}
				1214
				1215	/* Given an f-string (with no 'f' or quotes) that's in *str and ends
				1216	at end, parse it into an expr_ty. Return NULL on error. Adjust
				1217	str to point past the parsed portion. */
				1218	static expr_ty
				1219	fstring_parse(Parser p, const char str, const char end, int raw,
				1220	int recurse_lvl, Token first_token, Token t, Token *last_token)
				1221	{
				1222	FstringParser state;
				1223
				1224	_PyPegen_FstringParser_Init(&state);
				1225	if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
				1226	first_token, t, last_token) < 0) {
				1227	_PyPegen_FstringParser_Dealloc(&state);
				1228	return NULL;
				1229	}
				1230
				1231	return _PyPegen_FstringParser_Finish(p, &state, t, t);
				1232	}