Blame - llvm/lib/Support/regcomp.c - toolchain/llvm-project

blob: 354e359f676b474e2ddbcc0b8c9aa04e7ea6acea [file] [log] [blame]

Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	1	/*-
				2	* This code is derived from OpenBSD's libc/regex, original license follows:
				3	*
				4	* Copyright (c) 1992, 1993, 1994 Henry Spencer.
				5	* Copyright (c) 1992, 1993, 1994
				6	* The Regents of the University of California. All rights reserved.
				7	*
				8	* This code is derived from software contributed to Berkeley by
				9	* Henry Spencer.
				10	*
				11	* Redistribution and use in source and binary forms, with or without
				12	* modification, are permitted provided that the following conditions
				13	* are met:
				14	* 1. Redistributions of source code must retain the above copyright
				15	* notice, this list of conditions and the following disclaimer.
				16	* 2. Redistributions in binary form must reproduce the above copyright
				17	* notice, this list of conditions and the following disclaimer in the
				18	* documentation and/or other materials provided with the distribution.
				19	* 3. Neither the name of the University nor the names of its contributors
				20	* may be used to endorse or promote products derived from this software
				21	* without specific prior written permission.
				22	*
				23	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
				24	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				25	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				26	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
				27	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
				28	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
				29	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
				30	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
				31	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
				32	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
				33	* SUCH DAMAGE.
				34	*
				35	* @(#)regcomp.c 8.5 (Berkeley) 3/20/94
				36	*/
				37
				38	#include <sys/types.h>
				39	#include <stdio.h>
				40	#include <string.h>
				41	#include <ctype.h>
				42	#include <limits.h>
				43	#include <stdlib.h>
				44	#include "regex_impl.h"
				45
				46	#include "regutils.h"
				47	#include "regex2.h"
				48
Joerg Sonnenberger	a73284a	2015-02-07 21:24:06 +0000	[diff] [blame]	49	#include "llvm/Config/config.h"
				50	#if HAVE_STDINT_H
				51	#include <stdint.h>
				52	#else
				53	/* Pessimistically bound memory use */
				54	#define SIZE_MAX UINT_MAX
				55	#endif
				56
/*
 * character-class table
 *
 * One entry per bracket-expression class name (the NAME in [:NAME:]).
 * `chars' holds the single-character members of the class.  `multis' holds
 * multi-character members as consecutive NUL-terminated strings ended by an
 * empty string; every class defined here has none.  The table is terminated
 * by an entry whose name is NULL.
 */
static struct cclass {
	const char *name;
	const char *chars;
	const char *multis;
} cclasses[] = {
	{ "alnum",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
			"0123456789",				"" },
	{ "alpha",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
								"" },
	{ "blank",	" \t",					"" },
	{ "cntrl",	"\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24"
			"\25\26\27\30\31\32\33\34\35\36\37\177",	"" },
	{ "digit",	"0123456789",				"" },
	{ "graph",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
			"0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
								"" },
	{ "lower",	"abcdefghijklmnopqrstuvwxyz",
								"" },
	{ "print",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
			"0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
								"" },
	{ "punct",	"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
								"" },
	{ "space",	"\t\n\v\f\r ",				"" },
	{ "upper",	"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
								"" },
	{ "xdigit",	"0123456789ABCDEFabcdef",
								"" },
	{ NULL,		0,					"" }
};
				88
				89	/* character-name table */
				90	static struct cname {
				91	const char *name;
				92	char code;
				93	} cnames[] = {
				94	{ "NUL", '\0' },
				95	{ "SOH", '\001' },
				96	{ "STX", '\002' },
				97	{ "ETX", '\003' },
				98	{ "EOT", '\004' },
				99	{ "ENQ", '\005' },
				100	{ "ACK", '\006' },
				101	{ "BEL", '\007' },
				102	{ "alert", '\007' },
				103	{ "BS", '\010' },
				104	{ "backspace", '\b' },
				105	{ "HT", '\011' },
				106	{ "tab", '\t' },
				107	{ "LF", '\012' },
				108	{ "newline", '\n' },
				109	{ "VT", '\013' },
				110	{ "vertical-tab", '\v' },
				111	{ "FF", '\014' },
				112	{ "form-feed", '\f' },
				113	{ "CR", '\015' },
				114	{ "carriage-return", '\r' },
				115	{ "SO", '\016' },
				116	{ "SI", '\017' },
				117	{ "DLE", '\020' },
				118	{ "DC1", '\021' },
				119	{ "DC2", '\022' },
				120	{ "DC3", '\023' },
				121	{ "DC4", '\024' },
				122	{ "NAK", '\025' },
				123	{ "SYN", '\026' },
				124	{ "ETB", '\027' },
				125	{ "CAN", '\030' },
				126	{ "EM", '\031' },
				127	{ "SUB", '\032' },
				128	{ "ESC", '\033' },
				129	{ "IS4", '\034' },
				130	{ "FS", '\034' },
				131	{ "IS3", '\035' },
				132	{ "GS", '\035' },
				133	{ "IS2", '\036' },
				134	{ "RS", '\036' },
				135	{ "IS1", '\037' },
				136	{ "US", '\037' },
				137	{ "space", ' ' },
				138	{ "exclamation-mark", '!' },
				139	{ "quotation-mark", '"' },
				140	{ "number-sign", '#' },
				141	{ "dollar-sign", '$' },
				142	{ "percent-sign", '%' },
				143	{ "ampersand", '&' },
				144	{ "apostrophe", '\'' },
				145	{ "left-parenthesis", '(' },
				146	{ "right-parenthesis", ')' },
				147	{ "asterisk", '*' },
				148	{ "plus-sign", '+' },
				149	{ "comma", ',' },
				150	{ "hyphen", '-' },
				151	{ "hyphen-minus", '-' },
				152	{ "period", '.' },
				153	{ "full-stop", '.' },
				154	{ "slash", '/' },
				155	{ "solidus", '/' },
				156	{ "zero", '0' },
				157	{ "one", '1' },
				158	{ "two", '2' },
				159	{ "three", '3' },
				160	{ "four", '4' },
				161	{ "five", '5' },
				162	{ "six", '6' },
				163	{ "seven", '7' },
				164	{ "eight", '8' },
				165	{ "nine", '9' },
				166	{ "colon", ':' },
				167	{ "semicolon", ';' },
				168	{ "less-than-sign", '<' },
				169	{ "equals-sign", '=' },
				170	{ "greater-than-sign", '>' },
				171	{ "question-mark", '?' },
				172	{ "commercial-at", '@' },
				173	{ "left-square-bracket", '[' },
				174	{ "backslash", '\\' },
				175	{ "reverse-solidus", '\\' },
				176	{ "right-square-bracket", ']' },
				177	{ "circumflex", '^' },
				178	{ "circumflex-accent", '^' },
				179	{ "underscore", '_' },
				180	{ "low-line", '_' },
				181	{ "grave-accent", '`' },
				182	{ "left-brace", '{' },
				183	{ "left-curly-bracket", '{' },
				184	{ "vertical-line", '\|' },
				185	{ "right-brace", '}' },
				186	{ "right-curly-bracket", '}' },
				187	{ "tilde", '~' },
				188	{ "DEL", '\177' },
				189	{ NULL, 0 }
				190	};
				191
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	192	/*
				193	* parse structure, passed up and down to avoid global variables and
				194	* other clumsinesses
				195	*/
				196	struct parse {
				197	char next; / next character in RE */
				198	char end; / end of string (-> NUL normally) */
				199	int error; /* has an error been seen? */
				200	sop strip; / malloced strip */
				201	sopno ssize; /* malloced strip size (allocated) */
				202	sopno slen; /* malloced strip length (used) */
				203	int ncsalloc; /* number of csets allocated */
				204	struct re_guts *g;
				205	# define NPAREN 10 /* we need to remember () 1-9 for back refs */
				206	sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
				207	sopno pend[NPAREN]; /* -> ) ([0] unused) */
				208	};
				209
				210	static void p_ere(struct parse *, int);
				211	static void p_ere_exp(struct parse *);
				212	static void p_str(struct parse *);
				213	static void p_bre(struct parse *, int, int);
				214	static int p_simp_re(struct parse *, int);
				215	static int p_count(struct parse *);
				216	static void p_bracket(struct parse *);
				217	static void p_b_term(struct parse , cset );
				218	static void p_b_cclass(struct parse , cset );
				219	static void p_b_eclass(struct parse , cset );
				220	static char p_b_symbol(struct parse *);
				221	static char p_b_coll_elem(struct parse *, int);
				222	static char othercase(int);
				223	static void bothcases(struct parse *, int);
				224	static void ordinary(struct parse *, int);
				225	static void nonnewline(struct parse *);
				226	static void repeat(struct parse *, sopno, int, int);
				227	static int seterr(struct parse *, int);
				228	static cset allocset(struct parse );
				229	static void freeset(struct parse , cset );
				230	static int freezeset(struct parse , cset );
				231	static int firstch(struct parse , cset );
				232	static int nch(struct parse , cset );
				233	static void mcadd(struct parse , cset , const char *);
				234	static void mcinvert(struct parse , cset );
				235	static void mccase(struct parse , cset );
				236	static int isinsets(struct re_guts *, int);
				237	static int samesets(struct re_guts *, int, int);
				238	static void categorize(struct parse , struct re_guts );
				239	static sopno dupl(struct parse *, sopno, sopno);
				240	static void doemit(struct parse *, sop, size_t);
				241	static void doinsert(struct parse *, sop, size_t, sopno);
				242	static void dofwd(struct parse *, sopno, sop);
				243	static void enlarge(struct parse *, sopno);
				244	static void stripsnug(struct parse , struct re_guts );
				245	static void findmust(struct parse , struct re_guts );
				246	static sopno pluscount(struct parse , struct re_guts );
				247
				248	static char nuls[10]; /* place to point scanner in event of error */
				249
				250	/*
				251	* macros for use with parse structure
				252	* BEWARE: these know that the parse structure is named `p' !!!
				253	*/
				254	#define PEEK() (*p->next)
				255	#define PEEK2() (*(p->next+1))
				256	#define MORE() (p->next < p->end)
				257	#define MORE2() (p->next+1 < p->end)
				258	#define SEE(c) (MORE() && PEEK() == (c))
				259	#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
				260	#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
				261	#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
				262	#define NEXT() (p->next++)
				263	#define NEXT2() (p->next += 2)
				264	#define NEXTn(n) (p->next += (n))
				265	#define GETNEXT() (*p->next++)
				266	#define SETERROR(e) seterr(p, (e))
				267	#define REQUIRE(co, e) (void)((co) \|\| SETERROR(e))
				268	#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
				269	#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
				270	#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() \|\| PEEK() != (c), e))
				271	#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
				272	#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
				273	#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
				274	#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
				275	#define HERE() (p->slen)
				276	#define THERE() (p->slen - 1)
				277	#define THERETHERE() (p->slen - 2)
				278	#define DROP(n) (p->slen -= (n))
				279
				280	#ifdef _POSIX2_RE_DUP_MAX
				281	#define DUPMAX _POSIX2_RE_DUP_MAX
				282	#else
				283	#define DUPMAX 255
				284	#endif
Benjamin Kramer	2b37efa	2009-09-06 12:26:28 +0000	[diff] [blame]	285	#define INFINITY (DUPMAX + 1)
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	286
				287	#ifndef NDEBUG
				288	static int never = 0; /* for use in asserts; shuts lint up */
				289	#else
				290	#define never 0 /* some <assert.h>s have bugs too */
				291	#endif
				292
				293	/*
				294	- llvm_regcomp - interface for parser and compilation
				295	*/
				296	int /* 0 success, otherwise REG_something */
				297	llvm_regcomp(llvm_regex_t preg, const char pattern, int cflags)
				298	{
				299	struct parse pa;
				300	struct re_guts *g;
				301	struct parse *p = &pa;
				302	int i;
				303	size_t len;
				304	#ifdef REDEBUG
				305	# define GOODFLAGS(f) (f)
				306	#else
				307	# define GOODFLAGS(f) ((f)&~REG_DUMP)
				308	#endif
				309
				310	cflags = GOODFLAGS(cflags);
				311	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
				312	return(REG_INVARG);
				313
				314	if (cflags&REG_PEND) {
				315	if (preg->re_endp < pattern)
				316	return(REG_INVARG);
				317	len = preg->re_endp - pattern;
				318	} else
				319	len = strlen((const char *)pattern);
				320
				321	/* do the mallocs early so failure handling is easy */
				322	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
				323	(NC-1)*sizeof(cat_t));
				324	if (g == NULL)
				325	return(REG_ESPACE);
				326	p->ssize = len/(size_t)2(size_t)3 + (size_t)1; / ugh */
				327	p->strip = (sop *)calloc(p->ssize, sizeof(sop));
				328	p->slen = 0;
				329	if (p->strip == NULL) {
				330	free((char *)g);
				331	return(REG_ESPACE);
				332	}
				333
				334	/* set things up */
				335	p->g = g;
				336	p->next = (char )pattern; / convenience; we do not modify it */
				337	p->end = p->next + len;
				338	p->error = 0;
				339	p->ncsalloc = 0;
				340	for (i = 0; i < NPAREN; i++) {
				341	p->pbegin[i] = 0;
				342	p->pend[i] = 0;
				343	}
				344	g->csetsize = NC;
				345	g->sets = NULL;
				346	g->setbits = NULL;
				347	g->ncsets = 0;
				348	g->cflags = cflags;
				349	g->iflags = 0;
				350	g->nbol = 0;
				351	g->neol = 0;
				352	g->must = NULL;
				353	g->mlen = 0;
				354	g->nsub = 0;
				355	g->ncategories = 1; /* category 0 is "everything else" */
				356	g->categories = &g->catspace[-(CHAR_MIN)];
				357	(void) memset((char )g->catspace, 0, NCsizeof(cat_t));
				358	g->backrefs = 0;
				359
				360	/* do it */
				361	EMIT(OEND, 0);
				362	g->firststate = THERE();
				363	if (cflags&REG_EXTENDED)
				364	p_ere(p, OUT);
				365	else if (cflags&REG_NOSPEC)
				366	p_str(p);
				367	else
				368	p_bre(p, OUT, OUT);
				369	EMIT(OEND, 0);
				370	g->laststate = THERE();
				371
				372	/* tidy up loose ends and fill things in */
				373	categorize(p, g);
				374	stripsnug(p, g);
				375	findmust(p, g);
				376	g->nplus = pluscount(p, g);
				377	g->magic = MAGIC2;
				378	preg->re_nsub = g->nsub;
				379	preg->re_g = g;
				380	preg->re_magic = MAGIC1;
				381	#ifndef REDEBUG
				382	/* not debugging, so can't rely on the assert() in llvm_regexec() */
				383	if (g->iflags&REGEX_BAD)
				384	SETERROR(REG_ASSERT);
				385	#endif
				386
				387	/* win or lose, we're done */
				388	if (p->error != 0) /* lose */
				389	llvm_regfree(preg);
				390	return(p->error);
				391	}
				392
				393	/*
				394	- p_ere - ERE parser top level, concatenation and alternation
				395	*/
				396	static void
				397	p_ere(struct parse p, int stop) / character this ERE should end at */
				398	{
				399	char c;
Daniel Dunbar	6e4ed8c	2009-09-08 16:14:54 +0000	[diff] [blame]	400	sopno prevback = 0;
				401	sopno prevfwd = 0;
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	402	sopno conc;
				403	int first = 1; /* is this the first alternative? */
				404
				405	for (;;) {
				406	/* do a bunch of concatenated expressions */
				407	conc = HERE();
				408	while (MORE() && (c = PEEK()) != '\|' && c != stop)
				409	p_ere_exp(p);
				410	REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
				411
				412	if (!EAT('\|'))
				413	break; /* NOTE BREAK OUT */
				414
				415	if (first) {
				416	INSERT(OCH_, conc); /* offset is wrong */
				417	prevfwd = conc;
				418	prevback = conc;
				419	first = 0;
				420	}
				421	ASTERN(OOR1, prevback);
				422	prevback = THERE();
				423	AHEAD(prevfwd); /* fix previous offset */
				424	prevfwd = HERE();
				425	EMIT(OOR2, 0); /* offset is very wrong */
				426	}
				427
				428	if (!first) { /* tail-end fixups */
				429	AHEAD(prevfwd);
				430	ASTERN(O_CH, prevback);
				431	}
				432
				433	assert(!MORE() \|\| SEE(stop));
				434	}
				435
				436	/*
				437	- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
				438	*/
				439	static void
				440	p_ere_exp(struct parse *p)
				441	{
				442	char c;
				443	sopno pos;
				444	int count;
				445	int count2;
Eli Bendersky	10f22d7	2012-11-28 19:00:02 +0000	[diff] [blame]	446	int backrefnum;
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	447	sopno subno;
				448	int wascaret = 0;
				449
				450	assert(MORE()); /* caller should have ensured this */
				451	c = GETNEXT();
				452
				453	pos = HERE();
				454	switch (c) {
				455	case '(':
				456	REQUIRE(MORE(), REG_EPAREN);
				457	p->g->nsub++;
				458	subno = p->g->nsub;
				459	if (subno < NPAREN)
				460	p->pbegin[subno] = HERE();
				461	EMIT(OLPAREN, subno);
				462	if (!SEE(')'))
				463	p_ere(p, ')');
				464	if (subno < NPAREN) {
				465	p->pend[subno] = HERE();
				466	assert(p->pend[subno] != 0);
				467	}
				468	EMIT(ORPAREN, subno);
				469	MUSTEAT(')', REG_EPAREN);
				470	break;
				471	#ifndef POSIX_MISTAKE
				472	case ')': /* happens only if no current unmatched ( */
				473	/*
				474	* You may ask, why the ifndef? Because I didn't notice
				475	* this until slightly too late for 1003.2, and none of the
				476	* other 1003.2 regular-expression reviewers noticed it at
				477	* all. So an unmatched ) is legal POSIX, at least until
				478	* we can get it fixed.
				479	*/
				480	SETERROR(REG_EPAREN);
				481	break;
				482	#endif
				483	case '^':
				484	EMIT(OBOL, 0);
				485	p->g->iflags \|= USEBOL;
				486	p->g->nbol++;
				487	wascaret = 1;
				488	break;
				489	case '$':
				490	EMIT(OEOL, 0);
				491	p->g->iflags \|= USEEOL;
				492	p->g->neol++;
				493	break;
				494	case '\|':
				495	SETERROR(REG_EMPTY);
				496	break;
				497	case '*':
				498	case '+':
				499	case '?':
				500	SETERROR(REG_BADRPT);
				501	break;
				502	case '.':
				503	if (p->g->cflags&REG_NEWLINE)
				504	nonnewline(p);
				505	else
				506	EMIT(OANY, 0);
				507	break;
				508	case '[':
				509	p_bracket(p);
				510	break;
				511	case '\\':
				512	REQUIRE(MORE(), REG_EESCAPE);
				513	c = GETNEXT();
Eli Bendersky	10f22d7	2012-11-28 19:00:02 +0000	[diff] [blame]	514	if (c >= '1' && c <= '9') {
				515	/* \[0-9] is taken to be a back-reference to a previously specified
				516	* matching group. backrefnum will hold the number. The matching
				517	* group must exist (i.e. if \4 is found there must have been at
				518	* least 4 matching groups specified in the pattern previously).
				519	*/
				520	backrefnum = c - '0';
				521	if (p->pend[backrefnum] == 0) {
				522	SETERROR(REG_ESUBREG);
				523	break;
				524	}
				525
				526	/* Make sure everything checks out and emit the sequence
				527	* that marks a back-reference to the parse structure.
				528	*/
				529	assert(backrefnum <= p->g->nsub);
				530	EMIT(OBACK_, backrefnum);
				531	assert(p->pbegin[backrefnum] != 0);
				532	assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
				533	assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
				534	(void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
				535	EMIT(O_BACK, backrefnum);
				536	p->g->backrefs = 1;
				537	} else {
				538	/* Other chars are simply themselves when escaped with a backslash.
				539	*/
				540	ordinary(p, c);
				541	}
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	542	break;
				543	case '{': /* okay as ordinary except if digit follows */
				544	REQUIRE(!MORE() \|\| !isdigit((uch)PEEK()), REG_BADRPT);
				545	/* FALLTHROUGH */
				546	default:
				547	ordinary(p, c);
				548	break;
				549	}
				550
				551	if (!MORE())
				552	return;
				553	c = PEEK();
				554	/* we call { a repetition if followed by a digit */
				555	if (!( c == '*' \|\| c == '+' \|\| c == '?' \|\|
				556	(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
				557	return; /* no repetition, we're done */
				558	NEXT();
				559
				560	REQUIRE(!wascaret, REG_BADRPT);
				561	switch (c) {
				562	case '': / implemented as +? */
				563	/* this case does not require the (y\|) trick, noKLUDGE */
				564	INSERT(OPLUS_, pos);
				565	ASTERN(O_PLUS, pos);
				566	INSERT(OQUEST_, pos);
				567	ASTERN(O_QUEST, pos);
				568	break;
				569	case '+':
				570	INSERT(OPLUS_, pos);
				571	ASTERN(O_PLUS, pos);
				572	break;
				573	case '?':
				574	/* KLUDGE: emit y? as (y\|) until subtle bug gets fixed */
				575	INSERT(OCH_, pos); /* offset slightly wrong */
				576	ASTERN(OOR1, pos); /* this one's right */
				577	AHEAD(pos); /* fix the OCH_ */
				578	EMIT(OOR2, 0); /* offset very wrong... */
				579	AHEAD(THERE()); /* ...so fix it */
				580	ASTERN(O_CH, THERETHERE());
				581	break;
				582	case '{':
				583	count = p_count(p);
				584	if (EAT(',')) {
				585	if (isdigit((uch)PEEK())) {
				586	count2 = p_count(p);
				587	REQUIRE(count <= count2, REG_BADBR);
				588	} else /* single number with comma */
				589	count2 = INFINITY;
				590	} else /* just a single number */
				591	count2 = count;
				592	repeat(p, pos, count, count2);
				593	if (!EAT('}')) { /* error heuristics */
				594	while (MORE() && PEEK() != '}')
				595	NEXT();
				596	REQUIRE(MORE(), REG_EBRACE);
				597	SETERROR(REG_BADBR);
				598	}
				599	break;
				600	}
				601
				602	if (!MORE())
				603	return;
				604	c = PEEK();
				605	if (!( c == '*' \|\| c == '+' \|\| c == '?' \|\|
				606	(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
				607	return;
				608	SETERROR(REG_BADRPT);
				609	}
				610
				611	/*
				612	- p_str - string (no metacharacters) "parser"
				613	*/
				614	static void
				615	p_str(struct parse *p)
				616	{
				617	REQUIRE(MORE(), REG_EMPTY);
				618	while (MORE())
				619	ordinary(p, GETNEXT());
				620	}
				621
				622	/*
				623	- p_bre - BRE parser top level, anchoring and concatenation
				624	* Giving end1 as OUT essentially eliminates the end1/end2 check.
				625	*
				626	* This implementation is a bit of a kludge, in that a trailing $ is first
				627	* taken as an ordinary character and then revised to be an anchor. The
				628	* only undesirable side effect is that '$' gets included as a character
				629	* category in such cases. This is fairly harmless; not worth fixing.
				630	* The amount of lookahead needed to avoid this kludge is excessive.
				631	*/
				632	static void
				633	p_bre(struct parse *p,
				634	int end1, /* first terminating character */
				635	int end2) /* second terminating character */
				636	{
				637	sopno start = HERE();
				638	int first = 1; /* first subexpression? */
				639	int wasdollar = 0;
				640
				641	if (EAT('^')) {
				642	EMIT(OBOL, 0);
				643	p->g->iflags \|= USEBOL;
				644	p->g->nbol++;
				645	}
				646	while (MORE() && !SEETWO(end1, end2)) {
				647	wasdollar = p_simp_re(p, first);
				648	first = 0;
				649	}
				650	if (wasdollar) { /* oops, that was a trailing anchor */
				651	DROP(1);
				652	EMIT(OEOL, 0);
				653	p->g->iflags \|= USEEOL;
				654	p->g->neol++;
				655	}
				656
				657	REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
				658	}
				659
				660	/*
				661	- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
				662	*/
				663	static int /* was the simple RE an unbackslashed $? */
				664	p_simp_re(struct parse *p,
				665	int starordinary) /* is a leading * an ordinary character? */
				666	{
				667	int c;
				668	int count;
				669	int count2;
				670	sopno pos;
				671	int i;
				672	sopno subno;
				673	# define BACKSL (1<<CHAR_BIT)
				674
Alp Toker	cb40291	2014-01-24 17:20:08 +0000	[diff] [blame]	675	pos = HERE(); /* repetition op, if any, covers from here */
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	676
Alp Toker	cb40291	2014-01-24 17:20:08 +0000	[diff] [blame]	677	assert(MORE()); /* caller should have ensured this */
				678	c = GETNEXT();
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	679	if (c == '\\') {
				680	REQUIRE(MORE(), REG_EESCAPE);
				681	c = BACKSL \| GETNEXT();
				682	}
				683	switch (c) {
				684	case '.':
				685	if (p->g->cflags&REG_NEWLINE)
				686	nonnewline(p);
				687	else
				688	EMIT(OANY, 0);
				689	break;
				690	case '[':
				691	p_bracket(p);
				692	break;
				693	case BACKSL\|'{':
				694	SETERROR(REG_BADRPT);
				695	break;
				696	case BACKSL\|'(':
				697	p->g->nsub++;
				698	subno = p->g->nsub;
				699	if (subno < NPAREN)
				700	p->pbegin[subno] = HERE();
				701	EMIT(OLPAREN, subno);
				702	/* the MORE here is an error heuristic */
				703	if (MORE() && !SEETWO('\\', ')'))
				704	p_bre(p, '\\', ')');
				705	if (subno < NPAREN) {
				706	p->pend[subno] = HERE();
				707	assert(p->pend[subno] != 0);
				708	}
				709	EMIT(ORPAREN, subno);
				710	REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
				711	break;
				712	case BACKSL\|')': /* should not get here -- must be user */
				713	case BACKSL\|'}':
				714	SETERROR(REG_EPAREN);
				715	break;
				716	case BACKSL\|'1':
				717	case BACKSL\|'2':
				718	case BACKSL\|'3':
				719	case BACKSL\|'4':
				720	case BACKSL\|'5':
				721	case BACKSL\|'6':
				722	case BACKSL\|'7':
				723	case BACKSL\|'8':
				724	case BACKSL\|'9':
				725	i = (c&~BACKSL) - '0';
				726	assert(i < NPAREN);
				727	if (p->pend[i] != 0) {
				728	assert(i <= p->g->nsub);
				729	EMIT(OBACK_, i);
				730	assert(p->pbegin[i] != 0);
				731	assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
				732	assert(OP(p->strip[p->pend[i]]) == ORPAREN);
				733	(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
				734	EMIT(O_BACK, i);
				735	} else
				736	SETERROR(REG_ESUBREG);
				737	p->g->backrefs = 1;
				738	break;
				739	case '*':
				740	REQUIRE(starordinary, REG_BADRPT);
				741	/* FALLTHROUGH */
				742	default:
				743	ordinary(p, (char)c);
				744	break;
				745	}
				746
				747	if (EAT('')) { / implemented as +? */
				748	/* this case does not require the (y\|) trick, noKLUDGE */
				749	INSERT(OPLUS_, pos);
				750	ASTERN(O_PLUS, pos);
				751	INSERT(OQUEST_, pos);
				752	ASTERN(O_QUEST, pos);
				753	} else if (EATTWO('\\', '{')) {
				754	count = p_count(p);
				755	if (EAT(',')) {
				756	if (MORE() && isdigit((uch)PEEK())) {
				757	count2 = p_count(p);
				758	REQUIRE(count <= count2, REG_BADBR);
				759	} else /* single number with comma */
				760	count2 = INFINITY;
				761	} else /* just a single number */
				762	count2 = count;
				763	repeat(p, pos, count, count2);
				764	if (!EATTWO('\\', '}')) { /* error heuristics */
				765	while (MORE() && !SEETWO('\\', '}'))
				766	NEXT();
				767	REQUIRE(MORE(), REG_EBRACE);
				768	SETERROR(REG_BADBR);
				769	}
				770	} else if (c == '$') /* $ (but not \$) ends it */
				771	return(1);
				772
				773	return(0);
				774	}
				775
				776	/*
				777	- p_count - parse a repetition count
				778	*/
				779	static int /* the value */
				780	p_count(struct parse *p)
				781	{
				782	int count = 0;
				783	int ndigits = 0;
				784
				785	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
				786	count = count*10 + (GETNEXT() - '0');
				787	ndigits++;
				788	}
				789
				790	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
				791	return(count);
				792	}
				793
				794	/*
				795	- p_bracket - parse a bracketed character list
				796	*
				797	* Note a significant property of this code: if the allocset() did SETERROR,
				798	* no set operations are done.
				799	*/
				800	static void
				801	p_bracket(struct parse *p)
				802	{
				803	cset *cs;
				804	int invert = 0;
				805
				806	/* Dept of Truly Sickening Special-Case Kludges */
				807	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
				808	EMIT(OBOW, 0);
				809	NEXTn(6);
				810	return;
				811	}
				812	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
				813	EMIT(OEOW, 0);
				814	NEXTn(6);
				815	return;
				816	}
				817
				818	if ((cs = allocset(p)) == NULL) {
				819	/* allocset did set error status in p */
				820	return;
				821	}
				822
				823	if (EAT('^'))
				824	invert++; /* make note to invert set at end */
				825	if (EAT(']'))
				826	CHadd(cs, ']');
				827	else if (EAT('-'))
				828	CHadd(cs, '-');
				829	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
				830	p_b_term(p, cs);
				831	if (EAT('-'))
				832	CHadd(cs, '-');
				833	MUSTEAT(']', REG_EBRACK);
				834
				835	if (p->error != 0) { /* don't mess things up further */
				836	freeset(p, cs);
				837	return;
				838	}
				839
				840	if (p->g->cflags&REG_ICASE) {
				841	int i;
				842	int ci;
				843
				844	for (i = p->g->csetsize - 1; i >= 0; i--)
				845	if (CHIN(cs, i) && isalpha(i)) {
				846	ci = othercase(i);
				847	if (ci != i)
				848	CHadd(cs, ci);
				849	}
				850	if (cs->multis != NULL)
				851	mccase(p, cs);
				852	}
				853	if (invert) {
				854	int i;
				855
				856	for (i = p->g->csetsize - 1; i >= 0; i--)
				857	if (CHIN(cs, i))
				858	CHsub(cs, i);
				859	else
				860	CHadd(cs, i);
				861	if (p->g->cflags&REG_NEWLINE)
				862	CHsub(cs, '\n');
				863	if (cs->multis != NULL)
				864	mcinvert(p, cs);
				865	}
				866
				867	assert(cs->multis == NULL); /* xxx */
				868
				869	if (nch(p, cs) == 1) { /* optimize singleton sets */
				870	ordinary(p, firstch(p, cs));
				871	freeset(p, cs);
				872	} else
				873	EMIT(OANYOF, freezeset(p, cs));
				874	}
				875
				876	/*
				877	- p_b_term - parse one term of a bracketed character list
				878	*/
				879	static void
				880	p_b_term(struct parse p, cset cs)
				881	{
				882	char c;
				883	char start, finish;
				884	int i;
				885
				886	/* classify what we've got */
				887	switch ((MORE()) ? PEEK() : '\0') {
				888	case '[':
				889	c = (MORE2()) ? PEEK2() : '\0';
				890	break;
				891	case '-':
				892	SETERROR(REG_ERANGE);
				893	return; /* NOTE RETURN */
				894	break;
				895	default:
				896	c = '\0';
				897	break;
				898	}
				899
				900	switch (c) {
				901	case ':': /* character class */
				902	NEXT2();
				903	REQUIRE(MORE(), REG_EBRACK);
				904	c = PEEK();
				905	REQUIRE(c != '-' && c != ']', REG_ECTYPE);
				906	p_b_cclass(p, cs);
				907	REQUIRE(MORE(), REG_EBRACK);
				908	REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
				909	break;
				910	case '=': /* equivalence class */
				911	NEXT2();
				912	REQUIRE(MORE(), REG_EBRACK);
				913	c = PEEK();
				914	REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
				915	p_b_eclass(p, cs);
				916	REQUIRE(MORE(), REG_EBRACK);
				917	REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
				918	break;
				919	default: /* symbol, ordinary character, or range */
				920	/* xxx revision needed for multichar stuff */
				921	start = p_b_symbol(p);
				922	if (SEE('-') && MORE2() && PEEK2() != ']') {
				923	/* range */
				924	NEXT();
				925	if (EAT('-'))
				926	finish = '-';
				927	else
				928	finish = p_b_symbol(p);
				929	} else
				930	finish = start;
				931	/* xxx what about signed chars here... */
				932	REQUIRE(start <= finish, REG_ERANGE);
				933	for (i = start; i <= finish; i++)
				934	CHadd(cs, i);
				935	break;
				936	}
				937	}
				938
				939	/*
				940	- p_b_cclass - parse a character-class name and deal with it
				941	*/
				942	static void
				943	p_b_cclass(struct parse p, cset cs)
				944	{
				945	char *sp = p->next;
				946	struct cclass *cp;
				947	size_t len;
				948	const char *u;
				949	char c;
				950
NAKAMURA Takumi	5918457	2011-03-08 12:25:29 +0000	[diff] [blame]	951	while (MORE() && isalpha((uch)PEEK()))
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	952	NEXT();
				953	len = p->next - sp;
				954	for (cp = cclasses; cp->name != NULL; cp++)
				955	if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
				956	break;
				957	if (cp->name == NULL) {
				958	/* oops, didn't find it */
				959	SETERROR(REG_ECTYPE);
				960	return;
				961	}
				962
				963	u = cp->chars;
				964	while ((c = *u++) != '\0')
				965	CHadd(cs, c);
				966	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
				967	MCadd(p, cs, u);
				968	}
				969
				970	/*
				971	- p_b_eclass - parse an equivalence-class name and deal with it
				972	*
				973	* This implementation is incomplete. xxx
				974	*/
				975	static void
				976	p_b_eclass(struct parse p, cset cs)
				977	{
				978	char c;
				979
				980	c = p_b_coll_elem(p, '=');
				981	CHadd(cs, c);
				982	}
				983
				984	/*
				985	- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
				986	*/
				987	static char /* value of symbol */
				988	p_b_symbol(struct parse *p)
				989	{
				990	char value;
				991
				992	REQUIRE(MORE(), REG_EBRACK);
				993	if (!EATTWO('[', '.'))
				994	return(GETNEXT());
				995
				996	/* collating symbol */
				997	value = p_b_coll_elem(p, '.');
				998	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
				999	return(value);
				1000	}
				1001
				1002	/*
				1003	- p_b_coll_elem - parse a collating-element name and look it up
				1004	*/
				1005	static char /* value of collating element */
				1006	p_b_coll_elem(struct parse *p,
				1007	int endc) /* name ended by endc,']' */
				1008	{
				1009	char *sp = p->next;
				1010	struct cname *cp;
Vlad Tsyrklevich	b42db15	2017-10-27 19:15:13 +0000	[diff] [blame]	1011	size_t len;
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	1012
				1013	while (MORE() && !SEETWO(endc, ']'))
				1014	NEXT();
				1015	if (!MORE()) {
				1016	SETERROR(REG_EBRACK);
				1017	return(0);
				1018	}
				1019	len = p->next - sp;
				1020	for (cp = cnames; cp->name != NULL; cp++)
Vlad Tsyrklevich	b42db15	2017-10-27 19:15:13 +0000	[diff] [blame]	1021	if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	1022	return(cp->code); /* known name */
				1023	if (len == 1)
				1024	return(sp); / single character */
				1025	SETERROR(REG_ECOLLATE); /* neither */
				1026	return(0);
				1027	}
				1028
				1029	/*
				1030	- othercase - return the case counterpart of an alphabetic
				1031	*/
				1032	static char /* if no counterpart, return ch */
				1033	othercase(int ch)
				1034	{
				1035	ch = (uch)ch;
				1036	assert(isalpha(ch));
				1037	if (isupper(ch))
				1038	return ((uch)tolower(ch));
				1039	else if (islower(ch))
				1040	return ((uch)toupper(ch));
				1041	else /* peculiar, but could happen */
				1042	return(ch);
				1043	}
				1044
				1045	/*
				1046	- bothcases - emit a dualcase version of a two-case character
				1047	*
				1048	* Boy, is this implementation ever a kludge...
				1049	*/
				1050	static void
				1051	bothcases(struct parse *p, int ch)
				1052	{
				1053	char *oldnext = p->next;
				1054	char *oldend = p->end;
				1055	char bracket[3];
				1056
				1057	ch = (uch)ch;
				1058	assert(othercase(ch) != ch); /* p_bracket() would recurse */
				1059	p->next = bracket;
				1060	p->end = bracket+2;
				1061	bracket[0] = ch;
				1062	bracket[1] = ']';
				1063	bracket[2] = '\0';
				1064	p_bracket(p);
				1065	assert(p->next == bracket+2);
				1066	p->next = oldnext;
				1067	p->end = oldend;
				1068	}
				1069
				1070	/*
				1071	- ordinary - emit an ordinary character
				1072	*/
				1073	static void
				1074	ordinary(struct parse *p, int ch)
				1075	{
				1076	cat_t *cap = p->g->categories;
				1077
				1078	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
				1079	bothcases(p, ch);
				1080	else {
				1081	EMIT(OCHAR, (uch)ch);
				1082	if (cap[ch] == 0)
				1083	cap[ch] = p->g->ncategories++;
				1084	}
				1085	}
				1086
				1087	/*
				1088	- nonnewline - emit REG_NEWLINE version of OANY
				1089	*
				1090	* Boy, is this implementation ever a kludge...
				1091	*/
				1092	static void
				1093	nonnewline(struct parse *p)
				1094	{
				1095	char *oldnext = p->next;
				1096	char *oldend = p->end;
				1097	char bracket[4];
				1098
				1099	p->next = bracket;
				1100	p->end = bracket+3;
				1101	bracket[0] = '^';
				1102	bracket[1] = '\n';
				1103	bracket[2] = ']';
				1104	bracket[3] = '\0';
				1105	p_bracket(p);
				1106	assert(p->next == bracket+3);
				1107	p->next = oldnext;
				1108	p->end = oldend;
				1109	}
				1110
/*
 - repeat - generate code for a bounded repetition, recursively if needed
 *
 * finish records HERE() on entry; per the parameter comment the operand
 * runs from start to the end of the strip.  The (from, to) pair is reduced
 * by MAP() to one of 0, 1, N ("several") or INF and combined by REP() into
 * a single switch value.  Each shape is either emitted directly or
 * rewritten into a smaller repetition handled by a recursive call.
 */
static void
repeat(struct parse *p,
    sopno start,		/* operand from here to end of strip */
    int from,			/* repeated from this number */
    int to)			/* to this number of times (maybe INFINITY) */
{
	sopno finish = HERE();
#	define	N	2
#	define	INF	3
#	define	REP(f, t)	((f)*8 + (t))
#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
	sopno copy;

	if (p->error != 0)	/* head off possible runaway recursion */
		return;

	assert(from <= to);

	switch (REP(MAP(from), MAP(to))) {
	case REP(0, 0):			/* must be user doing this */
		DROP(finish-start);	/* drop the operand */
		break;
	case REP(0, 1):			/* as x{1,1}? */
	case REP(0, N):			/* as x{1,n}? */
	case REP(0, INF):		/* as x{1,}? */
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
		INSERT(OCH_, start);		/* offset is wrong... */
		/* operand now begins one slot later because of the inserted op */
		repeat(p, start+1, 1, to);
		ASTERN(OOR1, start);
		AHEAD(start);			/* ... fix it */
		EMIT(OOR2, 0);
		AHEAD(THERE());
		ASTERN(O_CH, THERETHERE());
		break;
	case REP(1, 1):			/* trivial case */
		/* done */
		break;
	case REP(1, N):			/* as x?x{1,n-1} */
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
		INSERT(OCH_, start);
		ASTERN(OOR1, start);
		AHEAD(start);
		EMIT(OOR2, 0);			/* offset very wrong... */
		AHEAD(THERE());			/* ...so fix it */
		ASTERN(O_CH, THERETHERE());
		/* the original operand sits one slot later, after the OCH_ */
		copy = dupl(p, start+1, finish+1);
		/* presumably one slot each for OCH_, OOR1, OOR2 and O_CH */
		assert(copy == finish+4);
		repeat(p, copy, 1, to-1);
		break;
	case REP(1, INF):		/* as x+ */
		INSERT(OPLUS_, start);
		ASTERN(O_PLUS, start);
		break;
	case REP(N, N):			/* as xx{m-1,n-1} */
		copy = dupl(p, start, finish);
		repeat(p, copy, from-1, to-1);
		break;
	case REP(N, INF):		/* as xx{n-1,INF} */
		copy = dupl(p, start, finish);
		repeat(p, copy, from-1, to);
		break;
	default:			/* "can't happen" */
		SETERROR(REG_ASSERT);	/* just in case */
		break;
	}
}
				1180
				1181	/*
				1182	- seterr - set an error condition
				1183	*/
				1184	static int /* useless but makes type checking happy */
				1185	seterr(struct parse *p, int e)
				1186	{
				1187	if (p->error == 0) /* keep earliest error condition */
				1188	p->error = e;
				1189	p->next = nuls; /* try to bring things to a halt */
				1190	p->end = nuls;
				1191	return(0); /* make the return value well-defined */
				1192	}
				1193
				1194	/*
				1195	- allocset - allocate a set of characters for []
				1196	*/
				1197	static cset *
				1198	allocset(struct parse *p)
				1199	{
				1200	int no = p->g->ncsets++;
				1201	size_t nc;
				1202	size_t nbytes;
				1203	cset *cs;
				1204	size_t css = (size_t)p->g->csetsize;
				1205	int i;
				1206
				1207	if (no >= p->ncsalloc) { /* need another column of space */
				1208	void *ptr;
				1209
				1210	p->ncsalloc += CHAR_BIT;
				1211	nc = p->ncsalloc;
Joerg Sonnenberger	a73284a	2015-02-07 21:24:06 +0000	[diff] [blame]	1212	if (nc > SIZE_MAX / sizeof(cset))
				1213	goto nomem;
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	1214	assert(nc % CHAR_BIT == 0);
				1215	nbytes = nc / CHAR_BIT * css;
				1216
				1217	ptr = (cset )realloc((char )p->g->sets, nc * sizeof(cset));
				1218	if (ptr == NULL)
				1219	goto nomem;
				1220	p->g->sets = ptr;
				1221
				1222	ptr = (uch )realloc((char )p->g->setbits, nbytes);
				1223	if (ptr == NULL)
				1224	goto nomem;
				1225	p->g->setbits = ptr;
				1226
				1227	for (i = 0; i < no; i++)
				1228	p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
				1229
				1230	(void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
				1231	}
				1232	/* XXX should not happen */
				1233	if (p->g->sets == NULL \|\| p->g->setbits == NULL)
				1234	goto nomem;
				1235
				1236	cs = &p->g->sets[no];
				1237	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
				1238	cs->mask = 1 << ((no) % CHAR_BIT);
				1239	cs->hash = 0;
				1240	cs->smultis = 0;
				1241	cs->multis = NULL;
				1242
				1243	return(cs);
				1244	nomem:
				1245	free(p->g->sets);
				1246	p->g->sets = NULL;
				1247	free(p->g->setbits);
				1248	p->g->setbits = NULL;
				1249
				1250	SETERROR(REG_ESPACE);
				1251	/* caller's responsibility not to do set ops */
				1252	return(NULL);
				1253	}
				1254
				1255	/*
				1256	- freeset - free a now-unused set
				1257	*/
				1258	static void
				1259	freeset(struct parse p, cset cs)
				1260	{
				1261	size_t i;
				1262	cset *top = &p->g->sets[p->g->ncsets];
				1263	size_t css = (size_t)p->g->csetsize;
				1264
				1265	for (i = 0; i < css; i++)
				1266	CHsub(cs, i);
				1267	if (cs == top-1) /* recover only the easy case */
				1268	p->g->ncsets--;
				1269	}
				1270
				1271	/*
				1272	- freezeset - final processing on a set of characters
				1273	*
				1274	* The main task here is merging identical sets. This is usually a waste
				1275	* of time (although the hash code minimizes the overhead), but can win
				1276	* big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
				1277	* is done using addition rather than xor -- all ASCII [aA] sets xor to
				1278	* the same value!
				1279	*/
				1280	static int /* set number */
				1281	freezeset(struct parse p, cset cs)
				1282	{
				1283	uch h = cs->hash;
				1284	size_t i;
				1285	cset *top = &p->g->sets[p->g->ncsets];
				1286	cset *cs2;
				1287	size_t css = (size_t)p->g->csetsize;
				1288
				1289	/* look for an earlier one which is the same */
				1290	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
				1291	if (cs2->hash == h && cs2 != cs) {
				1292	/* maybe */
				1293	for (i = 0; i < css; i++)
				1294	if (!!CHIN(cs2, i) != !!CHIN(cs, i))
				1295	break; /* no */
				1296	if (i == css)
				1297	break; /* yes */
				1298	}
				1299
				1300	if (cs2 < top) { /* found one */
				1301	freeset(p, cs);
				1302	cs = cs2;
				1303	}
				1304
				1305	return((int)(cs - p->g->sets));
				1306	}
				1307
				1308	/*
				1309	- firstch - return first character in a set (which must have at least one)
				1310	*/
				1311	static int /* character; there is no "none" value */
				1312	firstch(struct parse p, cset cs)
				1313	{
				1314	size_t i;
				1315	size_t css = (size_t)p->g->csetsize;
				1316
				1317	for (i = 0; i < css; i++)
				1318	if (CHIN(cs, i))
				1319	return((char)i);
				1320	assert(never);
				1321	return(0); /* arbitrary */
				1322	}
				1323
				1324	/*
				1325	- nch - number of characters in a set
				1326	*/
				1327	static int
				1328	nch(struct parse p, cset cs)
				1329	{
				1330	size_t i;
				1331	size_t css = (size_t)p->g->csetsize;
				1332	int n = 0;
				1333
				1334	for (i = 0; i < css; i++)
				1335	if (CHIN(cs, i))
				1336	n++;
				1337	return(n);
				1338	}
				1339
				1340	/*
				1341	- mcadd - add a collating element to a cset
				1342	*/
				1343	static void
				1344	mcadd( struct parse p, cset cs, const char *cp)
				1345	{
				1346	size_t oldend = cs->smultis;
				1347	void *np;
				1348
				1349	cs->smultis += strlen(cp) + 1;
				1350	np = realloc(cs->multis, cs->smultis);
				1351	if (np == NULL) {
				1352	if (cs->multis)
				1353	free(cs->multis);
				1354	cs->multis = NULL;
				1355	SETERROR(REG_ESPACE);
				1356	return;
				1357	}
				1358	cs->multis = np;
				1359
				1360	llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
				1361	}
				1362
				1363	/*
				1364	- mcinvert - invert the list of collating elements in a cset
				1365	*
				1366	* This would have to know the set of possibilities. Implementation
				1367	* is deferred.
				1368	*/
				1369	/* ARGSUSED */
				1370	static void
				1371	mcinvert(struct parse p, cset cs)
				1372	{
				1373	assert(cs->multis == NULL); /* xxx */
				1374	}
				1375
				1376	/*
				1377	- mccase - add case counterparts of the list of collating elements in a cset
				1378	*
				1379	* This would have to know the set of possibilities. Implementation
				1380	* is deferred.
				1381	*/
				1382	/* ARGSUSED */
				1383	static void
				1384	mccase(struct parse p, cset cs)
				1385	{
				1386	assert(cs->multis == NULL); /* xxx */
				1387	}
				1388
				1389	/*
				1390	- isinsets - is this character in any sets?
				1391	*/
				1392	static int /* predicate */
				1393	isinsets(struct re_guts *g, int c)
				1394	{
				1395	uch *col;
				1396	int i;
				1397	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
				1398	unsigned uc = (uch)c;
				1399
				1400	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
				1401	if (col[uc] != 0)
				1402	return(1);
				1403	return(0);
				1404	}
				1405
				1406	/*
				1407	- samesets - are these two characters in exactly the same sets?
				1408	*/
				1409	static int /* predicate */
				1410	samesets(struct re_guts *g, int c1, int c2)
				1411	{
				1412	uch *col;
				1413	int i;
				1414	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
				1415	unsigned uc1 = (uch)c1;
				1416	unsigned uc2 = (uch)c2;
				1417
				1418	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
				1419	if (col[uc1] != col[uc2])
				1420	return(0);
				1421	return(1);
				1422	}
				1423
				1424	/*
				1425	- categorize - sort out character categories
				1426	*/
				1427	static void
				1428	categorize(struct parse p, struct re_guts g)
				1429	{
				1430	cat_t *cats = g->categories;
				1431	int c;
				1432	int c2;
				1433	cat_t cat;
				1434
				1435	/* avoid making error situations worse */
				1436	if (p->error != 0)
				1437	return;
				1438
				1439	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
				1440	if (cats[c] == 0 && isinsets(g, c)) {
				1441	cat = g->ncategories++;
				1442	cats[c] = cat;
				1443	for (c2 = c+1; c2 <= CHAR_MAX; c2++)
				1444	if (cats[c2] == 0 && samesets(g, c, c2))
				1445	cats[c2] = cat;
				1446	}
				1447	}
				1448
				1449	/*
				1450	- dupl - emit a duplicate of a bunch of sops
				1451	*/
				1452	static sopno /* start of duplicate */
				1453	dupl(struct parse *p,
				1454	sopno start, /* from here */
				1455	sopno finish) /* to this less one */
				1456	{
				1457	sopno ret = HERE();
				1458	sopno len = finish - start;
				1459
				1460	assert(finish >= start);
				1461	if (len == 0)
				1462	return(ret);
				1463	enlarge(p, p->ssize + len); /* this many unexpected additions */
				1464	assert(p->ssize >= p->slen + len);
				1465	(void) memmove((char *)(p->strip + p->slen),
				1466	(char )(p->strip + start), (size_t)lensizeof(sop));
				1467	p->slen += len;
				1468	return(ret);
				1469	}
				1470
				1471	/*
				1472	- doemit - emit a strip operator
				1473	*
				1474	* It might seem better to implement this as a macro with a function as
				1475	* hard-case backup, but it's just too big and messy unless there are
				1476	* some changes to the data structures. Maybe later.
				1477	*/
				1478	static void
				1479	doemit(struct parse *p, sop op, size_t opnd)
				1480	{
				1481	/* avoid making error situations worse */
				1482	if (p->error != 0)
				1483	return;
				1484
				1485	/* deal with oversize operands ("can't happen", more or less) */
				1486	assert(opnd < 1<<OPSHIFT);
				1487
				1488	/* deal with undersized strip */
				1489	if (p->slen >= p->ssize)
				1490	enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
				1491	assert(p->slen < p->ssize);
				1492
				1493	/* finally, it's all reduced to the easy case */
				1494	p->strip[p->slen++] = SOP(op, opnd);
				1495	}
				1496
				1497	/*
				1498	- doinsert - insert a sop into the strip
				1499	*/
				1500	static void
				1501	doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
				1502	{
				1503	sopno sn;
				1504	sop s;
				1505	int i;
				1506
				1507	/* avoid making error situations worse */
				1508	if (p->error != 0)
				1509	return;
				1510
				1511	sn = HERE();
				1512	EMIT(op, opnd); /* do checks, ensure space */
				1513	assert(HERE() == sn+1);
				1514	s = p->strip[sn];
				1515
				1516	/* adjust paren pointers */
				1517	assert(pos > 0);
				1518	for (i = 1; i < NPAREN; i++) {
				1519	if (p->pbegin[i] >= pos) {
				1520	p->pbegin[i]++;
				1521	}
				1522	if (p->pend[i] >= pos) {
				1523	p->pend[i]++;
				1524	}
				1525	}
				1526
				1527	memmove((char )&p->strip[pos+1], (char )&p->strip[pos],
				1528	(HERE()-pos-1)*sizeof(sop));
				1529	p->strip[pos] = s;
				1530	}
				1531
				1532	/*
				1533	- dofwd - complete a forward reference
				1534	*/
				1535	static void
				1536	dofwd(struct parse *p, sopno pos, sop value)
				1537	{
				1538	/* avoid making error situations worse */
				1539	if (p->error != 0)
				1540	return;
				1541
				1542	assert(value < 1<<OPSHIFT);
				1543	p->strip[pos] = OP(p->strip[pos]) \| value;
				1544	}
				1545
				1546	/*
				1547	- enlarge - enlarge the strip
				1548	*/
				1549	static void
				1550	enlarge(struct parse *p, sopno size)
				1551	{
				1552	sop *sp;
				1553
				1554	if (p->ssize >= size)
				1555	return;
				1556
Reid Kleckner	1aecd5b	2015-02-26 21:07:30 +0000	[diff] [blame]	1557	if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
Joerg Sonnenberger	a73284a	2015-02-07 21:24:06 +0000	[diff] [blame]	1558	SETERROR(REG_ESPACE);
				1559	return;
				1560	}
				1561
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	1562	sp = (sop )realloc(p->strip, sizesizeof(sop));
				1563	if (sp == NULL) {
				1564	SETERROR(REG_ESPACE);
				1565	return;
				1566	}
				1567	p->strip = sp;
				1568	p->ssize = size;
				1569	}
				1570
				1571	/*
				1572	- stripsnug - compact the strip
				1573	*/
				1574	static void
				1575	stripsnug(struct parse p, struct re_guts g)
				1576	{
				1577	g->nstates = p->slen;
Reid Kleckner	1aecd5b	2015-02-26 21:07:30 +0000	[diff] [blame]	1578	if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
Joerg Sonnenberger	a73284a	2015-02-07 21:24:06 +0000	[diff] [blame]	1579	g->strip = p->strip;
				1580	SETERROR(REG_ESPACE);
				1581	return;
				1582	}
				1583
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	1584	g->strip = (sop )realloc((char )p->strip, p->slen * sizeof(sop));
				1585	if (g->strip == NULL) {
				1586	SETERROR(REG_ESPACE);
				1587	g->strip = p->strip;
				1588	}
				1589	}
				1590
				1591	/*
				1592	- findmust - fill in must and mlen with longest mandatory literal string
				1593	*
				1594	* This algorithm could do fancy things like analyzing the operands of \|
				1595	* for common subsequences. Someday. This code is simple and finds most
				1596	* of the interesting cases.
				1597	*
				1598	* Note that must and mlen got initialized during setup.
				1599	*/
				1600	static void
				1601	findmust(struct parse p, struct re_guts g)
				1602	{
				1603	sop *scan;
Daniel Dunbar	bf75ffd	2009-08-30 21:13:58 +0000	[diff] [blame]	1604	sop start = 0; / start initialized in the default case, after that */
				1605	sop newstart = 0; / newstart was initialized in the OCHAR case */
Torok Edwin	e14d4cd	2009-08-30 08:24:09 +0000	[diff] [blame]	1606	sopno newlen;
				1607	sop s;
				1608	char *cp;
				1609	sopno i;
				1610
				1611	/* avoid making error situations worse */
				1612	if (p->error != 0)
				1613	return;
				1614
				1615	/* find the longest OCHAR sequence in strip */
				1616	newlen = 0;
				1617	scan = g->strip + 1;
				1618	do {
				1619	s = *scan++;
				1620	switch (OP(s)) {
				1621	case OCHAR: /* sequence member */
				1622	if (newlen == 0) /* new sequence */
				1623	newstart = scan - 1;
				1624	newlen++;
				1625	break;
				1626	case OPLUS_: /* things that don't break one */
				1627	case OLPAREN:
				1628	case ORPAREN:
				1629	break;
				1630	case OQUEST_: /* things that must be skipped */
				1631	case OCH_:
				1632	scan--;
				1633	do {
				1634	scan += OPND(s);
				1635	s = *scan;
				1636	/* assert() interferes w debug printouts */
				1637	if (OP(s) != O_QUEST && OP(s) != O_CH &&
				1638	OP(s) != OOR2) {
				1639	g->iflags \|= REGEX_BAD;
				1640	return;
				1641	}
				1642	} while (OP(s) != O_QUEST && OP(s) != O_CH);
				1643	/* fallthrough */
				1644	default: /* things that break a sequence */
				1645	if (newlen > g->mlen) { /* ends one */
				1646	start = newstart;
				1647	g->mlen = newlen;
				1648	}
				1649	newlen = 0;
				1650	break;
				1651	}
				1652	} while (OP(s) != OEND);
				1653
				1654	if (g->mlen == 0) /* there isn't one */
				1655	return;
				1656
				1657	/* turn it into a character string */
				1658	g->must = malloc((size_t)g->mlen + 1);
				1659	if (g->must == NULL) { /* argh; just forget it */
				1660	g->mlen = 0;
				1661	return;
				1662	}
				1663	cp = g->must;
				1664	scan = start;
				1665	for (i = g->mlen; i > 0; i--) {
				1666	while (OP(s = *scan++) != OCHAR)
				1667	continue;
				1668	assert(cp < g->must + g->mlen);
				1669	*cp++ = (char)OPND(s);
				1670	}
				1671	assert(cp == g->must + g->mlen);
				1672	cp++ = '\0'; / just on general principles */
				1673	}
				1674
				1675	/*
				1676	- pluscount - count + nesting
				1677	*/
				1678	static sopno /* nesting depth */
				1679	pluscount(struct parse p, struct re_guts g)
				1680	{
				1681	sop *scan;
				1682	sop s;
				1683	sopno plusnest = 0;
				1684	sopno maxnest = 0;
				1685
				1686	if (p->error != 0)
				1687	return(0); /* there may not be an OEND */
				1688
				1689	scan = g->strip + 1;
				1690	do {
				1691	s = *scan++;
				1692	switch (OP(s)) {
				1693	case OPLUS_:
				1694	plusnest++;
				1695	break;
				1696	case O_PLUS:
				1697	if (plusnest > maxnest)
				1698	maxnest = plusnest;
				1699	plusnest--;
				1700	break;
				1701	}
				1702	} while (OP(s) != OEND);
				1703	if (plusnest != 0)
				1704	g->iflags \|= REGEX_BAD;
				1705	return(maxnest);
				1706	}