| Torok Edwin | e14d4cd | 2009-08-30 08:24:09 +0000 | [diff] [blame] | 1 | /*- | 
|  | 2 | * This code is derived from OpenBSD's libc/regex, original license follows: | 
|  | 3 | * | 
|  | 4 | * Copyright (c) 1992, 1993, 1994 Henry Spencer. | 
|  | 5 | * Copyright (c) 1992, 1993, 1994 | 
|  | 6 | *	The Regents of the University of California.  All rights reserved. | 
|  | 7 | * | 
|  | 8 | * This code is derived from software contributed to Berkeley by | 
|  | 9 | * Henry Spencer. | 
|  | 10 | * | 
|  | 11 | * Redistribution and use in source and binary forms, with or without | 
|  | 12 | * modification, are permitted provided that the following conditions | 
|  | 13 | * are met: | 
|  | 14 | * 1. Redistributions of source code must retain the above copyright | 
|  | 15 | *    notice, this list of conditions and the following disclaimer. | 
|  | 16 | * 2. Redistributions in binary form must reproduce the above copyright | 
|  | 17 | *    notice, this list of conditions and the following disclaimer in the | 
|  | 18 | *    documentation and/or other materials provided with the distribution. | 
|  | 19 | * 3. Neither the name of the University nor the names of its contributors | 
|  | 20 | *    may be used to endorse or promote products derived from this software | 
|  | 21 | *    without specific prior written permission. | 
|  | 22 | * | 
|  | 23 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | 
|  | 24 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|  | 25 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
|  | 26 | * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | 
|  | 27 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | 
|  | 28 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | 
|  | 29 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | 
|  | 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | 
|  | 31 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | 
|  | 32 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
|  | 33 | * SUCH DAMAGE. | 
|  | 34 | * | 
|  | 35 | *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94 | 
|  | 36 | */ | 
|  | 37 |  | 
|  | 38 | #include <sys/types.h> | 
|  | 39 | #include <stdio.h> | 
|  | 40 | #include <string.h> | 
|  | 41 | #include <ctype.h> | 
|  | 42 | #include <limits.h> | 
|  | 43 | #include <stdlib.h> | 
|  | 44 | #include "regex_impl.h" | 
|  | 45 |  | 
|  | 46 | #include "regutils.h" | 
|  | 47 | #include "regex2.h" | 
|  | 48 |  | 
|  | 49 | #include "regcclass.h" | 
|  | 50 | #include "regcname.h" | 
|  | 51 |  | 
|  | 52 | /* | 
|  | 53 | * parse structure, passed up and down to avoid global variables and | 
|  | 54 | * other clumsinesses | 
|  | 55 | */ | 
|  | 56 | struct parse { | 
|  | 57 | char *next;		/* next character in RE */ | 
|  | 58 | char *end;		/* end of string (-> NUL normally) */ | 
|  | 59 | int error;		/* has an error been seen? */ | 
|  | 60 | sop *strip;		/* malloced strip */ | 
|  | 61 | sopno ssize;		/* malloced strip size (allocated) */ | 
|  | 62 | sopno slen;		/* malloced strip length (used) */ | 
|  | 63 | int ncsalloc;		/* number of csets allocated */ | 
|  | 64 | struct re_guts *g; | 
|  | 65 | #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */ | 
|  | 66 | sopno pbegin[NPAREN];	/* -> ( ([0] unused) */ | 
|  | 67 | sopno pend[NPAREN];	/* -> ) ([0] unused) */ | 
|  | 68 | }; | 
|  | 69 |  | 
|  | 70 | static void p_ere(struct parse *, int); | 
|  | 71 | static void p_ere_exp(struct parse *); | 
|  | 72 | static void p_str(struct parse *); | 
|  | 73 | static void p_bre(struct parse *, int, int); | 
|  | 74 | static int p_simp_re(struct parse *, int); | 
|  | 75 | static int p_count(struct parse *); | 
|  | 76 | static void p_bracket(struct parse *); | 
|  | 77 | static void p_b_term(struct parse *, cset *); | 
|  | 78 | static void p_b_cclass(struct parse *, cset *); | 
|  | 79 | static void p_b_eclass(struct parse *, cset *); | 
|  | 80 | static char p_b_symbol(struct parse *); | 
|  | 81 | static char p_b_coll_elem(struct parse *, int); | 
|  | 82 | static char othercase(int); | 
|  | 83 | static void bothcases(struct parse *, int); | 
|  | 84 | static void ordinary(struct parse *, int); | 
|  | 85 | static void nonnewline(struct parse *); | 
|  | 86 | static void repeat(struct parse *, sopno, int, int); | 
|  | 87 | static int seterr(struct parse *, int); | 
|  | 88 | static cset *allocset(struct parse *); | 
|  | 89 | static void freeset(struct parse *, cset *); | 
|  | 90 | static int freezeset(struct parse *, cset *); | 
|  | 91 | static int firstch(struct parse *, cset *); | 
|  | 92 | static int nch(struct parse *, cset *); | 
|  | 93 | static void mcadd(struct parse *, cset *, const char *); | 
|  | 94 | static void mcinvert(struct parse *, cset *); | 
|  | 95 | static void mccase(struct parse *, cset *); | 
|  | 96 | static int isinsets(struct re_guts *, int); | 
|  | 97 | static int samesets(struct re_guts *, int, int); | 
|  | 98 | static void categorize(struct parse *, struct re_guts *); | 
|  | 99 | static sopno dupl(struct parse *, sopno, sopno); | 
|  | 100 | static void doemit(struct parse *, sop, size_t); | 
|  | 101 | static void doinsert(struct parse *, sop, size_t, sopno); | 
|  | 102 | static void dofwd(struct parse *, sopno, sop); | 
|  | 103 | static void enlarge(struct parse *, sopno); | 
|  | 104 | static void stripsnug(struct parse *, struct re_guts *); | 
|  | 105 | static void findmust(struct parse *, struct re_guts *); | 
|  | 106 | static sopno pluscount(struct parse *, struct re_guts *); | 
|  | 107 |  | 
|  | 108 | static char nuls[10];		/* place to point scanner in event of error */ | 
|  | 109 |  | 
|  | 110 | /* | 
|  | 111 | * macros for use with parse structure | 
|  | 112 | * BEWARE:  these know that the parse structure is named `p' !!! | 
|  | 113 | */ | 
|  | 114 | #define	PEEK()	(*p->next) | 
|  | 115 | #define	PEEK2()	(*(p->next+1)) | 
|  | 116 | #define	MORE()	(p->next < p->end) | 
|  | 117 | #define	MORE2()	(p->next+1 < p->end) | 
|  | 118 | #define	SEE(c)	(MORE() && PEEK() == (c)) | 
|  | 119 | #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) | 
|  | 120 | #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0) | 
|  | 121 | #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0) | 
|  | 122 | #define	NEXT()	(p->next++) | 
|  | 123 | #define	NEXT2()	(p->next += 2) | 
|  | 124 | #define	NEXTn(n)	(p->next += (n)) | 
|  | 125 | #define	GETNEXT()	(*p->next++) | 
|  | 126 | #define	SETERROR(e)	seterr(p, (e)) | 
|  | 127 | #define	REQUIRE(co, e)	(void)((co) || SETERROR(e)) | 
|  | 128 | #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e)) | 
|  | 129 | #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e)) | 
|  | 130 | #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e)) | 
|  | 131 | #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd)) | 
|  | 132 | #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos) | 
|  | 133 | #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos)) | 
|  | 134 | #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos) | 
|  | 135 | #define	HERE()		(p->slen) | 
|  | 136 | #define	THERE()		(p->slen - 1) | 
|  | 137 | #define	THERETHERE()	(p->slen - 2) | 
|  | 138 | #define	DROP(n)	(p->slen -= (n)) | 
|  | 139 |  | 
|  | 140 | #ifdef	_POSIX2_RE_DUP_MAX | 
|  | 141 | #define	DUPMAX	_POSIX2_RE_DUP_MAX | 
|  | 142 | #else | 
|  | 143 | #define	DUPMAX	255 | 
|  | 144 | #endif | 
| Benjamin Kramer | 2b37efa | 2009-09-06 12:26:28 +0000 | [diff] [blame] | 145 | #define	INFINITY	(DUPMAX + 1) | 
| Torok Edwin | e14d4cd | 2009-08-30 08:24:09 +0000 | [diff] [blame] | 146 |  | 
|  | 147 | #ifndef NDEBUG | 
|  | 148 | static int never = 0;		/* for use in asserts; shuts lint up */ | 
|  | 149 | #else | 
|  | 150 | #define	never	0		/* some <assert.h>s have bugs too */ | 
|  | 151 | #endif | 
|  | 152 |  | 
|  | 153 | /* | 
|  | 154 | - llvm_regcomp - interface for parser and compilation | 
|  | 155 | */ | 
|  | 156 | int				/* 0 success, otherwise REG_something */ | 
|  | 157 | llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) | 
|  | 158 | { | 
|  | 159 | struct parse pa; | 
|  | 160 | struct re_guts *g; | 
|  | 161 | struct parse *p = &pa; | 
|  | 162 | int i; | 
|  | 163 | size_t len; | 
|  | 164 | #ifdef REDEBUG | 
|  | 165 | #	define	GOODFLAGS(f)	(f) | 
|  | 166 | #else | 
|  | 167 | #	define	GOODFLAGS(f)	((f)&~REG_DUMP) | 
|  | 168 | #endif | 
|  | 169 |  | 
|  | 170 | cflags = GOODFLAGS(cflags); | 
|  | 171 | if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) | 
|  | 172 | return(REG_INVARG); | 
|  | 173 |  | 
|  | 174 | if (cflags®_PEND) { | 
|  | 175 | if (preg->re_endp < pattern) | 
|  | 176 | return(REG_INVARG); | 
|  | 177 | len = preg->re_endp - pattern; | 
|  | 178 | } else | 
|  | 179 | len = strlen((const char *)pattern); | 
|  | 180 |  | 
|  | 181 | /* do the mallocs early so failure handling is easy */ | 
|  | 182 | g = (struct re_guts *)malloc(sizeof(struct re_guts) + | 
|  | 183 | (NC-1)*sizeof(cat_t)); | 
|  | 184 | if (g == NULL) | 
|  | 185 | return(REG_ESPACE); | 
|  | 186 | p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */ | 
|  | 187 | p->strip = (sop *)calloc(p->ssize, sizeof(sop)); | 
|  | 188 | p->slen = 0; | 
|  | 189 | if (p->strip == NULL) { | 
|  | 190 | free((char *)g); | 
|  | 191 | return(REG_ESPACE); | 
|  | 192 | } | 
|  | 193 |  | 
|  | 194 | /* set things up */ | 
|  | 195 | p->g = g; | 
|  | 196 | p->next = (char *)pattern;	/* convenience; we do not modify it */ | 
|  | 197 | p->end = p->next + len; | 
|  | 198 | p->error = 0; | 
|  | 199 | p->ncsalloc = 0; | 
|  | 200 | for (i = 0; i < NPAREN; i++) { | 
|  | 201 | p->pbegin[i] = 0; | 
|  | 202 | p->pend[i] = 0; | 
|  | 203 | } | 
|  | 204 | g->csetsize = NC; | 
|  | 205 | g->sets = NULL; | 
|  | 206 | g->setbits = NULL; | 
|  | 207 | g->ncsets = 0; | 
|  | 208 | g->cflags = cflags; | 
|  | 209 | g->iflags = 0; | 
|  | 210 | g->nbol = 0; | 
|  | 211 | g->neol = 0; | 
|  | 212 | g->must = NULL; | 
|  | 213 | g->mlen = 0; | 
|  | 214 | g->nsub = 0; | 
|  | 215 | g->ncategories = 1;	/* category 0 is "everything else" */ | 
|  | 216 | g->categories = &g->catspace[-(CHAR_MIN)]; | 
|  | 217 | (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); | 
|  | 218 | g->backrefs = 0; | 
|  | 219 |  | 
|  | 220 | /* do it */ | 
|  | 221 | EMIT(OEND, 0); | 
|  | 222 | g->firststate = THERE(); | 
|  | 223 | if (cflags®_EXTENDED) | 
|  | 224 | p_ere(p, OUT); | 
|  | 225 | else if (cflags®_NOSPEC) | 
|  | 226 | p_str(p); | 
|  | 227 | else | 
|  | 228 | p_bre(p, OUT, OUT); | 
|  | 229 | EMIT(OEND, 0); | 
|  | 230 | g->laststate = THERE(); | 
|  | 231 |  | 
|  | 232 | /* tidy up loose ends and fill things in */ | 
|  | 233 | categorize(p, g); | 
|  | 234 | stripsnug(p, g); | 
|  | 235 | findmust(p, g); | 
|  | 236 | g->nplus = pluscount(p, g); | 
|  | 237 | g->magic = MAGIC2; | 
|  | 238 | preg->re_nsub = g->nsub; | 
|  | 239 | preg->re_g = g; | 
|  | 240 | preg->re_magic = MAGIC1; | 
|  | 241 | #ifndef REDEBUG | 
|  | 242 | /* not debugging, so can't rely on the assert() in llvm_regexec() */ | 
|  | 243 | if (g->iflags®EX_BAD) | 
|  | 244 | SETERROR(REG_ASSERT); | 
|  | 245 | #endif | 
|  | 246 |  | 
|  | 247 | /* win or lose, we're done */ | 
|  | 248 | if (p->error != 0)	/* lose */ | 
|  | 249 | llvm_regfree(preg); | 
|  | 250 | return(p->error); | 
|  | 251 | } | 
|  | 252 |  | 
|  | 253 | /* | 
|  | 254 | - p_ere - ERE parser top level, concatenation and alternation | 
|  | 255 | */ | 
|  | 256 | static void | 
|  | 257 | p_ere(struct parse *p, int stop)	/* character this ERE should end at */ | 
|  | 258 | { | 
|  | 259 | char c; | 
| Daniel Dunbar | 6e4ed8c | 2009-09-08 16:14:54 +0000 | [diff] [blame] | 260 | sopno prevback = 0; | 
|  | 261 | sopno prevfwd = 0; | 
| Torok Edwin | e14d4cd | 2009-08-30 08:24:09 +0000 | [diff] [blame] | 262 | sopno conc; | 
|  | 263 | int first = 1;		/* is this the first alternative? */ | 
|  | 264 |  | 
|  | 265 | for (;;) { | 
|  | 266 | /* do a bunch of concatenated expressions */ | 
|  | 267 | conc = HERE(); | 
|  | 268 | while (MORE() && (c = PEEK()) != '|' && c != stop) | 
|  | 269 | p_ere_exp(p); | 
|  | 270 | REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */ | 
|  | 271 |  | 
|  | 272 | if (!EAT('|')) | 
|  | 273 | break;		/* NOTE BREAK OUT */ | 
|  | 274 |  | 
|  | 275 | if (first) { | 
|  | 276 | INSERT(OCH_, conc);	/* offset is wrong */ | 
|  | 277 | prevfwd = conc; | 
|  | 278 | prevback = conc; | 
|  | 279 | first = 0; | 
|  | 280 | } | 
|  | 281 | ASTERN(OOR1, prevback); | 
|  | 282 | prevback = THERE(); | 
|  | 283 | AHEAD(prevfwd);			/* fix previous offset */ | 
|  | 284 | prevfwd = HERE(); | 
|  | 285 | EMIT(OOR2, 0);			/* offset is very wrong */ | 
|  | 286 | } | 
|  | 287 |  | 
|  | 288 | if (!first) {		/* tail-end fixups */ | 
|  | 289 | AHEAD(prevfwd); | 
|  | 290 | ASTERN(O_CH, prevback); | 
|  | 291 | } | 
|  | 292 |  | 
|  | 293 | assert(!MORE() || SEE(stop)); | 
|  | 294 | } | 
|  | 295 |  | 
|  | 296 | /* | 
|  | 297 | - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op | 
|  | 298 | */ | 
|  | 299 | static void | 
|  | 300 | p_ere_exp(struct parse *p) | 
|  | 301 | { | 
|  | 302 | char c; | 
|  | 303 | sopno pos; | 
|  | 304 | int count; | 
|  | 305 | int count2; | 
|  | 306 | sopno subno; | 
|  | 307 | int wascaret = 0; | 
|  | 308 |  | 
|  | 309 | assert(MORE());		/* caller should have ensured this */ | 
|  | 310 | c = GETNEXT(); | 
|  | 311 |  | 
|  | 312 | pos = HERE(); | 
|  | 313 | switch (c) { | 
|  | 314 | case '(': | 
|  | 315 | REQUIRE(MORE(), REG_EPAREN); | 
|  | 316 | p->g->nsub++; | 
|  | 317 | subno = p->g->nsub; | 
|  | 318 | if (subno < NPAREN) | 
|  | 319 | p->pbegin[subno] = HERE(); | 
|  | 320 | EMIT(OLPAREN, subno); | 
|  | 321 | if (!SEE(')')) | 
|  | 322 | p_ere(p, ')'); | 
|  | 323 | if (subno < NPAREN) { | 
|  | 324 | p->pend[subno] = HERE(); | 
|  | 325 | assert(p->pend[subno] != 0); | 
|  | 326 | } | 
|  | 327 | EMIT(ORPAREN, subno); | 
|  | 328 | MUSTEAT(')', REG_EPAREN); | 
|  | 329 | break; | 
|  | 330 | #ifndef POSIX_MISTAKE | 
|  | 331 | case ')':		/* happens only if no current unmatched ( */ | 
|  | 332 | /* | 
|  | 333 | * You may ask, why the ifndef?  Because I didn't notice | 
|  | 334 | * this until slightly too late for 1003.2, and none of the | 
|  | 335 | * other 1003.2 regular-expression reviewers noticed it at | 
|  | 336 | * all.  So an unmatched ) is legal POSIX, at least until | 
|  | 337 | * we can get it fixed. | 
|  | 338 | */ | 
|  | 339 | SETERROR(REG_EPAREN); | 
|  | 340 | break; | 
|  | 341 | #endif | 
|  | 342 | case '^': | 
|  | 343 | EMIT(OBOL, 0); | 
|  | 344 | p->g->iflags |= USEBOL; | 
|  | 345 | p->g->nbol++; | 
|  | 346 | wascaret = 1; | 
|  | 347 | break; | 
|  | 348 | case '$': | 
|  | 349 | EMIT(OEOL, 0); | 
|  | 350 | p->g->iflags |= USEEOL; | 
|  | 351 | p->g->neol++; | 
|  | 352 | break; | 
|  | 353 | case '|': | 
|  | 354 | SETERROR(REG_EMPTY); | 
|  | 355 | break; | 
|  | 356 | case '*': | 
|  | 357 | case '+': | 
|  | 358 | case '?': | 
|  | 359 | SETERROR(REG_BADRPT); | 
|  | 360 | break; | 
|  | 361 | case '.': | 
|  | 362 | if (p->g->cflags®_NEWLINE) | 
|  | 363 | nonnewline(p); | 
|  | 364 | else | 
|  | 365 | EMIT(OANY, 0); | 
|  | 366 | break; | 
|  | 367 | case '[': | 
|  | 368 | p_bracket(p); | 
|  | 369 | break; | 
|  | 370 | case '\\': | 
|  | 371 | REQUIRE(MORE(), REG_EESCAPE); | 
|  | 372 | c = GETNEXT(); | 
|  | 373 | ordinary(p, c); | 
|  | 374 | break; | 
|  | 375 | case '{':		/* okay as ordinary except if digit follows */ | 
|  | 376 | REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); | 
|  | 377 | /* FALLTHROUGH */ | 
|  | 378 | default: | 
|  | 379 | ordinary(p, c); | 
|  | 380 | break; | 
|  | 381 | } | 
|  | 382 |  | 
|  | 383 | if (!MORE()) | 
|  | 384 | return; | 
|  | 385 | c = PEEK(); | 
|  | 386 | /* we call { a repetition if followed by a digit */ | 
|  | 387 | if (!( c == '*' || c == '+' || c == '?' || | 
|  | 388 | (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) | 
|  | 389 | return;		/* no repetition, we're done */ | 
|  | 390 | NEXT(); | 
|  | 391 |  | 
|  | 392 | REQUIRE(!wascaret, REG_BADRPT); | 
|  | 393 | switch (c) { | 
|  | 394 | case '*':	/* implemented as +? */ | 
|  | 395 | /* this case does not require the (y|) trick, noKLUDGE */ | 
|  | 396 | INSERT(OPLUS_, pos); | 
|  | 397 | ASTERN(O_PLUS, pos); | 
|  | 398 | INSERT(OQUEST_, pos); | 
|  | 399 | ASTERN(O_QUEST, pos); | 
|  | 400 | break; | 
|  | 401 | case '+': | 
|  | 402 | INSERT(OPLUS_, pos); | 
|  | 403 | ASTERN(O_PLUS, pos); | 
|  | 404 | break; | 
|  | 405 | case '?': | 
|  | 406 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | 
|  | 407 | INSERT(OCH_, pos);		/* offset slightly wrong */ | 
|  | 408 | ASTERN(OOR1, pos);		/* this one's right */ | 
|  | 409 | AHEAD(pos);			/* fix the OCH_ */ | 
|  | 410 | EMIT(OOR2, 0);			/* offset very wrong... */ | 
|  | 411 | AHEAD(THERE());			/* ...so fix it */ | 
|  | 412 | ASTERN(O_CH, THERETHERE()); | 
|  | 413 | break; | 
|  | 414 | case '{': | 
|  | 415 | count = p_count(p); | 
|  | 416 | if (EAT(',')) { | 
|  | 417 | if (isdigit((uch)PEEK())) { | 
|  | 418 | count2 = p_count(p); | 
|  | 419 | REQUIRE(count <= count2, REG_BADBR); | 
|  | 420 | } else		/* single number with comma */ | 
|  | 421 | count2 = INFINITY; | 
|  | 422 | } else		/* just a single number */ | 
|  | 423 | count2 = count; | 
|  | 424 | repeat(p, pos, count, count2); | 
|  | 425 | if (!EAT('}')) {	/* error heuristics */ | 
|  | 426 | while (MORE() && PEEK() != '}') | 
|  | 427 | NEXT(); | 
|  | 428 | REQUIRE(MORE(), REG_EBRACE); | 
|  | 429 | SETERROR(REG_BADBR); | 
|  | 430 | } | 
|  | 431 | break; | 
|  | 432 | } | 
|  | 433 |  | 
|  | 434 | if (!MORE()) | 
|  | 435 | return; | 
|  | 436 | c = PEEK(); | 
|  | 437 | if (!( c == '*' || c == '+' || c == '?' || | 
|  | 438 | (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) | 
|  | 439 | return; | 
|  | 440 | SETERROR(REG_BADRPT); | 
|  | 441 | } | 
|  | 442 |  | 
|  | 443 | /* | 
|  | 444 | - p_str - string (no metacharacters) "parser" | 
|  | 445 | */ | 
|  | 446 | static void | 
|  | 447 | p_str(struct parse *p) | 
|  | 448 | { | 
|  | 449 | REQUIRE(MORE(), REG_EMPTY); | 
|  | 450 | while (MORE()) | 
|  | 451 | ordinary(p, GETNEXT()); | 
|  | 452 | } | 
|  | 453 |  | 
|  | 454 | /* | 
|  | 455 | - p_bre - BRE parser top level, anchoring and concatenation | 
|  | 456 | * Giving end1 as OUT essentially eliminates the end1/end2 check. | 
|  | 457 | * | 
|  | 458 | * This implementation is a bit of a kludge, in that a trailing $ is first | 
|  | 459 | * taken as an ordinary character and then revised to be an anchor.  The | 
|  | 460 | * only undesirable side effect is that '$' gets included as a character | 
|  | 461 | * category in such cases.  This is fairly harmless; not worth fixing. | 
|  | 462 | * The amount of lookahead needed to avoid this kludge is excessive. | 
|  | 463 | */ | 
|  | 464 | static void | 
|  | 465 | p_bre(struct parse *p, | 
|  | 466 | int end1,		/* first terminating character */ | 
|  | 467 | int end2)		/* second terminating character */ | 
|  | 468 | { | 
|  | 469 | sopno start = HERE(); | 
|  | 470 | int first = 1;			/* first subexpression? */ | 
|  | 471 | int wasdollar = 0; | 
|  | 472 |  | 
|  | 473 | if (EAT('^')) { | 
|  | 474 | EMIT(OBOL, 0); | 
|  | 475 | p->g->iflags |= USEBOL; | 
|  | 476 | p->g->nbol++; | 
|  | 477 | } | 
|  | 478 | while (MORE() && !SEETWO(end1, end2)) { | 
|  | 479 | wasdollar = p_simp_re(p, first); | 
|  | 480 | first = 0; | 
|  | 481 | } | 
|  | 482 | if (wasdollar) {	/* oops, that was a trailing anchor */ | 
|  | 483 | DROP(1); | 
|  | 484 | EMIT(OEOL, 0); | 
|  | 485 | p->g->iflags |= USEEOL; | 
|  | 486 | p->g->neol++; | 
|  | 487 | } | 
|  | 488 |  | 
|  | 489 | REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */ | 
|  | 490 | } | 
|  | 491 |  | 
|  | 492 | /* | 
|  | 493 | - p_simp_re - parse a simple RE, an atom possibly followed by a repetition | 
|  | 494 | */ | 
|  | 495 | static int			/* was the simple RE an unbackslashed $? */ | 
|  | 496 | p_simp_re(struct parse *p, | 
|  | 497 | int starordinary)		/* is a leading * an ordinary character? */ | 
|  | 498 | { | 
|  | 499 | int c; | 
|  | 500 | int count; | 
|  | 501 | int count2; | 
|  | 502 | sopno pos; | 
|  | 503 | int i; | 
|  | 504 | sopno subno; | 
|  | 505 | #	define	BACKSL	(1<<CHAR_BIT) | 
|  | 506 |  | 
|  | 507 | pos = HERE();		/* repetion op, if any, covers from here */ | 
|  | 508 |  | 
|  | 509 | assert(MORE());		/* caller should have ensured this */ | 
|  | 510 | c = GETNEXT(); | 
|  | 511 | if (c == '\\') { | 
|  | 512 | REQUIRE(MORE(), REG_EESCAPE); | 
|  | 513 | c = BACKSL | GETNEXT(); | 
|  | 514 | } | 
|  | 515 | switch (c) { | 
|  | 516 | case '.': | 
|  | 517 | if (p->g->cflags®_NEWLINE) | 
|  | 518 | nonnewline(p); | 
|  | 519 | else | 
|  | 520 | EMIT(OANY, 0); | 
|  | 521 | break; | 
|  | 522 | case '[': | 
|  | 523 | p_bracket(p); | 
|  | 524 | break; | 
|  | 525 | case BACKSL|'{': | 
|  | 526 | SETERROR(REG_BADRPT); | 
|  | 527 | break; | 
|  | 528 | case BACKSL|'(': | 
|  | 529 | p->g->nsub++; | 
|  | 530 | subno = p->g->nsub; | 
|  | 531 | if (subno < NPAREN) | 
|  | 532 | p->pbegin[subno] = HERE(); | 
|  | 533 | EMIT(OLPAREN, subno); | 
|  | 534 | /* the MORE here is an error heuristic */ | 
|  | 535 | if (MORE() && !SEETWO('\\', ')')) | 
|  | 536 | p_bre(p, '\\', ')'); | 
|  | 537 | if (subno < NPAREN) { | 
|  | 538 | p->pend[subno] = HERE(); | 
|  | 539 | assert(p->pend[subno] != 0); | 
|  | 540 | } | 
|  | 541 | EMIT(ORPAREN, subno); | 
|  | 542 | REQUIRE(EATTWO('\\', ')'), REG_EPAREN); | 
|  | 543 | break; | 
|  | 544 | case BACKSL|')':	/* should not get here -- must be user */ | 
|  | 545 | case BACKSL|'}': | 
|  | 546 | SETERROR(REG_EPAREN); | 
|  | 547 | break; | 
|  | 548 | case BACKSL|'1': | 
|  | 549 | case BACKSL|'2': | 
|  | 550 | case BACKSL|'3': | 
|  | 551 | case BACKSL|'4': | 
|  | 552 | case BACKSL|'5': | 
|  | 553 | case BACKSL|'6': | 
|  | 554 | case BACKSL|'7': | 
|  | 555 | case BACKSL|'8': | 
|  | 556 | case BACKSL|'9': | 
|  | 557 | i = (c&~BACKSL) - '0'; | 
|  | 558 | assert(i < NPAREN); | 
|  | 559 | if (p->pend[i] != 0) { | 
|  | 560 | assert(i <= p->g->nsub); | 
|  | 561 | EMIT(OBACK_, i); | 
|  | 562 | assert(p->pbegin[i] != 0); | 
|  | 563 | assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); | 
|  | 564 | assert(OP(p->strip[p->pend[i]]) == ORPAREN); | 
|  | 565 | (void) dupl(p, p->pbegin[i]+1, p->pend[i]); | 
|  | 566 | EMIT(O_BACK, i); | 
|  | 567 | } else | 
|  | 568 | SETERROR(REG_ESUBREG); | 
|  | 569 | p->g->backrefs = 1; | 
|  | 570 | break; | 
|  | 571 | case '*': | 
|  | 572 | REQUIRE(starordinary, REG_BADRPT); | 
|  | 573 | /* FALLTHROUGH */ | 
|  | 574 | default: | 
|  | 575 | ordinary(p, (char)c); | 
|  | 576 | break; | 
|  | 577 | } | 
|  | 578 |  | 
|  | 579 | if (EAT('*')) {		/* implemented as +? */ | 
|  | 580 | /* this case does not require the (y|) trick, noKLUDGE */ | 
|  | 581 | INSERT(OPLUS_, pos); | 
|  | 582 | ASTERN(O_PLUS, pos); | 
|  | 583 | INSERT(OQUEST_, pos); | 
|  | 584 | ASTERN(O_QUEST, pos); | 
|  | 585 | } else if (EATTWO('\\', '{')) { | 
|  | 586 | count = p_count(p); | 
|  | 587 | if (EAT(',')) { | 
|  | 588 | if (MORE() && isdigit((uch)PEEK())) { | 
|  | 589 | count2 = p_count(p); | 
|  | 590 | REQUIRE(count <= count2, REG_BADBR); | 
|  | 591 | } else		/* single number with comma */ | 
|  | 592 | count2 = INFINITY; | 
|  | 593 | } else		/* just a single number */ | 
|  | 594 | count2 = count; | 
|  | 595 | repeat(p, pos, count, count2); | 
|  | 596 | if (!EATTWO('\\', '}')) {	/* error heuristics */ | 
|  | 597 | while (MORE() && !SEETWO('\\', '}')) | 
|  | 598 | NEXT(); | 
|  | 599 | REQUIRE(MORE(), REG_EBRACE); | 
|  | 600 | SETERROR(REG_BADBR); | 
|  | 601 | } | 
|  | 602 | } else if (c == '$')	/* $ (but not \$) ends it */ | 
|  | 603 | return(1); | 
|  | 604 |  | 
|  | 605 | return(0); | 
|  | 606 | } | 
|  | 607 |  | 
|  | 608 | /* | 
|  | 609 | - p_count - parse a repetition count | 
|  | 610 | */ | 
|  | 611 | static int			/* the value */ | 
|  | 612 | p_count(struct parse *p) | 
|  | 613 | { | 
|  | 614 | int count = 0; | 
|  | 615 | int ndigits = 0; | 
|  | 616 |  | 
|  | 617 | while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { | 
|  | 618 | count = count*10 + (GETNEXT() - '0'); | 
|  | 619 | ndigits++; | 
|  | 620 | } | 
|  | 621 |  | 
|  | 622 | REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); | 
|  | 623 | return(count); | 
|  | 624 | } | 
|  | 625 |  | 
|  | 626 | /* | 
|  | 627 | - p_bracket - parse a bracketed character list | 
|  | 628 | * | 
|  | 629 | * Note a significant property of this code:  if the allocset() did SETERROR, | 
|  | 630 | * no set operations are done. | 
|  | 631 | */ | 
|  | 632 | static void | 
|  | 633 | p_bracket(struct parse *p) | 
|  | 634 | { | 
|  | 635 | cset *cs; | 
|  | 636 | int invert = 0; | 
|  | 637 |  | 
|  | 638 | /* Dept of Truly Sickening Special-Case Kludges */ | 
|  | 639 | if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { | 
|  | 640 | EMIT(OBOW, 0); | 
|  | 641 | NEXTn(6); | 
|  | 642 | return; | 
|  | 643 | } | 
|  | 644 | if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { | 
|  | 645 | EMIT(OEOW, 0); | 
|  | 646 | NEXTn(6); | 
|  | 647 | return; | 
|  | 648 | } | 
|  | 649 |  | 
|  | 650 | if ((cs = allocset(p)) == NULL) { | 
|  | 651 | /* allocset did set error status in p */ | 
|  | 652 | return; | 
|  | 653 | } | 
|  | 654 |  | 
|  | 655 | if (EAT('^')) | 
|  | 656 | invert++;	/* make note to invert set at end */ | 
|  | 657 | if (EAT(']')) | 
|  | 658 | CHadd(cs, ']'); | 
|  | 659 | else if (EAT('-')) | 
|  | 660 | CHadd(cs, '-'); | 
|  | 661 | while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) | 
|  | 662 | p_b_term(p, cs); | 
|  | 663 | if (EAT('-')) | 
|  | 664 | CHadd(cs, '-'); | 
|  | 665 | MUSTEAT(']', REG_EBRACK); | 
|  | 666 |  | 
|  | 667 | if (p->error != 0) {	/* don't mess things up further */ | 
|  | 668 | freeset(p, cs); | 
|  | 669 | return; | 
|  | 670 | } | 
|  | 671 |  | 
|  | 672 | if (p->g->cflags®_ICASE) { | 
|  | 673 | int i; | 
|  | 674 | int ci; | 
|  | 675 |  | 
|  | 676 | for (i = p->g->csetsize - 1; i >= 0; i--) | 
|  | 677 | if (CHIN(cs, i) && isalpha(i)) { | 
|  | 678 | ci = othercase(i); | 
|  | 679 | if (ci != i) | 
|  | 680 | CHadd(cs, ci); | 
|  | 681 | } | 
|  | 682 | if (cs->multis != NULL) | 
|  | 683 | mccase(p, cs); | 
|  | 684 | } | 
|  | 685 | if (invert) { | 
|  | 686 | int i; | 
|  | 687 |  | 
|  | 688 | for (i = p->g->csetsize - 1; i >= 0; i--) | 
|  | 689 | if (CHIN(cs, i)) | 
|  | 690 | CHsub(cs, i); | 
|  | 691 | else | 
|  | 692 | CHadd(cs, i); | 
|  | 693 | if (p->g->cflags®_NEWLINE) | 
|  | 694 | CHsub(cs, '\n'); | 
|  | 695 | if (cs->multis != NULL) | 
|  | 696 | mcinvert(p, cs); | 
|  | 697 | } | 
|  | 698 |  | 
|  | 699 | assert(cs->multis == NULL);		/* xxx */ | 
|  | 700 |  | 
|  | 701 | if (nch(p, cs) == 1) {		/* optimize singleton sets */ | 
|  | 702 | ordinary(p, firstch(p, cs)); | 
|  | 703 | freeset(p, cs); | 
|  | 704 | } else | 
|  | 705 | EMIT(OANYOF, freezeset(p, cs)); | 
|  | 706 | } | 
|  | 707 |  | 
|  | 708 | /* | 
|  | 709 | - p_b_term - parse one term of a bracketed character list | 
|  | 710 | */ | 
|  | 711 | static void | 
|  | 712 | p_b_term(struct parse *p, cset *cs) | 
|  | 713 | { | 
|  | 714 | char c; | 
|  | 715 | char start, finish; | 
|  | 716 | int i; | 
|  | 717 |  | 
|  | 718 | /* classify what we've got */ | 
|  | 719 | switch ((MORE()) ? PEEK() : '\0') { | 
|  | 720 | case '[': | 
|  | 721 | c = (MORE2()) ? PEEK2() : '\0'; | 
|  | 722 | break; | 
|  | 723 | case '-': | 
|  | 724 | SETERROR(REG_ERANGE); | 
|  | 725 | return;			/* NOTE RETURN */ | 
|  | 726 | break; | 
|  | 727 | default: | 
|  | 728 | c = '\0'; | 
|  | 729 | break; | 
|  | 730 | } | 
|  | 731 |  | 
|  | 732 | switch (c) { | 
|  | 733 | case ':':		/* character class */ | 
|  | 734 | NEXT2(); | 
|  | 735 | REQUIRE(MORE(), REG_EBRACK); | 
|  | 736 | c = PEEK(); | 
|  | 737 | REQUIRE(c != '-' && c != ']', REG_ECTYPE); | 
|  | 738 | p_b_cclass(p, cs); | 
|  | 739 | REQUIRE(MORE(), REG_EBRACK); | 
|  | 740 | REQUIRE(EATTWO(':', ']'), REG_ECTYPE); | 
|  | 741 | break; | 
|  | 742 | case '=':		/* equivalence class */ | 
|  | 743 | NEXT2(); | 
|  | 744 | REQUIRE(MORE(), REG_EBRACK); | 
|  | 745 | c = PEEK(); | 
|  | 746 | REQUIRE(c != '-' && c != ']', REG_ECOLLATE); | 
|  | 747 | p_b_eclass(p, cs); | 
|  | 748 | REQUIRE(MORE(), REG_EBRACK); | 
|  | 749 | REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); | 
|  | 750 | break; | 
|  | 751 | default:		/* symbol, ordinary character, or range */ | 
|  | 752 | /* xxx revision needed for multichar stuff */ | 
|  | 753 | start = p_b_symbol(p); | 
|  | 754 | if (SEE('-') && MORE2() && PEEK2() != ']') { | 
|  | 755 | /* range */ | 
|  | 756 | NEXT(); | 
|  | 757 | if (EAT('-')) | 
|  | 758 | finish = '-'; | 
|  | 759 | else | 
|  | 760 | finish = p_b_symbol(p); | 
|  | 761 | } else | 
|  | 762 | finish = start; | 
|  | 763 | /* xxx what about signed chars here... */ | 
|  | 764 | REQUIRE(start <= finish, REG_ERANGE); | 
|  | 765 | for (i = start; i <= finish; i++) | 
|  | 766 | CHadd(cs, i); | 
|  | 767 | break; | 
|  | 768 | } | 
|  | 769 | } | 
|  | 770 |  | 
|  | 771 | /* | 
|  | 772 | - p_b_cclass - parse a character-class name and deal with it | 
|  | 773 | */ | 
|  | 774 | static void | 
|  | 775 | p_b_cclass(struct parse *p, cset *cs) | 
|  | 776 | { | 
|  | 777 | char *sp = p->next; | 
|  | 778 | struct cclass *cp; | 
|  | 779 | size_t len; | 
|  | 780 | const char *u; | 
|  | 781 | char c; | 
|  | 782 |  | 
|  | 783 | while (MORE() && isalpha(PEEK())) | 
|  | 784 | NEXT(); | 
|  | 785 | len = p->next - sp; | 
|  | 786 | for (cp = cclasses; cp->name != NULL; cp++) | 
|  | 787 | if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') | 
|  | 788 | break; | 
|  | 789 | if (cp->name == NULL) { | 
|  | 790 | /* oops, didn't find it */ | 
|  | 791 | SETERROR(REG_ECTYPE); | 
|  | 792 | return; | 
|  | 793 | } | 
|  | 794 |  | 
|  | 795 | u = cp->chars; | 
|  | 796 | while ((c = *u++) != '\0') | 
|  | 797 | CHadd(cs, c); | 
|  | 798 | for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) | 
|  | 799 | MCadd(p, cs, u); | 
|  | 800 | } | 
|  | 801 |  | 
|  | 802 | /* | 
|  | 803 | - p_b_eclass - parse an equivalence-class name and deal with it | 
|  | 804 | * | 
|  | 805 | * This implementation is incomplete. xxx | 
|  | 806 | */ | 
|  | 807 | static void | 
|  | 808 | p_b_eclass(struct parse *p, cset *cs) | 
|  | 809 | { | 
|  | 810 | char c; | 
|  | 811 |  | 
|  | 812 | c = p_b_coll_elem(p, '='); | 
|  | 813 | CHadd(cs, c); | 
|  | 814 | } | 
|  | 815 |  | 
|  | 816 | /* | 
|  | 817 | - p_b_symbol - parse a character or [..]ed multicharacter collating symbol | 
|  | 818 | */ | 
|  | 819 | static char			/* value of symbol */ | 
|  | 820 | p_b_symbol(struct parse *p) | 
|  | 821 | { | 
|  | 822 | char value; | 
|  | 823 |  | 
|  | 824 | REQUIRE(MORE(), REG_EBRACK); | 
|  | 825 | if (!EATTWO('[', '.')) | 
|  | 826 | return(GETNEXT()); | 
|  | 827 |  | 
|  | 828 | /* collating symbol */ | 
|  | 829 | value = p_b_coll_elem(p, '.'); | 
|  | 830 | REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); | 
|  | 831 | return(value); | 
|  | 832 | } | 
|  | 833 |  | 
|  | 834 | /* | 
|  | 835 | - p_b_coll_elem - parse a collating-element name and look it up | 
|  | 836 | */ | 
|  | 837 | static char			/* value of collating element */ | 
|  | 838 | p_b_coll_elem(struct parse *p, | 
|  | 839 | int endc)			/* name ended by endc,']' */ | 
|  | 840 | { | 
|  | 841 | char *sp = p->next; | 
|  | 842 | struct cname *cp; | 
|  | 843 | int len; | 
|  | 844 |  | 
|  | 845 | while (MORE() && !SEETWO(endc, ']')) | 
|  | 846 | NEXT(); | 
|  | 847 | if (!MORE()) { | 
|  | 848 | SETERROR(REG_EBRACK); | 
|  | 849 | return(0); | 
|  | 850 | } | 
|  | 851 | len = p->next - sp; | 
|  | 852 | for (cp = cnames; cp->name != NULL; cp++) | 
|  | 853 | if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') | 
|  | 854 | return(cp->code);	/* known name */ | 
|  | 855 | if (len == 1) | 
|  | 856 | return(*sp);	/* single character */ | 
|  | 857 | SETERROR(REG_ECOLLATE);			/* neither */ | 
|  | 858 | return(0); | 
|  | 859 | } | 
|  | 860 |  | 
|  | 861 | /* | 
|  | 862 | - othercase - return the case counterpart of an alphabetic | 
|  | 863 | */ | 
|  | 864 | static char			/* if no counterpart, return ch */ | 
|  | 865 | othercase(int ch) | 
|  | 866 | { | 
|  | 867 | ch = (uch)ch; | 
|  | 868 | assert(isalpha(ch)); | 
|  | 869 | if (isupper(ch)) | 
|  | 870 | return ((uch)tolower(ch)); | 
|  | 871 | else if (islower(ch)) | 
|  | 872 | return ((uch)toupper(ch)); | 
|  | 873 | else			/* peculiar, but could happen */ | 
|  | 874 | return(ch); | 
|  | 875 | } | 
|  | 876 |  | 
|  | 877 | /* | 
|  | 878 | - bothcases - emit a dualcase version of a two-case character | 
|  | 879 | * | 
|  | 880 | * Boy, is this implementation ever a kludge... | 
|  | 881 | */ | 
|  | 882 | static void | 
|  | 883 | bothcases(struct parse *p, int ch) | 
|  | 884 | { | 
|  | 885 | char *oldnext = p->next; | 
|  | 886 | char *oldend = p->end; | 
|  | 887 | char bracket[3]; | 
|  | 888 |  | 
|  | 889 | ch = (uch)ch; | 
|  | 890 | assert(othercase(ch) != ch);	/* p_bracket() would recurse */ | 
|  | 891 | p->next = bracket; | 
|  | 892 | p->end = bracket+2; | 
|  | 893 | bracket[0] = ch; | 
|  | 894 | bracket[1] = ']'; | 
|  | 895 | bracket[2] = '\0'; | 
|  | 896 | p_bracket(p); | 
|  | 897 | assert(p->next == bracket+2); | 
|  | 898 | p->next = oldnext; | 
|  | 899 | p->end = oldend; | 
|  | 900 | } | 
|  | 901 |  | 
|  | 902 | /* | 
|  | 903 | - ordinary - emit an ordinary character | 
|  | 904 | */ | 
|  | 905 | static void | 
|  | 906 | ordinary(struct parse *p, int ch) | 
|  | 907 | { | 
|  | 908 | cat_t *cap = p->g->categories; | 
|  | 909 |  | 
|  | 910 | if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) | 
|  | 911 | bothcases(p, ch); | 
|  | 912 | else { | 
|  | 913 | EMIT(OCHAR, (uch)ch); | 
|  | 914 | if (cap[ch] == 0) | 
|  | 915 | cap[ch] = p->g->ncategories++; | 
|  | 916 | } | 
|  | 917 | } | 
|  | 918 |  | 
|  | 919 | /* | 
|  | 920 | - nonnewline - emit REG_NEWLINE version of OANY | 
|  | 921 | * | 
|  | 922 | * Boy, is this implementation ever a kludge... | 
|  | 923 | */ | 
|  | 924 | static void | 
|  | 925 | nonnewline(struct parse *p) | 
|  | 926 | { | 
|  | 927 | char *oldnext = p->next; | 
|  | 928 | char *oldend = p->end; | 
|  | 929 | char bracket[4]; | 
|  | 930 |  | 
|  | 931 | p->next = bracket; | 
|  | 932 | p->end = bracket+3; | 
|  | 933 | bracket[0] = '^'; | 
|  | 934 | bracket[1] = '\n'; | 
|  | 935 | bracket[2] = ']'; | 
|  | 936 | bracket[3] = '\0'; | 
|  | 937 | p_bracket(p); | 
|  | 938 | assert(p->next == bracket+3); | 
|  | 939 | p->next = oldnext; | 
|  | 940 | p->end = oldend; | 
|  | 941 | } | 
|  | 942 |  | 
|  | 943 | /* | 
|  | 944 | - repeat - generate code for a bounded repetition, recursively if needed | 
|  | 945 | */ | 
|  | 946 | static void | 
|  | 947 | repeat(struct parse *p, | 
|  | 948 | sopno start,		/* operand from here to end of strip */ | 
|  | 949 | int from,			/* repeated from this number */ | 
|  | 950 | int to)			/* to this number of times (maybe INFINITY) */ | 
|  | 951 | { | 
|  | 952 | sopno finish = HERE(); | 
|  | 953 | #	define	N	2 | 
|  | 954 | #	define	INF	3 | 
|  | 955 | #	define	REP(f, t)	((f)*8 + (t)) | 
|  | 956 | #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) | 
|  | 957 | sopno copy; | 
|  | 958 |  | 
|  | 959 | if (p->error != 0)	/* head off possible runaway recursion */ | 
|  | 960 | return; | 
|  | 961 |  | 
|  | 962 | assert(from <= to); | 
|  | 963 |  | 
|  | 964 | switch (REP(MAP(from), MAP(to))) { | 
|  | 965 | case REP(0, 0):			/* must be user doing this */ | 
|  | 966 | DROP(finish-start);	/* drop the operand */ | 
|  | 967 | break; | 
|  | 968 | case REP(0, 1):			/* as x{1,1}? */ | 
|  | 969 | case REP(0, N):			/* as x{1,n}? */ | 
|  | 970 | case REP(0, INF):		/* as x{1,}? */ | 
|  | 971 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | 
|  | 972 | INSERT(OCH_, start);		/* offset is wrong... */ | 
|  | 973 | repeat(p, start+1, 1, to); | 
|  | 974 | ASTERN(OOR1, start); | 
|  | 975 | AHEAD(start);			/* ... fix it */ | 
|  | 976 | EMIT(OOR2, 0); | 
|  | 977 | AHEAD(THERE()); | 
|  | 978 | ASTERN(O_CH, THERETHERE()); | 
|  | 979 | break; | 
|  | 980 | case REP(1, 1):			/* trivial case */ | 
|  | 981 | /* done */ | 
|  | 982 | break; | 
|  | 983 | case REP(1, N):			/* as x?x{1,n-1} */ | 
|  | 984 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | 
|  | 985 | INSERT(OCH_, start); | 
|  | 986 | ASTERN(OOR1, start); | 
|  | 987 | AHEAD(start); | 
|  | 988 | EMIT(OOR2, 0);			/* offset very wrong... */ | 
|  | 989 | AHEAD(THERE());			/* ...so fix it */ | 
|  | 990 | ASTERN(O_CH, THERETHERE()); | 
|  | 991 | copy = dupl(p, start+1, finish+1); | 
|  | 992 | assert(copy == finish+4); | 
|  | 993 | repeat(p, copy, 1, to-1); | 
|  | 994 | break; | 
|  | 995 | case REP(1, INF):		/* as x+ */ | 
|  | 996 | INSERT(OPLUS_, start); | 
|  | 997 | ASTERN(O_PLUS, start); | 
|  | 998 | break; | 
|  | 999 | case REP(N, N):			/* as xx{m-1,n-1} */ | 
|  | 1000 | copy = dupl(p, start, finish); | 
|  | 1001 | repeat(p, copy, from-1, to-1); | 
|  | 1002 | break; | 
|  | 1003 | case REP(N, INF):		/* as xx{n-1,INF} */ | 
|  | 1004 | copy = dupl(p, start, finish); | 
|  | 1005 | repeat(p, copy, from-1, to); | 
|  | 1006 | break; | 
|  | 1007 | default:			/* "can't happen" */ | 
|  | 1008 | SETERROR(REG_ASSERT);	/* just in case */ | 
|  | 1009 | break; | 
|  | 1010 | } | 
|  | 1011 | } | 
|  | 1012 |  | 
|  | 1013 | /* | 
|  | 1014 | - seterr - set an error condition | 
|  | 1015 | */ | 
|  | 1016 | static int			/* useless but makes type checking happy */ | 
|  | 1017 | seterr(struct parse *p, int e) | 
|  | 1018 | { | 
|  | 1019 | if (p->error == 0)	/* keep earliest error condition */ | 
|  | 1020 | p->error = e; | 
|  | 1021 | p->next = nuls;		/* try to bring things to a halt */ | 
|  | 1022 | p->end = nuls; | 
|  | 1023 | return(0);		/* make the return value well-defined */ | 
|  | 1024 | } | 
|  | 1025 |  | 
|  | 1026 | /* | 
|  | 1027 | - allocset - allocate a set of characters for [] | 
|  | 1028 | */ | 
|  | 1029 | static cset * | 
|  | 1030 | allocset(struct parse *p) | 
|  | 1031 | { | 
|  | 1032 | int no = p->g->ncsets++; | 
|  | 1033 | size_t nc; | 
|  | 1034 | size_t nbytes; | 
|  | 1035 | cset *cs; | 
|  | 1036 | size_t css = (size_t)p->g->csetsize; | 
|  | 1037 | int i; | 
|  | 1038 |  | 
|  | 1039 | if (no >= p->ncsalloc) {	/* need another column of space */ | 
|  | 1040 | void *ptr; | 
|  | 1041 |  | 
|  | 1042 | p->ncsalloc += CHAR_BIT; | 
|  | 1043 | nc = p->ncsalloc; | 
|  | 1044 | assert(nc % CHAR_BIT == 0); | 
|  | 1045 | nbytes = nc / CHAR_BIT * css; | 
|  | 1046 |  | 
|  | 1047 | ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); | 
|  | 1048 | if (ptr == NULL) | 
|  | 1049 | goto nomem; | 
|  | 1050 | p->g->sets = ptr; | 
|  | 1051 |  | 
|  | 1052 | ptr = (uch *)realloc((char *)p->g->setbits, nbytes); | 
|  | 1053 | if (ptr == NULL) | 
|  | 1054 | goto nomem; | 
|  | 1055 | p->g->setbits = ptr; | 
|  | 1056 |  | 
|  | 1057 | for (i = 0; i < no; i++) | 
|  | 1058 | p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); | 
|  | 1059 |  | 
|  | 1060 | (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); | 
|  | 1061 | } | 
|  | 1062 | /* XXX should not happen */ | 
|  | 1063 | if (p->g->sets == NULL || p->g->setbits == NULL) | 
|  | 1064 | goto nomem; | 
|  | 1065 |  | 
|  | 1066 | cs = &p->g->sets[no]; | 
|  | 1067 | cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); | 
|  | 1068 | cs->mask = 1 << ((no) % CHAR_BIT); | 
|  | 1069 | cs->hash = 0; | 
|  | 1070 | cs->smultis = 0; | 
|  | 1071 | cs->multis = NULL; | 
|  | 1072 |  | 
|  | 1073 | return(cs); | 
|  | 1074 | nomem: | 
|  | 1075 | free(p->g->sets); | 
|  | 1076 | p->g->sets = NULL; | 
|  | 1077 | free(p->g->setbits); | 
|  | 1078 | p->g->setbits = NULL; | 
|  | 1079 |  | 
|  | 1080 | SETERROR(REG_ESPACE); | 
|  | 1081 | /* caller's responsibility not to do set ops */ | 
|  | 1082 | return(NULL); | 
|  | 1083 | } | 
|  | 1084 |  | 
|  | 1085 | /* | 
|  | 1086 | - freeset - free a now-unused set | 
|  | 1087 | */ | 
|  | 1088 | static void | 
|  | 1089 | freeset(struct parse *p, cset *cs) | 
|  | 1090 | { | 
|  | 1091 | size_t i; | 
|  | 1092 | cset *top = &p->g->sets[p->g->ncsets]; | 
|  | 1093 | size_t css = (size_t)p->g->csetsize; | 
|  | 1094 |  | 
|  | 1095 | for (i = 0; i < css; i++) | 
|  | 1096 | CHsub(cs, i); | 
|  | 1097 | if (cs == top-1)	/* recover only the easy case */ | 
|  | 1098 | p->g->ncsets--; | 
|  | 1099 | } | 
|  | 1100 |  | 
|  | 1101 | /* | 
|  | 1102 | - freezeset - final processing on a set of characters | 
|  | 1103 | * | 
|  | 1104 | * The main task here is merging identical sets.  This is usually a waste | 
|  | 1105 | * of time (although the hash code minimizes the overhead), but can win | 
|  | 1106 | * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash | 
|  | 1107 | * is done using addition rather than xor -- all ASCII [aA] sets xor to | 
|  | 1108 | * the same value! | 
|  | 1109 | */ | 
|  | 1110 | static int			/* set number */ | 
|  | 1111 | freezeset(struct parse *p, cset *cs) | 
|  | 1112 | { | 
|  | 1113 | uch h = cs->hash; | 
|  | 1114 | size_t i; | 
|  | 1115 | cset *top = &p->g->sets[p->g->ncsets]; | 
|  | 1116 | cset *cs2; | 
|  | 1117 | size_t css = (size_t)p->g->csetsize; | 
|  | 1118 |  | 
|  | 1119 | /* look for an earlier one which is the same */ | 
|  | 1120 | for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) | 
|  | 1121 | if (cs2->hash == h && cs2 != cs) { | 
|  | 1122 | /* maybe */ | 
|  | 1123 | for (i = 0; i < css; i++) | 
|  | 1124 | if (!!CHIN(cs2, i) != !!CHIN(cs, i)) | 
|  | 1125 | break;		/* no */ | 
|  | 1126 | if (i == css) | 
|  | 1127 | break;			/* yes */ | 
|  | 1128 | } | 
|  | 1129 |  | 
|  | 1130 | if (cs2 < top) {	/* found one */ | 
|  | 1131 | freeset(p, cs); | 
|  | 1132 | cs = cs2; | 
|  | 1133 | } | 
|  | 1134 |  | 
|  | 1135 | return((int)(cs - p->g->sets)); | 
|  | 1136 | } | 
|  | 1137 |  | 
|  | 1138 | /* | 
|  | 1139 | - firstch - return first character in a set (which must have at least one) | 
|  | 1140 | */ | 
|  | 1141 | static int			/* character; there is no "none" value */ | 
|  | 1142 | firstch(struct parse *p, cset *cs) | 
|  | 1143 | { | 
|  | 1144 | size_t i; | 
|  | 1145 | size_t css = (size_t)p->g->csetsize; | 
|  | 1146 |  | 
|  | 1147 | for (i = 0; i < css; i++) | 
|  | 1148 | if (CHIN(cs, i)) | 
|  | 1149 | return((char)i); | 
|  | 1150 | assert(never); | 
|  | 1151 | return(0);		/* arbitrary */ | 
|  | 1152 | } | 
|  | 1153 |  | 
|  | 1154 | /* | 
|  | 1155 | - nch - number of characters in a set | 
|  | 1156 | */ | 
|  | 1157 | static int | 
|  | 1158 | nch(struct parse *p, cset *cs) | 
|  | 1159 | { | 
|  | 1160 | size_t i; | 
|  | 1161 | size_t css = (size_t)p->g->csetsize; | 
|  | 1162 | int n = 0; | 
|  | 1163 |  | 
|  | 1164 | for (i = 0; i < css; i++) | 
|  | 1165 | if (CHIN(cs, i)) | 
|  | 1166 | n++; | 
|  | 1167 | return(n); | 
|  | 1168 | } | 
|  | 1169 |  | 
|  | 1170 | /* | 
|  | 1171 | - mcadd - add a collating element to a cset | 
|  | 1172 | */ | 
|  | 1173 | static void | 
|  | 1174 | mcadd( struct parse *p, cset *cs, const char *cp) | 
|  | 1175 | { | 
|  | 1176 | size_t oldend = cs->smultis; | 
|  | 1177 | void *np; | 
|  | 1178 |  | 
|  | 1179 | cs->smultis += strlen(cp) + 1; | 
|  | 1180 | np = realloc(cs->multis, cs->smultis); | 
|  | 1181 | if (np == NULL) { | 
|  | 1182 | if (cs->multis) | 
|  | 1183 | free(cs->multis); | 
|  | 1184 | cs->multis = NULL; | 
|  | 1185 | SETERROR(REG_ESPACE); | 
|  | 1186 | return; | 
|  | 1187 | } | 
|  | 1188 | cs->multis = np; | 
|  | 1189 |  | 
|  | 1190 | llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); | 
|  | 1191 | } | 
|  | 1192 |  | 
|  | 1193 | /* | 
|  | 1194 | - mcinvert - invert the list of collating elements in a cset | 
|  | 1195 | * | 
|  | 1196 | * This would have to know the set of possibilities.  Implementation | 
|  | 1197 | * is deferred. | 
|  | 1198 | */ | 
|  | 1199 | /* ARGSUSED */ | 
|  | 1200 | static void | 
|  | 1201 | mcinvert(struct parse *p, cset *cs) | 
|  | 1202 | { | 
|  | 1203 | assert(cs->multis == NULL);	/* xxx */ | 
|  | 1204 | } | 
|  | 1205 |  | 
|  | 1206 | /* | 
|  | 1207 | - mccase - add case counterparts of the list of collating elements in a cset | 
|  | 1208 | * | 
|  | 1209 | * This would have to know the set of possibilities.  Implementation | 
|  | 1210 | * is deferred. | 
|  | 1211 | */ | 
|  | 1212 | /* ARGSUSED */ | 
|  | 1213 | static void | 
|  | 1214 | mccase(struct parse *p, cset *cs) | 
|  | 1215 | { | 
|  | 1216 | assert(cs->multis == NULL);	/* xxx */ | 
|  | 1217 | } | 
|  | 1218 |  | 
|  | 1219 | /* | 
|  | 1220 | - isinsets - is this character in any sets? | 
|  | 1221 | */ | 
|  | 1222 | static int			/* predicate */ | 
|  | 1223 | isinsets(struct re_guts *g, int c) | 
|  | 1224 | { | 
|  | 1225 | uch *col; | 
|  | 1226 | int i; | 
|  | 1227 | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; | 
|  | 1228 | unsigned uc = (uch)c; | 
|  | 1229 |  | 
|  | 1230 | for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) | 
|  | 1231 | if (col[uc] != 0) | 
|  | 1232 | return(1); | 
|  | 1233 | return(0); | 
|  | 1234 | } | 
|  | 1235 |  | 
|  | 1236 | /* | 
|  | 1237 | - samesets - are these two characters in exactly the same sets? | 
|  | 1238 | */ | 
|  | 1239 | static int			/* predicate */ | 
|  | 1240 | samesets(struct re_guts *g, int c1, int c2) | 
|  | 1241 | { | 
|  | 1242 | uch *col; | 
|  | 1243 | int i; | 
|  | 1244 | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; | 
|  | 1245 | unsigned uc1 = (uch)c1; | 
|  | 1246 | unsigned uc2 = (uch)c2; | 
|  | 1247 |  | 
|  | 1248 | for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) | 
|  | 1249 | if (col[uc1] != col[uc2]) | 
|  | 1250 | return(0); | 
|  | 1251 | return(1); | 
|  | 1252 | } | 
|  | 1253 |  | 
|  | 1254 | /* | 
|  | 1255 | - categorize - sort out character categories | 
|  | 1256 | */ | 
|  | 1257 | static void | 
|  | 1258 | categorize(struct parse *p, struct re_guts *g) | 
|  | 1259 | { | 
|  | 1260 | cat_t *cats = g->categories; | 
|  | 1261 | int c; | 
|  | 1262 | int c2; | 
|  | 1263 | cat_t cat; | 
|  | 1264 |  | 
|  | 1265 | /* avoid making error situations worse */ | 
|  | 1266 | if (p->error != 0) | 
|  | 1267 | return; | 
|  | 1268 |  | 
|  | 1269 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | 
|  | 1270 | if (cats[c] == 0 && isinsets(g, c)) { | 
|  | 1271 | cat = g->ncategories++; | 
|  | 1272 | cats[c] = cat; | 
|  | 1273 | for (c2 = c+1; c2 <= CHAR_MAX; c2++) | 
|  | 1274 | if (cats[c2] == 0 && samesets(g, c, c2)) | 
|  | 1275 | cats[c2] = cat; | 
|  | 1276 | } | 
|  | 1277 | } | 
|  | 1278 |  | 
|  | 1279 | /* | 
|  | 1280 | - dupl - emit a duplicate of a bunch of sops | 
|  | 1281 | */ | 
|  | 1282 | static sopno			/* start of duplicate */ | 
|  | 1283 | dupl(struct parse *p, | 
|  | 1284 | sopno start,		/* from here */ | 
|  | 1285 | sopno finish)		/* to this less one */ | 
|  | 1286 | { | 
|  | 1287 | sopno ret = HERE(); | 
|  | 1288 | sopno len = finish - start; | 
|  | 1289 |  | 
|  | 1290 | assert(finish >= start); | 
|  | 1291 | if (len == 0) | 
|  | 1292 | return(ret); | 
|  | 1293 | enlarge(p, p->ssize + len);	/* this many unexpected additions */ | 
|  | 1294 | assert(p->ssize >= p->slen + len); | 
|  | 1295 | (void) memmove((char *)(p->strip + p->slen), | 
|  | 1296 | (char *)(p->strip + start), (size_t)len*sizeof(sop)); | 
|  | 1297 | p->slen += len; | 
|  | 1298 | return(ret); | 
|  | 1299 | } | 
|  | 1300 |  | 
|  | 1301 | /* | 
|  | 1302 | - doemit - emit a strip operator | 
|  | 1303 | * | 
|  | 1304 | * It might seem better to implement this as a macro with a function as | 
|  | 1305 | * hard-case backup, but it's just too big and messy unless there are | 
|  | 1306 | * some changes to the data structures.  Maybe later. | 
|  | 1307 | */ | 
|  | 1308 | static void | 
|  | 1309 | doemit(struct parse *p, sop op, size_t opnd) | 
|  | 1310 | { | 
|  | 1311 | /* avoid making error situations worse */ | 
|  | 1312 | if (p->error != 0) | 
|  | 1313 | return; | 
|  | 1314 |  | 
|  | 1315 | /* deal with oversize operands ("can't happen", more or less) */ | 
|  | 1316 | assert(opnd < 1<<OPSHIFT); | 
|  | 1317 |  | 
|  | 1318 | /* deal with undersized strip */ | 
|  | 1319 | if (p->slen >= p->ssize) | 
|  | 1320 | enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */ | 
|  | 1321 | assert(p->slen < p->ssize); | 
|  | 1322 |  | 
|  | 1323 | /* finally, it's all reduced to the easy case */ | 
|  | 1324 | p->strip[p->slen++] = SOP(op, opnd); | 
|  | 1325 | } | 
|  | 1326 |  | 
|  | 1327 | /* | 
|  | 1328 | - doinsert - insert a sop into the strip | 
|  | 1329 | */ | 
|  | 1330 | static void | 
|  | 1331 | doinsert(struct parse *p, sop op, size_t opnd, sopno pos) | 
|  | 1332 | { | 
|  | 1333 | sopno sn; | 
|  | 1334 | sop s; | 
|  | 1335 | int i; | 
|  | 1336 |  | 
|  | 1337 | /* avoid making error situations worse */ | 
|  | 1338 | if (p->error != 0) | 
|  | 1339 | return; | 
|  | 1340 |  | 
|  | 1341 | sn = HERE(); | 
|  | 1342 | EMIT(op, opnd);		/* do checks, ensure space */ | 
|  | 1343 | assert(HERE() == sn+1); | 
|  | 1344 | s = p->strip[sn]; | 
|  | 1345 |  | 
|  | 1346 | /* adjust paren pointers */ | 
|  | 1347 | assert(pos > 0); | 
|  | 1348 | for (i = 1; i < NPAREN; i++) { | 
|  | 1349 | if (p->pbegin[i] >= pos) { | 
|  | 1350 | p->pbegin[i]++; | 
|  | 1351 | } | 
|  | 1352 | if (p->pend[i] >= pos) { | 
|  | 1353 | p->pend[i]++; | 
|  | 1354 | } | 
|  | 1355 | } | 
|  | 1356 |  | 
|  | 1357 | memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], | 
|  | 1358 | (HERE()-pos-1)*sizeof(sop)); | 
|  | 1359 | p->strip[pos] = s; | 
|  | 1360 | } | 
|  | 1361 |  | 
|  | 1362 | /* | 
|  | 1363 | - dofwd - complete a forward reference | 
|  | 1364 | */ | 
|  | 1365 | static void | 
|  | 1366 | dofwd(struct parse *p, sopno pos, sop value) | 
|  | 1367 | { | 
|  | 1368 | /* avoid making error situations worse */ | 
|  | 1369 | if (p->error != 0) | 
|  | 1370 | return; | 
|  | 1371 |  | 
|  | 1372 | assert(value < 1<<OPSHIFT); | 
|  | 1373 | p->strip[pos] = OP(p->strip[pos]) | value; | 
|  | 1374 | } | 
|  | 1375 |  | 
|  | 1376 | /* | 
|  | 1377 | - enlarge - enlarge the strip | 
|  | 1378 | */ | 
|  | 1379 | static void | 
|  | 1380 | enlarge(struct parse *p, sopno size) | 
|  | 1381 | { | 
|  | 1382 | sop *sp; | 
|  | 1383 |  | 
|  | 1384 | if (p->ssize >= size) | 
|  | 1385 | return; | 
|  | 1386 |  | 
|  | 1387 | sp = (sop *)realloc(p->strip, size*sizeof(sop)); | 
|  | 1388 | if (sp == NULL) { | 
|  | 1389 | SETERROR(REG_ESPACE); | 
|  | 1390 | return; | 
|  | 1391 | } | 
|  | 1392 | p->strip = sp; | 
|  | 1393 | p->ssize = size; | 
|  | 1394 | } | 
|  | 1395 |  | 
|  | 1396 | /* | 
|  | 1397 | - stripsnug - compact the strip | 
|  | 1398 | */ | 
|  | 1399 | static void | 
|  | 1400 | stripsnug(struct parse *p, struct re_guts *g) | 
|  | 1401 | { | 
|  | 1402 | g->nstates = p->slen; | 
|  | 1403 | g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); | 
|  | 1404 | if (g->strip == NULL) { | 
|  | 1405 | SETERROR(REG_ESPACE); | 
|  | 1406 | g->strip = p->strip; | 
|  | 1407 | } | 
|  | 1408 | } | 
|  | 1409 |  | 
|  | 1410 | /* | 
|  | 1411 | - findmust - fill in must and mlen with longest mandatory literal string | 
|  | 1412 | * | 
|  | 1413 | * This algorithm could do fancy things like analyzing the operands of | | 
|  | 1414 | * for common subsequences.  Someday.  This code is simple and finds most | 
|  | 1415 | * of the interesting cases. | 
|  | 1416 | * | 
|  | 1417 | * Note that must and mlen got initialized during setup. | 
|  | 1418 | */ | 
|  | 1419 | static void | 
|  | 1420 | findmust(struct parse *p, struct re_guts *g) | 
|  | 1421 | { | 
|  | 1422 | sop *scan; | 
| Daniel Dunbar | bf75ffd | 2009-08-30 21:13:58 +0000 | [diff] [blame] | 1423 | sop *start = 0; /* start initialized in the default case, after that */ | 
|  | 1424 | sop *newstart = 0; /* newstart was initialized in the OCHAR case */ | 
| Torok Edwin | e14d4cd | 2009-08-30 08:24:09 +0000 | [diff] [blame] | 1425 | sopno newlen; | 
|  | 1426 | sop s; | 
|  | 1427 | char *cp; | 
|  | 1428 | sopno i; | 
|  | 1429 |  | 
|  | 1430 | /* avoid making error situations worse */ | 
|  | 1431 | if (p->error != 0) | 
|  | 1432 | return; | 
|  | 1433 |  | 
|  | 1434 | /* find the longest OCHAR sequence in strip */ | 
|  | 1435 | newlen = 0; | 
|  | 1436 | scan = g->strip + 1; | 
|  | 1437 | do { | 
|  | 1438 | s = *scan++; | 
|  | 1439 | switch (OP(s)) { | 
|  | 1440 | case OCHAR:		/* sequence member */ | 
|  | 1441 | if (newlen == 0)		/* new sequence */ | 
|  | 1442 | newstart = scan - 1; | 
|  | 1443 | newlen++; | 
|  | 1444 | break; | 
|  | 1445 | case OPLUS_:		/* things that don't break one */ | 
|  | 1446 | case OLPAREN: | 
|  | 1447 | case ORPAREN: | 
|  | 1448 | break; | 
|  | 1449 | case OQUEST_:		/* things that must be skipped */ | 
|  | 1450 | case OCH_: | 
|  | 1451 | scan--; | 
|  | 1452 | do { | 
|  | 1453 | scan += OPND(s); | 
|  | 1454 | s = *scan; | 
|  | 1455 | /* assert() interferes w debug printouts */ | 
|  | 1456 | if (OP(s) != O_QUEST && OP(s) != O_CH && | 
|  | 1457 | OP(s) != OOR2) { | 
|  | 1458 | g->iflags |= REGEX_BAD; | 
|  | 1459 | return; | 
|  | 1460 | } | 
|  | 1461 | } while (OP(s) != O_QUEST && OP(s) != O_CH); | 
|  | 1462 | /* fallthrough */ | 
|  | 1463 | default:		/* things that break a sequence */ | 
|  | 1464 | if (newlen > g->mlen) {		/* ends one */ | 
|  | 1465 | start = newstart; | 
|  | 1466 | g->mlen = newlen; | 
|  | 1467 | } | 
|  | 1468 | newlen = 0; | 
|  | 1469 | break; | 
|  | 1470 | } | 
|  | 1471 | } while (OP(s) != OEND); | 
|  | 1472 |  | 
|  | 1473 | if (g->mlen == 0)		/* there isn't one */ | 
|  | 1474 | return; | 
|  | 1475 |  | 
|  | 1476 | /* turn it into a character string */ | 
|  | 1477 | g->must = malloc((size_t)g->mlen + 1); | 
|  | 1478 | if (g->must == NULL) {		/* argh; just forget it */ | 
|  | 1479 | g->mlen = 0; | 
|  | 1480 | return; | 
|  | 1481 | } | 
|  | 1482 | cp = g->must; | 
|  | 1483 | scan = start; | 
|  | 1484 | for (i = g->mlen; i > 0; i--) { | 
|  | 1485 | while (OP(s = *scan++) != OCHAR) | 
|  | 1486 | continue; | 
|  | 1487 | assert(cp < g->must + g->mlen); | 
|  | 1488 | *cp++ = (char)OPND(s); | 
|  | 1489 | } | 
|  | 1490 | assert(cp == g->must + g->mlen); | 
|  | 1491 | *cp++ = '\0';		/* just on general principles */ | 
|  | 1492 | } | 
|  | 1493 |  | 
|  | 1494 | /* | 
|  | 1495 | - pluscount - count + nesting | 
|  | 1496 | */ | 
|  | 1497 | static sopno			/* nesting depth */ | 
|  | 1498 | pluscount(struct parse *p, struct re_guts *g) | 
|  | 1499 | { | 
|  | 1500 | sop *scan; | 
|  | 1501 | sop s; | 
|  | 1502 | sopno plusnest = 0; | 
|  | 1503 | sopno maxnest = 0; | 
|  | 1504 |  | 
|  | 1505 | if (p->error != 0) | 
|  | 1506 | return(0);	/* there may not be an OEND */ | 
|  | 1507 |  | 
|  | 1508 | scan = g->strip + 1; | 
|  | 1509 | do { | 
|  | 1510 | s = *scan++; | 
|  | 1511 | switch (OP(s)) { | 
|  | 1512 | case OPLUS_: | 
|  | 1513 | plusnest++; | 
|  | 1514 | break; | 
|  | 1515 | case O_PLUS: | 
|  | 1516 | if (plusnest > maxnest) | 
|  | 1517 | maxnest = plusnest; | 
|  | 1518 | plusnest--; | 
|  | 1519 | break; | 
|  | 1520 | } | 
|  | 1521 | } while (OP(s) != OEND); | 
|  | 1522 | if (plusnest != 0) | 
|  | 1523 | g->iflags |= REGEX_BAD; | 
|  | 1524 | return(maxnest); | 
|  | 1525 | } |