blob: b540d219dde20bab54c12aa5dde848ad50f1f242 [file] [log] [blame]
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001/*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
7 *
8 * See the _sre.c file for information on usage and redistribution.
9 */
10
11/* String matching engine */
12
13/* This file is included three times, with different character settings */
14
15LOCAL(int)
16SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
17{
18 /* check if pointer is at given position */
19
20 Py_ssize_t thisp, thatp;
21
22 switch (at) {
23
24 case SRE_AT_BEGINNING:
25 case SRE_AT_BEGINNING_STRING:
26 return ((void*) ptr == state->beginning);
27
28 case SRE_AT_BEGINNING_LINE:
29 return ((void*) ptr == state->beginning ||
30 SRE_IS_LINEBREAK((int) ptr[-1]));
31
32 case SRE_AT_END:
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +030033 return (((SRE_CHAR *)state->end - ptr == 1 &&
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +030034 SRE_IS_LINEBREAK((int) ptr[0])) ||
35 ((void*) ptr == state->end));
36
37 case SRE_AT_END_LINE:
38 return ((void*) ptr == state->end ||
39 SRE_IS_LINEBREAK((int) ptr[0]));
40
41 case SRE_AT_END_STRING:
42 return ((void*) ptr == state->end);
43
44 case SRE_AT_BOUNDARY:
45 if (state->beginning == state->end)
46 return 0;
47 thatp = ((void*) ptr > state->beginning) ?
48 SRE_IS_WORD((int) ptr[-1]) : 0;
49 thisp = ((void*) ptr < state->end) ?
50 SRE_IS_WORD((int) ptr[0]) : 0;
51 return thisp != thatp;
52
53 case SRE_AT_NON_BOUNDARY:
54 if (state->beginning == state->end)
55 return 0;
56 thatp = ((void*) ptr > state->beginning) ?
57 SRE_IS_WORD((int) ptr[-1]) : 0;
58 thisp = ((void*) ptr < state->end) ?
59 SRE_IS_WORD((int) ptr[0]) : 0;
60 return thisp == thatp;
61
62 case SRE_AT_LOC_BOUNDARY:
63 if (state->beginning == state->end)
64 return 0;
65 thatp = ((void*) ptr > state->beginning) ?
66 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
67 thisp = ((void*) ptr < state->end) ?
68 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
69 return thisp != thatp;
70
71 case SRE_AT_LOC_NON_BOUNDARY:
72 if (state->beginning == state->end)
73 return 0;
74 thatp = ((void*) ptr > state->beginning) ?
75 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
76 thisp = ((void*) ptr < state->end) ?
77 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
78 return thisp == thatp;
79
80 case SRE_AT_UNI_BOUNDARY:
81 if (state->beginning == state->end)
82 return 0;
83 thatp = ((void*) ptr > state->beginning) ?
84 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
85 thisp = ((void*) ptr < state->end) ?
86 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
87 return thisp != thatp;
88
89 case SRE_AT_UNI_NON_BOUNDARY:
90 if (state->beginning == state->end)
91 return 0;
92 thatp = ((void*) ptr > state->beginning) ?
93 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
94 thisp = ((void*) ptr < state->end) ?
95 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
96 return thisp == thatp;
97
98 }
99
100 return 0;
101}
102
103LOCAL(int)
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300104SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
105{
106 return ch == pattern
107 || (SRE_CODE) state->lower(ch) == pattern
108 || (SRE_CODE) state->upper(ch) == pattern;
109}
110
111LOCAL(int)
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200112SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300113{
114 /* check if character is a member of the given set */
115
116 int ok = 1;
117
118 for (;;) {
119 switch (*set++) {
120
121 case SRE_OP_FAILURE:
122 return !ok;
123
124 case SRE_OP_LITERAL:
125 /* <LITERAL> <code> */
126 if (ch == set[0])
127 return ok;
128 set++;
129 break;
130
131 case SRE_OP_CATEGORY:
132 /* <CATEGORY> <code> */
133 if (sre_category(set[0], (int) ch))
134 return ok;
135 set++;
136 break;
137
138 case SRE_OP_CHARSET:
139 /* <CHARSET> <bitmap> */
140 if (ch < 256 &&
141 (set[ch/SRE_CODE_BITS] & (1u << (ch & (SRE_CODE_BITS-1)))))
142 return ok;
143 set += 256/SRE_CODE_BITS;
144 break;
145
146 case SRE_OP_RANGE:
147 /* <RANGE> <lower> <upper> */
148 if (set[0] <= ch && ch <= set[1])
149 return ok;
150 set += 2;
151 break;
152
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200153 case SRE_OP_RANGE_IGNORE:
154 /* <RANGE_IGNORE> <lower> <upper> */
155 {
156 SRE_CODE uch;
157 /* ch is already lower cased */
158 if (set[0] <= ch && ch <= set[1])
159 return ok;
160 uch = state->upper(ch);
161 if (set[0] <= uch && uch <= set[1])
162 return ok;
163 set += 2;
164 break;
165 }
166
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300167 case SRE_OP_NEGATE:
168 ok = !ok;
169 break;
170
171 case SRE_OP_BIGCHARSET:
172 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
173 {
174 Py_ssize_t count, block;
175 count = *(set++);
176
177 if (ch < 0x10000u)
178 block = ((unsigned char*)set)[ch >> 8];
179 else
180 block = -1;
181 set += 256/sizeof(SRE_CODE);
182 if (block >=0 &&
183 (set[(block * 256 + (ch & 255))/SRE_CODE_BITS] &
184 (1u << (ch & (SRE_CODE_BITS-1)))))
185 return ok;
186 set += count * (256/SRE_CODE_BITS);
187 break;
188 }
189
190 default:
191 /* internal error -- there's not much we can do about it
192 here, so let's just pretend it didn't match... */
193 return 0;
194 }
195 }
196}
197
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300198LOCAL(int)
199SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
200{
201 SRE_CODE lo, up;
202 lo = state->lower(ch);
203 if (SRE(charset)(state, set, lo))
204 return 1;
205
206 up = state->upper(ch);
207 return up != lo && SRE(charset)(state, set, up);
208}
209
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300210LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300211
212LOCAL(Py_ssize_t)
213SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
214{
215 SRE_CODE chr;
216 SRE_CHAR c;
217 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
218 SRE_CHAR* end = (SRE_CHAR *)state->end;
219 Py_ssize_t i;
220
221 /* adjust end */
222 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
223 end = ptr + maxcount;
224
225 switch (pattern[0]) {
226
227 case SRE_OP_IN:
228 /* repeated set */
229 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200230 while (ptr < end && SRE(charset)(state, pattern + 2, *ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300231 ptr++;
232 break;
233
234 case SRE_OP_ANY:
235 /* repeated dot wildcard. */
236 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
237 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
238 ptr++;
239 break;
240
241 case SRE_OP_ANY_ALL:
242 /* repeated dot wildcard. skip to the end of the target
243 string, and backtrack from there */
244 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
245 ptr = end;
246 break;
247
248 case SRE_OP_LITERAL:
249 /* repeated literal */
250 chr = pattern[1];
251 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
252 c = (SRE_CHAR) chr;
253#if SIZEOF_SRE_CHAR < 4
254 if ((SRE_CODE) c != chr)
255 ; /* literal can't match: doesn't fit in char width */
256 else
257#endif
258 while (ptr < end && *ptr == c)
259 ptr++;
260 break;
261
262 case SRE_OP_LITERAL_IGNORE:
263 /* repeated literal */
264 chr = pattern[1];
265 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
266 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
267 ptr++;
268 break;
269
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300270 case SRE_OP_LITERAL_LOC_IGNORE:
271 /* repeated literal */
272 chr = pattern[1];
273 TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
274 while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
275 ptr++;
276 break;
277
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300278 case SRE_OP_NOT_LITERAL:
279 /* repeated non-literal */
280 chr = pattern[1];
281 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
282 c = (SRE_CHAR) chr;
283#if SIZEOF_SRE_CHAR < 4
284 if ((SRE_CODE) c != chr)
285 ptr = end; /* literal can't match: doesn't fit in char width */
286 else
287#endif
288 while (ptr < end && *ptr != c)
289 ptr++;
290 break;
291
292 case SRE_OP_NOT_LITERAL_IGNORE:
293 /* repeated non-literal */
294 chr = pattern[1];
295 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
296 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
297 ptr++;
298 break;
299
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300300 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
301 /* repeated non-literal */
302 chr = pattern[1];
303 TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
304 while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
305 ptr++;
306 break;
307
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300308 default:
309 /* repeated single character pattern */
310 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
311 while ((SRE_CHAR*) state->ptr < end) {
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300312 i = SRE(match)(state, pattern, 0);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300313 if (i < 0)
314 return i;
315 if (!i)
316 break;
317 }
318 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
319 (SRE_CHAR*) state->ptr - ptr));
320 return (SRE_CHAR*) state->ptr - ptr;
321 }
322
323 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
324 ptr - (SRE_CHAR*) state->ptr));
325 return ptr - (SRE_CHAR*) state->ptr;
326}
327
328#if 0 /* not used in this release */
329LOCAL(int)
330SRE(info)(SRE_STATE* state, SRE_CODE* pattern)
331{
332 /* check if an SRE_OP_INFO block matches at the current position.
333 returns the number of SRE_CODE objects to skip if successful, 0
334 if no match */
335
336 SRE_CHAR* end = (SRE_CHAR*) state->end;
337 SRE_CHAR* ptr = (SRE_CHAR*) state->ptr;
338 Py_ssize_t i;
339
340 /* check minimal length */
341 if (pattern[3] && end - ptr < pattern[3])
342 return 0;
343
344 /* check known prefix */
345 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
346 /* <length> <skip> <prefix data> <overlap data> */
347 for (i = 0; i < pattern[5]; i++)
348 if ((SRE_CODE) ptr[i] != pattern[7 + i])
349 return 0;
350 return pattern[0] + 2 * pattern[6];
351 }
352 return pattern[0];
353}
354#endif
355
356/* The macros below should be used to protect recursive SRE(match)()
357 * calls that *failed* and do *not* return immediately (IOW, those
358 * that will backtrack). Explaining:
359 *
360 * - Recursive SRE(match)() returned true: that's usually a success
361 * (besides atypical cases like ASSERT_NOT), therefore there's no
362 * reason to restore lastmark;
363 *
364 * - Recursive SRE(match)() returned false but the current SRE(match)()
365 * is returning to the caller: If the current SRE(match)() is the
366 * top function of the recursion, returning false will be a matching
367 * failure, and it doesn't matter where lastmark is pointing to.
368 * If it's *not* the top function, it will be a recursive SRE(match)()
369 * failure by itself, and the calling SRE(match)() will have to deal
370 * with the failure by the same rules explained here (it will restore
371 * lastmark by itself if necessary);
372 *
373 * - Recursive SRE(match)() returned false, and will continue the
374 * outside 'for' loop: must be protected when breaking, since the next
375 * OP could potentially depend on lastmark;
376 *
377 * - Recursive SRE(match)() returned false, and will be called again
378 * inside a local for/while loop: must be protected between each
379 * loop iteration, since the recursive SRE(match)() could do anything,
380 * and could potentially depend on lastmark.
381 *
382 * For more information, check the discussion at SF patch #712900.
383 */
384#define LASTMARK_SAVE() \
385 do { \
386 ctx->lastmark = state->lastmark; \
387 ctx->lastindex = state->lastindex; \
388 } while (0)
389#define LASTMARK_RESTORE() \
390 do { \
391 state->lastmark = ctx->lastmark; \
392 state->lastindex = ctx->lastindex; \
393 } while (0)
394
395#define RETURN_ERROR(i) do { return i; } while(0)
396#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
397#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
398
399#define RETURN_ON_ERROR(i) \
400 do { if (i < 0) RETURN_ERROR(i); } while (0)
401#define RETURN_ON_SUCCESS(i) \
402 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
403#define RETURN_ON_FAILURE(i) \
404 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
405
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300406#define DATA_STACK_ALLOC(state, type, ptr) \
407do { \
408 alloc_pos = state->data_stack_base; \
409 TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
410 "(%" PY_FORMAT_SIZE_T "d)\n", \
Serhiy Storchaka12b25382015-11-05 17:43:42 +0200411 Py_STRINGIFY(type), alloc_pos, sizeof(type))); \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300412 if (sizeof(type) > state->data_stack_size - alloc_pos) { \
413 int j = data_stack_grow(state, sizeof(type)); \
414 if (j < 0) return j; \
415 if (ctx_pos != -1) \
416 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \
417 } \
418 ptr = (type*)(state->data_stack+alloc_pos); \
419 state->data_stack_base += sizeof(type); \
420} while (0)
421
422#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
423do { \
Serhiy Storchaka12b25382015-11-05 17:43:42 +0200424 TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", Py_STRINGIFY(type), pos)); \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300425 ptr = (type*)(state->data_stack+pos); \
426} while (0)
427
428#define DATA_STACK_PUSH(state, data, size) \
429do { \
430 TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
431 "(%" PY_FORMAT_SIZE_T "d)\n", \
432 data, state->data_stack_base, size)); \
433 if (size > state->data_stack_size - state->data_stack_base) { \
434 int j = data_stack_grow(state, size); \
435 if (j < 0) return j; \
436 if (ctx_pos != -1) \
437 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \
438 } \
439 memcpy(state->data_stack+state->data_stack_base, data, size); \
440 state->data_stack_base += size; \
441} while (0)
442
443#define DATA_STACK_POP(state, data, size, discard) \
444do { \
445 TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
446 "(%" PY_FORMAT_SIZE_T "d)\n", \
447 data, state->data_stack_base-size, size)); \
448 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
449 if (discard) \
450 state->data_stack_base -= size; \
451} while (0)
452
453#define DATA_STACK_POP_DISCARD(state, size) \
454do { \
455 TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
456 "(%" PY_FORMAT_SIZE_T "d)\n", \
457 state->data_stack_base-size, size)); \
458 state->data_stack_base -= size; \
459} while(0)
460
461#define DATA_PUSH(x) \
462 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
463#define DATA_POP(x) \
464 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
465#define DATA_POP_DISCARD(x) \
466 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
467#define DATA_ALLOC(t,p) \
468 DATA_STACK_ALLOC(state, t, p)
469#define DATA_LOOKUP_AT(t,p,pos) \
470 DATA_STACK_LOOKUP_AT(state,t,p,pos)
471
472#define MARK_PUSH(lastmark) \
473 do if (lastmark > 0) { \
474 i = lastmark; /* ctx->lastmark may change if reallocated */ \
475 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
476 } while (0)
477#define MARK_POP(lastmark) \
478 do if (lastmark > 0) { \
479 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
480 } while (0)
481#define MARK_POP_KEEP(lastmark) \
482 do if (lastmark > 0) { \
483 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
484 } while (0)
485#define MARK_POP_DISCARD(lastmark) \
486 do if (lastmark > 0) { \
487 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
488 } while (0)
489
490#define JUMP_NONE 0
491#define JUMP_MAX_UNTIL_1 1
492#define JUMP_MAX_UNTIL_2 2
493#define JUMP_MAX_UNTIL_3 3
494#define JUMP_MIN_UNTIL_1 4
495#define JUMP_MIN_UNTIL_2 5
496#define JUMP_MIN_UNTIL_3 6
497#define JUMP_REPEAT 7
498#define JUMP_REPEAT_ONE_1 8
499#define JUMP_REPEAT_ONE_2 9
500#define JUMP_MIN_REPEAT_ONE 10
501#define JUMP_BRANCH 11
502#define JUMP_ASSERT 12
503#define JUMP_ASSERT_NOT 13
504
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200505#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, matchall) \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300506 DATA_ALLOC(SRE(match_context), nextctx); \
507 nextctx->last_ctx_pos = ctx_pos; \
508 nextctx->jump = jumpvalue; \
509 nextctx->pattern = nextpattern; \
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200510 nextctx->match_all = matchall; \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300511 ctx_pos = alloc_pos; \
512 ctx = nextctx; \
513 goto entrance; \
514 jumplabel: \
515 while (0) /* gcc doesn't like labels at end of scopes */ \
516
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200517#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
518 DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->match_all)
519
520#define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \
521 DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
522
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300523typedef struct {
524 Py_ssize_t last_ctx_pos;
525 Py_ssize_t jump;
526 SRE_CHAR* ptr;
527 SRE_CODE* pattern;
528 Py_ssize_t count;
529 Py_ssize_t lastmark;
530 Py_ssize_t lastindex;
531 union {
532 SRE_CODE chr;
533 SRE_REPEAT* rep;
534 } u;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200535 int match_all;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300536} SRE(match_context);
537
538/* check if string matches the given pattern. returns <0 for
539 error, 0 for failure, and 1 for success */
540LOCAL(Py_ssize_t)
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300541SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300542{
543 SRE_CHAR* end = (SRE_CHAR *)state->end;
544 Py_ssize_t alloc_pos, ctx_pos = -1;
545 Py_ssize_t i, ret = 0;
546 Py_ssize_t jump;
547 unsigned int sigcount=0;
548
549 SRE(match_context)* ctx;
550 SRE(match_context)* nextctx;
551
552 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
553
554 DATA_ALLOC(SRE(match_context), ctx);
555 ctx->last_ctx_pos = -1;
556 ctx->jump = JUMP_NONE;
557 ctx->pattern = pattern;
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300558 ctx->match_all = match_all;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300559 ctx_pos = alloc_pos;
560
561entrance:
562
563 ctx->ptr = (SRE_CHAR *)state->ptr;
564
565 if (ctx->pattern[0] == SRE_OP_INFO) {
566 /* optimization info block */
567 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Benjamin Petersonca470632016-09-06 13:47:26 -0700568 if (ctx->pattern[3] && (uintptr_t)(end - ctx->ptr) < ctx->pattern[3]) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300569 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
570 "need %" PY_FORMAT_SIZE_T "d)\n",
571 end - ctx->ptr, (Py_ssize_t) ctx->pattern[3]));
572 RETURN_FAILURE;
573 }
574 ctx->pattern += ctx->pattern[1] + 1;
575 }
576
577 for (;;) {
578 ++sigcount;
579 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
580 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
581
582 switch (*ctx->pattern++) {
583
584 case SRE_OP_MARK:
585 /* set mark */
586 /* <MARK> <gid> */
587 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
588 ctx->ptr, ctx->pattern[0]));
589 i = ctx->pattern[0];
590 if (i & 1)
591 state->lastindex = i/2 + 1;
592 if (i > state->lastmark) {
593 /* state->lastmark is the highest valid index in the
594 state->mark array. If it is increased by more than 1,
595 the intervening marks must be set to NULL to signal
596 that these marks have not been encountered. */
597 Py_ssize_t j = state->lastmark + 1;
598 while (j < i)
599 state->mark[j++] = NULL;
600 state->lastmark = i;
601 }
602 state->mark[i] = ctx->ptr;
603 ctx->pattern++;
604 break;
605
606 case SRE_OP_LITERAL:
607 /* match literal string */
608 /* <LITERAL> <code> */
609 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
610 ctx->ptr, *ctx->pattern));
611 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
612 RETURN_FAILURE;
613 ctx->pattern++;
614 ctx->ptr++;
615 break;
616
617 case SRE_OP_NOT_LITERAL:
618 /* match anything that is not literal character */
619 /* <NOT_LITERAL> <code> */
620 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
621 ctx->ptr, *ctx->pattern));
622 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
623 RETURN_FAILURE;
624 ctx->pattern++;
625 ctx->ptr++;
626 break;
627
628 case SRE_OP_SUCCESS:
629 /* end of pattern */
630 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200631 if (!ctx->match_all || ctx->ptr == state->end) {
632 state->ptr = ctx->ptr;
633 RETURN_SUCCESS;
634 }
635 RETURN_FAILURE;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300636
637 case SRE_OP_AT:
638 /* match at given position */
639 /* <AT> <code> */
640 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
641 if (!SRE(at)(state, ctx->ptr, *ctx->pattern))
642 RETURN_FAILURE;
643 ctx->pattern++;
644 break;
645
646 case SRE_OP_CATEGORY:
647 /* match at given category */
648 /* <CATEGORY> <code> */
649 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
650 ctx->ptr, *ctx->pattern));
651 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
652 RETURN_FAILURE;
653 ctx->pattern++;
654 ctx->ptr++;
655 break;
656
657 case SRE_OP_ANY:
658 /* match anything (except a newline) */
659 /* <ANY> */
660 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
661 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
662 RETURN_FAILURE;
663 ctx->ptr++;
664 break;
665
666 case SRE_OP_ANY_ALL:
667 /* match anything */
668 /* <ANY_ALL> */
669 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
670 if (ctx->ptr >= end)
671 RETURN_FAILURE;
672 ctx->ptr++;
673 break;
674
675 case SRE_OP_IN:
676 /* match set member (or non_member) */
677 /* <IN> <skip> <set> */
678 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200679 if (ctx->ptr >= end ||
680 !SRE(charset)(state, ctx->pattern + 1, *ctx->ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300681 RETURN_FAILURE;
682 ctx->pattern += ctx->pattern[0];
683 ctx->ptr++;
684 break;
685
686 case SRE_OP_LITERAL_IGNORE:
687 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
688 ctx->pattern, ctx->ptr, ctx->pattern[0]));
689 if (ctx->ptr >= end ||
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300690 state->lower(*ctx->ptr) != *ctx->pattern)
691 RETURN_FAILURE;
692 ctx->pattern++;
693 ctx->ptr++;
694 break;
695
696 case SRE_OP_LITERAL_LOC_IGNORE:
697 TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
698 ctx->pattern, ctx->ptr, ctx->pattern[0]));
699 if (ctx->ptr >= end
700 || !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300701 RETURN_FAILURE;
702 ctx->pattern++;
703 ctx->ptr++;
704 break;
705
706 case SRE_OP_NOT_LITERAL_IGNORE:
707 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
708 ctx->pattern, ctx->ptr, *ctx->pattern));
709 if (ctx->ptr >= end ||
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300710 state->lower(*ctx->ptr) == *ctx->pattern)
711 RETURN_FAILURE;
712 ctx->pattern++;
713 ctx->ptr++;
714 break;
715
716 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
717 TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
718 ctx->pattern, ctx->ptr, *ctx->pattern));
719 if (ctx->ptr >= end
720 || SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300721 RETURN_FAILURE;
722 ctx->pattern++;
723 ctx->ptr++;
724 break;
725
726 case SRE_OP_IN_IGNORE:
727 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
728 if (ctx->ptr >= end
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200729 || !SRE(charset)(state, ctx->pattern+1,
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300730 (SRE_CODE)state->lower(*ctx->ptr)))
731 RETURN_FAILURE;
732 ctx->pattern += ctx->pattern[0];
733 ctx->ptr++;
734 break;
735
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300736 case SRE_OP_IN_LOC_IGNORE:
737 TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
738 if (ctx->ptr >= end
739 || !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
740 RETURN_FAILURE;
741 ctx->pattern += ctx->pattern[0];
742 ctx->ptr++;
743 break;
744
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300745 case SRE_OP_JUMP:
746 case SRE_OP_INFO:
747 /* jump forward */
748 /* <JUMP> <offset> */
749 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
750 ctx->ptr, ctx->pattern[0]));
751 ctx->pattern += ctx->pattern[0];
752 break;
753
754 case SRE_OP_BRANCH:
755 /* alternation */
756 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
757 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
758 LASTMARK_SAVE();
759 ctx->u.rep = state->repeat;
760 if (ctx->u.rep)
761 MARK_PUSH(ctx->lastmark);
762 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
763 if (ctx->pattern[1] == SRE_OP_LITERAL &&
764 (ctx->ptr >= end ||
765 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
766 continue;
767 if (ctx->pattern[1] == SRE_OP_IN &&
768 (ctx->ptr >= end ||
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200769 !SRE(charset)(state, ctx->pattern + 3,
770 (SRE_CODE) *ctx->ptr)))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300771 continue;
772 state->ptr = ctx->ptr;
773 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
774 if (ret) {
775 if (ctx->u.rep)
776 MARK_POP_DISCARD(ctx->lastmark);
777 RETURN_ON_ERROR(ret);
778 RETURN_SUCCESS;
779 }
780 if (ctx->u.rep)
781 MARK_POP_KEEP(ctx->lastmark);
782 LASTMARK_RESTORE();
783 }
784 if (ctx->u.rep)
785 MARK_POP_DISCARD(ctx->lastmark);
786 RETURN_FAILURE;
787
788 case SRE_OP_REPEAT_ONE:
789 /* match repeated sequence (maximizing regexp) */
790
791 /* this operator only works if the repeated item is
792 exactly one character wide, and we're not already
793 collecting backtracking points. for other cases,
794 use the MAX_REPEAT operator */
795
796 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
797
798 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
799 ctx->pattern[1], ctx->pattern[2]));
800
801 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
802 RETURN_FAILURE; /* cannot match */
803
804 state->ptr = ctx->ptr;
805
806 ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[2]);
807 RETURN_ON_ERROR(ret);
808 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
809 ctx->count = ret;
810 ctx->ptr += ctx->count;
811
812 /* when we arrive here, count contains the number of
813 matches, and ctx->ptr points to the tail of the target
814 string. check if the rest of the pattern matches,
815 and backtrack if not. */
816
817 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
818 RETURN_FAILURE;
819
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200820 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300821 ctx->ptr == state->end) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300822 /* tail is empty. we're finished */
823 state->ptr = ctx->ptr;
824 RETURN_SUCCESS;
825 }
826
827 LASTMARK_SAVE();
828
829 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
830 /* tail starts with a literal. skip positions where
831 the rest of the pattern cannot possibly match */
832 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
833 for (;;) {
834 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
835 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
836 ctx->ptr--;
837 ctx->count--;
838 }
839 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
840 break;
841 state->ptr = ctx->ptr;
842 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
843 ctx->pattern+ctx->pattern[0]);
844 if (ret) {
845 RETURN_ON_ERROR(ret);
846 RETURN_SUCCESS;
847 }
848
849 LASTMARK_RESTORE();
850
851 ctx->ptr--;
852 ctx->count--;
853 }
854
855 } else {
856 /* general case */
857 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
858 state->ptr = ctx->ptr;
859 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
860 ctx->pattern+ctx->pattern[0]);
861 if (ret) {
862 RETURN_ON_ERROR(ret);
863 RETURN_SUCCESS;
864 }
865 ctx->ptr--;
866 ctx->count--;
867 LASTMARK_RESTORE();
868 }
869 }
870 RETURN_FAILURE;
871
872 case SRE_OP_MIN_REPEAT_ONE:
873 /* match repeated sequence (minimizing regexp) */
874
875 /* this operator only works if the repeated item is
876 exactly one character wide, and we're not already
877 collecting backtracking points. for other cases,
878 use the MIN_REPEAT operator */
879
880 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
881
882 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
883 ctx->pattern[1], ctx->pattern[2]));
884
885 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
886 RETURN_FAILURE; /* cannot match */
887
888 state->ptr = ctx->ptr;
889
890 if (ctx->pattern[1] == 0)
891 ctx->count = 0;
892 else {
893 /* count using pattern min as the maximum */
894 ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[1]);
895 RETURN_ON_ERROR(ret);
896 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
897 if (ret < (Py_ssize_t) ctx->pattern[1])
898 /* didn't match minimum number of times */
899 RETURN_FAILURE;
900 /* advance past minimum matches of repeat */
901 ctx->count = ret;
902 ctx->ptr += ctx->count;
903 }
904
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200905 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300906 (!match_all || ctx->ptr == state->end)) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300907 /* tail is empty. we're finished */
908 state->ptr = ctx->ptr;
909 RETURN_SUCCESS;
910
911 } else {
912 /* general case */
913 LASTMARK_SAVE();
914 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
915 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
916 state->ptr = ctx->ptr;
917 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
918 ctx->pattern+ctx->pattern[0]);
919 if (ret) {
920 RETURN_ON_ERROR(ret);
921 RETURN_SUCCESS;
922 }
923 state->ptr = ctx->ptr;
924 ret = SRE(count)(state, ctx->pattern+3, 1);
925 RETURN_ON_ERROR(ret);
926 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
927 if (ret == 0)
928 break;
929 assert(ret == 1);
930 ctx->ptr++;
931 ctx->count++;
932 LASTMARK_RESTORE();
933 }
934 }
935 RETURN_FAILURE;
936
937 case SRE_OP_REPEAT:
938 /* create repeat context. all the hard work is done
939 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
940 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
941 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
942 ctx->pattern[1], ctx->pattern[2]));
943
944 /* install new repeat context */
945 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
946 if (!ctx->u.rep) {
947 PyErr_NoMemory();
948 RETURN_FAILURE;
949 }
950 ctx->u.rep->count = -1;
951 ctx->u.rep->pattern = ctx->pattern;
952 ctx->u.rep->prev = state->repeat;
953 ctx->u.rep->last_ptr = NULL;
954 state->repeat = ctx->u.rep;
955
956 state->ptr = ctx->ptr;
957 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
958 state->repeat = ctx->u.rep->prev;
959 PyObject_FREE(ctx->u.rep);
960
961 if (ret) {
962 RETURN_ON_ERROR(ret);
963 RETURN_SUCCESS;
964 }
965 RETURN_FAILURE;
966
967 case SRE_OP_MAX_UNTIL:
968 /* maximizing repeat */
969 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
970
971 /* FIXME: we probably need to deal with zero-width
972 matches in here... */
973
974 ctx->u.rep = state->repeat;
975 if (!ctx->u.rep)
976 RETURN_ERROR(SRE_ERROR_STATE);
977
978 state->ptr = ctx->ptr;
979
980 ctx->count = ctx->u.rep->count+1;
981
982 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
983 ctx->ptr, ctx->count));
984
985 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
986 /* not enough matches */
987 ctx->u.rep->count = ctx->count;
988 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
989 ctx->u.rep->pattern+3);
990 if (ret) {
991 RETURN_ON_ERROR(ret);
992 RETURN_SUCCESS;
993 }
994 ctx->u.rep->count = ctx->count-1;
995 state->ptr = ctx->ptr;
996 RETURN_FAILURE;
997 }
998
999 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
1000 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
1001 state->ptr != ctx->u.rep->last_ptr) {
1002 /* we may have enough matches, but if we can
1003 match another item, do so */
1004 ctx->u.rep->count = ctx->count;
1005 LASTMARK_SAVE();
1006 MARK_PUSH(ctx->lastmark);
1007 /* zero-width match protection */
1008 DATA_PUSH(&ctx->u.rep->last_ptr);
1009 ctx->u.rep->last_ptr = state->ptr;
1010 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1011 ctx->u.rep->pattern+3);
1012 DATA_POP(&ctx->u.rep->last_ptr);
1013 if (ret) {
1014 MARK_POP_DISCARD(ctx->lastmark);
1015 RETURN_ON_ERROR(ret);
1016 RETURN_SUCCESS;
1017 }
1018 MARK_POP(ctx->lastmark);
1019 LASTMARK_RESTORE();
1020 ctx->u.rep->count = ctx->count-1;
1021 state->ptr = ctx->ptr;
1022 }
1023
1024 /* cannot match more repeated items here. make sure the
1025 tail matches */
1026 state->repeat = ctx->u.rep->prev;
1027 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1028 RETURN_ON_SUCCESS(ret);
1029 state->repeat = ctx->u.rep;
1030 state->ptr = ctx->ptr;
1031 RETURN_FAILURE;
1032
1033 case SRE_OP_MIN_UNTIL:
1034 /* minimizing repeat */
1035 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1036
1037 ctx->u.rep = state->repeat;
1038 if (!ctx->u.rep)
1039 RETURN_ERROR(SRE_ERROR_STATE);
1040
1041 state->ptr = ctx->ptr;
1042
1043 ctx->count = ctx->u.rep->count+1;
1044
1045 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
1046 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1047
1048 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1049 /* not enough matches */
1050 ctx->u.rep->count = ctx->count;
1051 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1052 ctx->u.rep->pattern+3);
1053 if (ret) {
1054 RETURN_ON_ERROR(ret);
1055 RETURN_SUCCESS;
1056 }
1057 ctx->u.rep->count = ctx->count-1;
1058 state->ptr = ctx->ptr;
1059 RETURN_FAILURE;
1060 }
1061
1062 LASTMARK_SAVE();
1063
1064 /* see if the tail matches */
1065 state->repeat = ctx->u.rep->prev;
1066 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1067 if (ret) {
1068 RETURN_ON_ERROR(ret);
1069 RETURN_SUCCESS;
1070 }
1071
1072 state->repeat = ctx->u.rep;
1073 state->ptr = ctx->ptr;
1074
1075 LASTMARK_RESTORE();
1076
1077 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
1078 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1079 state->ptr == ctx->u.rep->last_ptr)
1080 RETURN_FAILURE;
1081
1082 ctx->u.rep->count = ctx->count;
1083 /* zero-width match protection */
1084 DATA_PUSH(&ctx->u.rep->last_ptr);
1085 ctx->u.rep->last_ptr = state->ptr;
1086 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1087 ctx->u.rep->pattern+3);
1088 DATA_POP(&ctx->u.rep->last_ptr);
1089 if (ret) {
1090 RETURN_ON_ERROR(ret);
1091 RETURN_SUCCESS;
1092 }
1093 ctx->u.rep->count = ctx->count-1;
1094 state->ptr = ctx->ptr;
1095 RETURN_FAILURE;
1096
1097 case SRE_OP_GROUPREF:
1098 /* match backreference */
1099 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1100 ctx->ptr, ctx->pattern[0]));
1101 i = ctx->pattern[0];
1102 {
1103 Py_ssize_t groupref = i+i;
1104 if (groupref >= state->lastmark) {
1105 RETURN_FAILURE;
1106 } else {
1107 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1108 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1109 if (!p || !e || e < p)
1110 RETURN_FAILURE;
1111 while (p < e) {
1112 if (ctx->ptr >= end || *ctx->ptr != *p)
1113 RETURN_FAILURE;
1114 p++;
1115 ctx->ptr++;
1116 }
1117 }
1118 }
1119 ctx->pattern++;
1120 break;
1121
1122 case SRE_OP_GROUPREF_IGNORE:
1123 /* match backreference */
1124 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1125 ctx->ptr, ctx->pattern[0]));
1126 i = ctx->pattern[0];
1127 {
1128 Py_ssize_t groupref = i+i;
1129 if (groupref >= state->lastmark) {
1130 RETURN_FAILURE;
1131 } else {
1132 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1133 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1134 if (!p || !e || e < p)
1135 RETURN_FAILURE;
1136 while (p < e) {
1137 if (ctx->ptr >= end ||
1138 state->lower(*ctx->ptr) != state->lower(*p))
1139 RETURN_FAILURE;
1140 p++;
1141 ctx->ptr++;
1142 }
1143 }
1144 }
1145 ctx->pattern++;
1146 break;
1147
1148 case SRE_OP_GROUPREF_EXISTS:
1149 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1150 ctx->ptr, ctx->pattern[0]));
1151 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1152 i = ctx->pattern[0];
1153 {
1154 Py_ssize_t groupref = i+i;
1155 if (groupref >= state->lastmark) {
1156 ctx->pattern += ctx->pattern[1];
1157 break;
1158 } else {
1159 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1160 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1161 if (!p || !e || e < p) {
1162 ctx->pattern += ctx->pattern[1];
1163 break;
1164 }
1165 }
1166 }
1167 ctx->pattern += 2;
1168 break;
1169
1170 case SRE_OP_ASSERT:
1171 /* assert subpattern */
1172 /* <ASSERT> <skip> <back> <pattern> */
1173 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1174 ctx->ptr, ctx->pattern[1]));
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001175 if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1])
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001176 RETURN_FAILURE;
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001177 state->ptr = ctx->ptr - ctx->pattern[1];
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001178 DO_JUMP0(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001179 RETURN_ON_FAILURE(ret);
1180 ctx->pattern += ctx->pattern[0];
1181 break;
1182
1183 case SRE_OP_ASSERT_NOT:
1184 /* assert not subpattern */
1185 /* <ASSERT_NOT> <skip> <back> <pattern> */
1186 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1187 ctx->ptr, ctx->pattern[1]));
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001188 if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
1189 state->ptr = ctx->ptr - ctx->pattern[1];
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001190 DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001191 if (ret) {
1192 RETURN_ON_ERROR(ret);
1193 RETURN_FAILURE;
1194 }
1195 }
1196 ctx->pattern += ctx->pattern[0];
1197 break;
1198
1199 case SRE_OP_FAILURE:
1200 /* immediate failure */
1201 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1202 RETURN_FAILURE;
1203
1204 default:
1205 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1206 ctx->pattern[-1]));
1207 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1208 }
1209 }
1210
1211exit:
1212 ctx_pos = ctx->last_ctx_pos;
1213 jump = ctx->jump;
1214 DATA_POP_DISCARD(ctx);
1215 if (ctx_pos == -1)
1216 return ret;
1217 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
1218
1219 switch (jump) {
1220 case JUMP_MAX_UNTIL_2:
1221 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1222 goto jump_max_until_2;
1223 case JUMP_MAX_UNTIL_3:
1224 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1225 goto jump_max_until_3;
1226 case JUMP_MIN_UNTIL_2:
1227 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1228 goto jump_min_until_2;
1229 case JUMP_MIN_UNTIL_3:
1230 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1231 goto jump_min_until_3;
1232 case JUMP_BRANCH:
1233 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1234 goto jump_branch;
1235 case JUMP_MAX_UNTIL_1:
1236 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1237 goto jump_max_until_1;
1238 case JUMP_MIN_UNTIL_1:
1239 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1240 goto jump_min_until_1;
1241 case JUMP_REPEAT:
1242 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1243 goto jump_repeat;
1244 case JUMP_REPEAT_ONE_1:
1245 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1246 goto jump_repeat_one_1;
1247 case JUMP_REPEAT_ONE_2:
1248 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1249 goto jump_repeat_one_2;
1250 case JUMP_MIN_REPEAT_ONE:
1251 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1252 goto jump_min_repeat_one;
1253 case JUMP_ASSERT:
1254 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1255 goto jump_assert;
1256 case JUMP_ASSERT_NOT:
1257 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1258 goto jump_assert_not;
1259 case JUMP_NONE:
1260 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1261 ctx->ptr, ret));
1262 break;
1263 }
1264
1265 return ret; /* should never get here */
1266}
1267
1268LOCAL(Py_ssize_t)
1269SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
1270{
1271 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1272 SRE_CHAR* end = (SRE_CHAR *)state->end;
1273 Py_ssize_t status = 0;
1274 Py_ssize_t prefix_len = 0;
1275 Py_ssize_t prefix_skip = 0;
1276 SRE_CODE* prefix = NULL;
1277 SRE_CODE* charset = NULL;
1278 SRE_CODE* overlap = NULL;
1279 int flags = 0;
1280
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001281 if (ptr > end)
1282 return 0;
1283
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001284 if (pattern[0] == SRE_OP_INFO) {
1285 /* optimization info block */
1286 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1287
1288 flags = pattern[2];
1289
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001290 if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1291 TRACE(("reject (got %u chars, need %u)\n",
1292 (unsigned int)(end - ptr), pattern[3]));
1293 return 0;
1294 }
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001295 if (pattern[3] > 1) {
1296 /* adjust end point (but make sure we leave at least one
1297 character in there, so literal search will work) */
1298 end -= pattern[3] - 1;
1299 if (end <= ptr)
1300 end = ptr;
1301 }
1302
1303 if (flags & SRE_INFO_PREFIX) {
1304 /* pattern starts with a known prefix */
1305 /* <length> <skip> <prefix data> <overlap data> */
1306 prefix_len = pattern[5];
1307 prefix_skip = pattern[6];
1308 prefix = pattern + 7;
1309 overlap = prefix + prefix_len - 1;
1310 } else if (flags & SRE_INFO_CHARSET)
1311 /* pattern starts with a character from a known set */
1312 /* <charset> */
1313 charset = pattern + 5;
1314
1315 pattern += 1 + pattern[1];
1316 }
1317
1318 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1319 prefix, prefix_len, prefix_skip));
1320 TRACE(("charset = %p\n", charset));
1321
Serhiy Storchaka66dc4642015-06-21 14:06:55 +03001322 if (prefix_len == 1) {
1323 /* pattern starts with a literal character */
1324 SRE_CHAR c = (SRE_CHAR) prefix[0];
1325#if SIZEOF_SRE_CHAR < 4
1326 if ((SRE_CODE) c != prefix[0])
1327 return 0; /* literal can't match: doesn't fit in char width */
1328#endif
1329 end = (SRE_CHAR *)state->end;
1330 while (ptr < end) {
1331 while (*ptr != c) {
1332 if (++ptr >= end)
1333 return 0;
1334 }
1335 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1336 state->start = ptr;
1337 state->ptr = ptr + prefix_skip;
1338 if (flags & SRE_INFO_LITERAL)
1339 return 1; /* we got all of it */
1340 status = SRE(match)(state, pattern + 2*prefix_skip, 0);
1341 if (status != 0)
1342 return status;
1343 ++ptr;
1344 }
1345 return 0;
1346 }
1347
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001348 if (prefix_len > 1) {
1349 /* pattern starts with a known prefix. use the overlap
1350 table to skip forward as fast as we possibly can */
1351 Py_ssize_t i = 0;
1352
1353 end = (SRE_CHAR *)state->end;
1354 if (prefix_len > end - ptr)
1355 return 0;
1356#if SIZEOF_SRE_CHAR < 4
1357 for (i = 0; i < prefix_len; i++)
1358 if ((SRE_CODE)(SRE_CHAR) prefix[i] != prefix[i])
1359 return 0; /* literal can't match: doesn't fit in char width */
1360#endif
1361 while (ptr < end) {
1362 SRE_CHAR c = (SRE_CHAR) prefix[0];
1363 while (*ptr++ != c) {
1364 if (ptr >= end)
1365 return 0;
1366 }
1367 if (ptr >= end)
1368 return 0;
1369
1370 i = 1;
1371 do {
1372 if (*ptr == (SRE_CHAR) prefix[i]) {
1373 if (++i != prefix_len) {
1374 if (++ptr >= end)
1375 return 0;
1376 continue;
1377 }
1378 /* found a potential match */
1379 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1380 state->start = ptr - (prefix_len - 1);
1381 state->ptr = ptr - (prefix_len - prefix_skip - 1);
1382 if (flags & SRE_INFO_LITERAL)
1383 return 1; /* we got all of it */
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001384 status = SRE(match)(state, pattern + 2*prefix_skip, 0);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001385 if (status != 0)
1386 return status;
1387 /* close but no cigar -- try again */
1388 if (++ptr >= end)
1389 return 0;
1390 }
1391 i = overlap[i];
1392 } while (i != 0);
1393 }
1394 return 0;
1395 }
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001396
Serhiy Storchaka66dc4642015-06-21 14:06:55 +03001397 if (charset) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001398 /* pattern starts with a character from a known set */
1399 end = (SRE_CHAR *)state->end;
1400 for (;;) {
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +02001401 while (ptr < end && !SRE(charset)(state, charset, *ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001402 ptr++;
1403 if (ptr >= end)
1404 return 0;
1405 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1406 state->start = ptr;
1407 state->ptr = ptr;
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001408 status = SRE(match)(state, pattern, 0);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001409 if (status != 0)
1410 break;
1411 ptr++;
1412 }
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001413 } else {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001414 /* general case */
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001415 assert(ptr <= end);
1416 while (1) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001417 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001418 state->start = state->ptr = ptr;
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001419 status = SRE(match)(state, pattern, 0);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001420 if (status != 0 || ptr >= end)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001421 break;
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001422 ptr++;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001423 }
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001424 }
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001425
1426 return status;
1427}
1428
1429#undef SRE_CHAR
1430#undef SIZEOF_SRE_CHAR
1431#undef SRE
1432
1433/* vim:ts=4:sw=4:et
1434*/