blob: e13b90e8bc0eb886ebbece20505091d233b6f16d [file] [log] [blame]
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001/*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
7 *
8 * See the _sre.c file for information on usage and redistribution.
9 */
10
11/* String matching engine */
12
13/* This file is included three times, with different character settings */
14
15LOCAL(int)
16SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
17{
18 /* check if pointer is at given position */
19
20 Py_ssize_t thisp, thatp;
21
22 switch (at) {
23
24 case SRE_AT_BEGINNING:
25 case SRE_AT_BEGINNING_STRING:
26 return ((void*) ptr == state->beginning);
27
28 case SRE_AT_BEGINNING_LINE:
29 return ((void*) ptr == state->beginning ||
30 SRE_IS_LINEBREAK((int) ptr[-1]));
31
32 case SRE_AT_END:
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +030033 return (((SRE_CHAR *)state->end - ptr == 1 &&
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +030034 SRE_IS_LINEBREAK((int) ptr[0])) ||
35 ((void*) ptr == state->end));
36
37 case SRE_AT_END_LINE:
38 return ((void*) ptr == state->end ||
39 SRE_IS_LINEBREAK((int) ptr[0]));
40
41 case SRE_AT_END_STRING:
42 return ((void*) ptr == state->end);
43
44 case SRE_AT_BOUNDARY:
45 if (state->beginning == state->end)
46 return 0;
47 thatp = ((void*) ptr > state->beginning) ?
48 SRE_IS_WORD((int) ptr[-1]) : 0;
49 thisp = ((void*) ptr < state->end) ?
50 SRE_IS_WORD((int) ptr[0]) : 0;
51 return thisp != thatp;
52
53 case SRE_AT_NON_BOUNDARY:
54 if (state->beginning == state->end)
55 return 0;
56 thatp = ((void*) ptr > state->beginning) ?
57 SRE_IS_WORD((int) ptr[-1]) : 0;
58 thisp = ((void*) ptr < state->end) ?
59 SRE_IS_WORD((int) ptr[0]) : 0;
60 return thisp == thatp;
61
62 case SRE_AT_LOC_BOUNDARY:
63 if (state->beginning == state->end)
64 return 0;
65 thatp = ((void*) ptr > state->beginning) ?
66 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
67 thisp = ((void*) ptr < state->end) ?
68 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
69 return thisp != thatp;
70
71 case SRE_AT_LOC_NON_BOUNDARY:
72 if (state->beginning == state->end)
73 return 0;
74 thatp = ((void*) ptr > state->beginning) ?
75 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
76 thisp = ((void*) ptr < state->end) ?
77 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
78 return thisp == thatp;
79
80 case SRE_AT_UNI_BOUNDARY:
81 if (state->beginning == state->end)
82 return 0;
83 thatp = ((void*) ptr > state->beginning) ?
84 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
85 thisp = ((void*) ptr < state->end) ?
86 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
87 return thisp != thatp;
88
89 case SRE_AT_UNI_NON_BOUNDARY:
90 if (state->beginning == state->end)
91 return 0;
92 thatp = ((void*) ptr > state->beginning) ?
93 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
94 thisp = ((void*) ptr < state->end) ?
95 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
96 return thisp == thatp;
97
98 }
99
100 return 0;
101}
102
103LOCAL(int)
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200104SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300105{
106 /* check if character is a member of the given set */
107
108 int ok = 1;
109
110 for (;;) {
111 switch (*set++) {
112
113 case SRE_OP_FAILURE:
114 return !ok;
115
116 case SRE_OP_LITERAL:
117 /* <LITERAL> <code> */
118 if (ch == set[0])
119 return ok;
120 set++;
121 break;
122
123 case SRE_OP_CATEGORY:
124 /* <CATEGORY> <code> */
125 if (sre_category(set[0], (int) ch))
126 return ok;
127 set++;
128 break;
129
130 case SRE_OP_CHARSET:
131 /* <CHARSET> <bitmap> */
132 if (ch < 256 &&
133 (set[ch/SRE_CODE_BITS] & (1u << (ch & (SRE_CODE_BITS-1)))))
134 return ok;
135 set += 256/SRE_CODE_BITS;
136 break;
137
138 case SRE_OP_RANGE:
139 /* <RANGE> <lower> <upper> */
140 if (set[0] <= ch && ch <= set[1])
141 return ok;
142 set += 2;
143 break;
144
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300145 case SRE_OP_RANGE_UNI_IGNORE:
146 /* <RANGE_UNI_IGNORE> <lower> <upper> */
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200147 {
148 SRE_CODE uch;
149 /* ch is already lower cased */
150 if (set[0] <= ch && ch <= set[1])
151 return ok;
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300152 uch = sre_upper_unicode(ch);
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200153 if (set[0] <= uch && uch <= set[1])
154 return ok;
155 set += 2;
156 break;
157 }
158
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300159 case SRE_OP_NEGATE:
160 ok = !ok;
161 break;
162
163 case SRE_OP_BIGCHARSET:
164 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
165 {
166 Py_ssize_t count, block;
167 count = *(set++);
168
169 if (ch < 0x10000u)
170 block = ((unsigned char*)set)[ch >> 8];
171 else
172 block = -1;
173 set += 256/sizeof(SRE_CODE);
174 if (block >=0 &&
175 (set[(block * 256 + (ch & 255))/SRE_CODE_BITS] &
176 (1u << (ch & (SRE_CODE_BITS-1)))))
177 return ok;
178 set += count * (256/SRE_CODE_BITS);
179 break;
180 }
181
182 default:
183 /* internal error -- there's not much we can do about it
184 here, so let's just pretend it didn't match... */
185 return 0;
186 }
187 }
188}
189
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300190LOCAL(int)
191SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
192{
193 SRE_CODE lo, up;
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300194 lo = sre_lower_locale(ch);
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300195 if (SRE(charset)(state, set, lo))
196 return 1;
197
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300198 up = sre_upper_locale(ch);
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300199 return up != lo && SRE(charset)(state, set, up);
200}
201
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300202LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300203
204LOCAL(Py_ssize_t)
205SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
206{
207 SRE_CODE chr;
208 SRE_CHAR c;
209 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
210 SRE_CHAR* end = (SRE_CHAR *)state->end;
211 Py_ssize_t i;
212
213 /* adjust end */
214 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
215 end = ptr + maxcount;
216
217 switch (pattern[0]) {
218
219 case SRE_OP_IN:
220 /* repeated set */
221 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200222 while (ptr < end && SRE(charset)(state, pattern + 2, *ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300223 ptr++;
224 break;
225
226 case SRE_OP_ANY:
227 /* repeated dot wildcard. */
228 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
229 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
230 ptr++;
231 break;
232
233 case SRE_OP_ANY_ALL:
234 /* repeated dot wildcard. skip to the end of the target
235 string, and backtrack from there */
236 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
237 ptr = end;
238 break;
239
240 case SRE_OP_LITERAL:
241 /* repeated literal */
242 chr = pattern[1];
243 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
244 c = (SRE_CHAR) chr;
245#if SIZEOF_SRE_CHAR < 4
246 if ((SRE_CODE) c != chr)
247 ; /* literal can't match: doesn't fit in char width */
248 else
249#endif
250 while (ptr < end && *ptr == c)
251 ptr++;
252 break;
253
254 case SRE_OP_LITERAL_IGNORE:
255 /* repeated literal */
256 chr = pattern[1];
257 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300258 while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr)
259 ptr++;
260 break;
261
262 case SRE_OP_LITERAL_UNI_IGNORE:
263 /* repeated literal */
264 chr = pattern[1];
265 TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
266 while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300267 ptr++;
268 break;
269
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300270 case SRE_OP_LITERAL_LOC_IGNORE:
271 /* repeated literal */
272 chr = pattern[1];
273 TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300274 while (ptr < end && char_loc_ignore(chr, *ptr))
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300275 ptr++;
276 break;
277
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300278 case SRE_OP_NOT_LITERAL:
279 /* repeated non-literal */
280 chr = pattern[1];
281 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
282 c = (SRE_CHAR) chr;
283#if SIZEOF_SRE_CHAR < 4
284 if ((SRE_CODE) c != chr)
285 ptr = end; /* literal can't match: doesn't fit in char width */
286 else
287#endif
288 while (ptr < end && *ptr != c)
289 ptr++;
290 break;
291
292 case SRE_OP_NOT_LITERAL_IGNORE:
293 /* repeated non-literal */
294 chr = pattern[1];
295 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300296 while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr)
297 ptr++;
298 break;
299
300 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
301 /* repeated non-literal */
302 chr = pattern[1];
303 TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
304 while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300305 ptr++;
306 break;
307
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300308 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
309 /* repeated non-literal */
310 chr = pattern[1];
311 TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300312 while (ptr < end && !char_loc_ignore(chr, *ptr))
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300313 ptr++;
314 break;
315
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300316 default:
317 /* repeated single character pattern */
318 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
319 while ((SRE_CHAR*) state->ptr < end) {
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300320 i = SRE(match)(state, pattern, 0);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300321 if (i < 0)
322 return i;
323 if (!i)
324 break;
325 }
326 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
327 (SRE_CHAR*) state->ptr - ptr));
328 return (SRE_CHAR*) state->ptr - ptr;
329 }
330
331 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
332 ptr - (SRE_CHAR*) state->ptr));
333 return ptr - (SRE_CHAR*) state->ptr;
334}
335
336#if 0 /* not used in this release */
337LOCAL(int)
338SRE(info)(SRE_STATE* state, SRE_CODE* pattern)
339{
340 /* check if an SRE_OP_INFO block matches at the current position.
341 returns the number of SRE_CODE objects to skip if successful, 0
342 if no match */
343
344 SRE_CHAR* end = (SRE_CHAR*) state->end;
345 SRE_CHAR* ptr = (SRE_CHAR*) state->ptr;
346 Py_ssize_t i;
347
348 /* check minimal length */
349 if (pattern[3] && end - ptr < pattern[3])
350 return 0;
351
352 /* check known prefix */
353 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
354 /* <length> <skip> <prefix data> <overlap data> */
355 for (i = 0; i < pattern[5]; i++)
356 if ((SRE_CODE) ptr[i] != pattern[7 + i])
357 return 0;
358 return pattern[0] + 2 * pattern[6];
359 }
360 return pattern[0];
361}
362#endif
363
364/* The macros below should be used to protect recursive SRE(match)()
365 * calls that *failed* and do *not* return immediately (IOW, those
366 * that will backtrack). Explaining:
367 *
368 * - Recursive SRE(match)() returned true: that's usually a success
369 * (besides atypical cases like ASSERT_NOT), therefore there's no
370 * reason to restore lastmark;
371 *
372 * - Recursive SRE(match)() returned false but the current SRE(match)()
373 * is returning to the caller: If the current SRE(match)() is the
374 * top function of the recursion, returning false will be a matching
375 * failure, and it doesn't matter where lastmark is pointing to.
376 * If it's *not* the top function, it will be a recursive SRE(match)()
377 * failure by itself, and the calling SRE(match)() will have to deal
378 * with the failure by the same rules explained here (it will restore
379 * lastmark by itself if necessary);
380 *
381 * - Recursive SRE(match)() returned false, and will continue the
382 * outside 'for' loop: must be protected when breaking, since the next
383 * OP could potentially depend on lastmark;
384 *
385 * - Recursive SRE(match)() returned false, and will be called again
386 * inside a local for/while loop: must be protected between each
387 * loop iteration, since the recursive SRE(match)() could do anything,
388 * and could potentially depend on lastmark.
389 *
390 * For more information, check the discussion at SF patch #712900.
391 */
392#define LASTMARK_SAVE() \
393 do { \
394 ctx->lastmark = state->lastmark; \
395 ctx->lastindex = state->lastindex; \
396 } while (0)
397#define LASTMARK_RESTORE() \
398 do { \
399 state->lastmark = ctx->lastmark; \
400 state->lastindex = ctx->lastindex; \
401 } while (0)
402
403#define RETURN_ERROR(i) do { return i; } while(0)
404#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
405#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
406
407#define RETURN_ON_ERROR(i) \
408 do { if (i < 0) RETURN_ERROR(i); } while (0)
409#define RETURN_ON_SUCCESS(i) \
410 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
411#define RETURN_ON_FAILURE(i) \
412 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
413
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300414#define DATA_STACK_ALLOC(state, type, ptr) \
415do { \
416 alloc_pos = state->data_stack_base; \
417 TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
418 "(%" PY_FORMAT_SIZE_T "d)\n", \
Serhiy Storchaka12b25382015-11-05 17:43:42 +0200419 Py_STRINGIFY(type), alloc_pos, sizeof(type))); \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300420 if (sizeof(type) > state->data_stack_size - alloc_pos) { \
421 int j = data_stack_grow(state, sizeof(type)); \
422 if (j < 0) return j; \
423 if (ctx_pos != -1) \
424 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \
425 } \
426 ptr = (type*)(state->data_stack+alloc_pos); \
427 state->data_stack_base += sizeof(type); \
428} while (0)
429
430#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
431do { \
Serhiy Storchaka12b25382015-11-05 17:43:42 +0200432 TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", Py_STRINGIFY(type), pos)); \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300433 ptr = (type*)(state->data_stack+pos); \
434} while (0)
435
436#define DATA_STACK_PUSH(state, data, size) \
437do { \
438 TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
439 "(%" PY_FORMAT_SIZE_T "d)\n", \
440 data, state->data_stack_base, size)); \
441 if (size > state->data_stack_size - state->data_stack_base) { \
442 int j = data_stack_grow(state, size); \
443 if (j < 0) return j; \
444 if (ctx_pos != -1) \
445 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \
446 } \
447 memcpy(state->data_stack+state->data_stack_base, data, size); \
448 state->data_stack_base += size; \
449} while (0)
450
451#define DATA_STACK_POP(state, data, size, discard) \
452do { \
453 TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
454 "(%" PY_FORMAT_SIZE_T "d)\n", \
455 data, state->data_stack_base-size, size)); \
456 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
457 if (discard) \
458 state->data_stack_base -= size; \
459} while (0)
460
461#define DATA_STACK_POP_DISCARD(state, size) \
462do { \
463 TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
464 "(%" PY_FORMAT_SIZE_T "d)\n", \
465 state->data_stack_base-size, size)); \
466 state->data_stack_base -= size; \
467} while(0)
468
469#define DATA_PUSH(x) \
470 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
471#define DATA_POP(x) \
472 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
473#define DATA_POP_DISCARD(x) \
474 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
475#define DATA_ALLOC(t,p) \
476 DATA_STACK_ALLOC(state, t, p)
477#define DATA_LOOKUP_AT(t,p,pos) \
478 DATA_STACK_LOOKUP_AT(state,t,p,pos)
479
480#define MARK_PUSH(lastmark) \
481 do if (lastmark > 0) { \
482 i = lastmark; /* ctx->lastmark may change if reallocated */ \
483 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
484 } while (0)
485#define MARK_POP(lastmark) \
486 do if (lastmark > 0) { \
487 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
488 } while (0)
489#define MARK_POP_KEEP(lastmark) \
490 do if (lastmark > 0) { \
491 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
492 } while (0)
493#define MARK_POP_DISCARD(lastmark) \
494 do if (lastmark > 0) { \
495 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
496 } while (0)
497
498#define JUMP_NONE 0
499#define JUMP_MAX_UNTIL_1 1
500#define JUMP_MAX_UNTIL_2 2
501#define JUMP_MAX_UNTIL_3 3
502#define JUMP_MIN_UNTIL_1 4
503#define JUMP_MIN_UNTIL_2 5
504#define JUMP_MIN_UNTIL_3 6
505#define JUMP_REPEAT 7
506#define JUMP_REPEAT_ONE_1 8
507#define JUMP_REPEAT_ONE_2 9
508#define JUMP_MIN_REPEAT_ONE 10
509#define JUMP_BRANCH 11
510#define JUMP_ASSERT 12
511#define JUMP_ASSERT_NOT 13
512
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200513#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, matchall) \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300514 DATA_ALLOC(SRE(match_context), nextctx); \
515 nextctx->last_ctx_pos = ctx_pos; \
516 nextctx->jump = jumpvalue; \
517 nextctx->pattern = nextpattern; \
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200518 nextctx->match_all = matchall; \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300519 ctx_pos = alloc_pos; \
520 ctx = nextctx; \
521 goto entrance; \
522 jumplabel: \
523 while (0) /* gcc doesn't like labels at end of scopes */ \
524
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200525#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
526 DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->match_all)
527
528#define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \
529 DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
530
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300531typedef struct {
532 Py_ssize_t last_ctx_pos;
533 Py_ssize_t jump;
534 SRE_CHAR* ptr;
535 SRE_CODE* pattern;
536 Py_ssize_t count;
537 Py_ssize_t lastmark;
538 Py_ssize_t lastindex;
539 union {
540 SRE_CODE chr;
541 SRE_REPEAT* rep;
542 } u;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200543 int match_all;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300544} SRE(match_context);
545
546/* check if string matches the given pattern. returns <0 for
547 error, 0 for failure, and 1 for success */
548LOCAL(Py_ssize_t)
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300549SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300550{
551 SRE_CHAR* end = (SRE_CHAR *)state->end;
552 Py_ssize_t alloc_pos, ctx_pos = -1;
553 Py_ssize_t i, ret = 0;
554 Py_ssize_t jump;
555 unsigned int sigcount=0;
556
557 SRE(match_context)* ctx;
558 SRE(match_context)* nextctx;
559
560 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
561
562 DATA_ALLOC(SRE(match_context), ctx);
563 ctx->last_ctx_pos = -1;
564 ctx->jump = JUMP_NONE;
565 ctx->pattern = pattern;
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300566 ctx->match_all = match_all;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300567 ctx_pos = alloc_pos;
568
569entrance:
570
571 ctx->ptr = (SRE_CHAR *)state->ptr;
572
573 if (ctx->pattern[0] == SRE_OP_INFO) {
574 /* optimization info block */
575 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Benjamin Petersonca470632016-09-06 13:47:26 -0700576 if (ctx->pattern[3] && (uintptr_t)(end - ctx->ptr) < ctx->pattern[3]) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300577 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
578 "need %" PY_FORMAT_SIZE_T "d)\n",
579 end - ctx->ptr, (Py_ssize_t) ctx->pattern[3]));
580 RETURN_FAILURE;
581 }
582 ctx->pattern += ctx->pattern[1] + 1;
583 }
584
585 for (;;) {
586 ++sigcount;
587 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
588 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
589
590 switch (*ctx->pattern++) {
591
592 case SRE_OP_MARK:
593 /* set mark */
594 /* <MARK> <gid> */
595 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
596 ctx->ptr, ctx->pattern[0]));
597 i = ctx->pattern[0];
598 if (i & 1)
599 state->lastindex = i/2 + 1;
600 if (i > state->lastmark) {
601 /* state->lastmark is the highest valid index in the
602 state->mark array. If it is increased by more than 1,
603 the intervening marks must be set to NULL to signal
604 that these marks have not been encountered. */
605 Py_ssize_t j = state->lastmark + 1;
606 while (j < i)
607 state->mark[j++] = NULL;
608 state->lastmark = i;
609 }
610 state->mark[i] = ctx->ptr;
611 ctx->pattern++;
612 break;
613
614 case SRE_OP_LITERAL:
615 /* match literal string */
616 /* <LITERAL> <code> */
617 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
618 ctx->ptr, *ctx->pattern));
619 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
620 RETURN_FAILURE;
621 ctx->pattern++;
622 ctx->ptr++;
623 break;
624
625 case SRE_OP_NOT_LITERAL:
626 /* match anything that is not literal character */
627 /* <NOT_LITERAL> <code> */
628 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
629 ctx->ptr, *ctx->pattern));
630 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
631 RETURN_FAILURE;
632 ctx->pattern++;
633 ctx->ptr++;
634 break;
635
636 case SRE_OP_SUCCESS:
637 /* end of pattern */
638 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200639 if (!ctx->match_all || ctx->ptr == state->end) {
640 state->ptr = ctx->ptr;
641 RETURN_SUCCESS;
642 }
643 RETURN_FAILURE;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300644
645 case SRE_OP_AT:
646 /* match at given position */
647 /* <AT> <code> */
648 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
649 if (!SRE(at)(state, ctx->ptr, *ctx->pattern))
650 RETURN_FAILURE;
651 ctx->pattern++;
652 break;
653
654 case SRE_OP_CATEGORY:
655 /* match at given category */
656 /* <CATEGORY> <code> */
657 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
658 ctx->ptr, *ctx->pattern));
659 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
660 RETURN_FAILURE;
661 ctx->pattern++;
662 ctx->ptr++;
663 break;
664
665 case SRE_OP_ANY:
666 /* match anything (except a newline) */
667 /* <ANY> */
668 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
669 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
670 RETURN_FAILURE;
671 ctx->ptr++;
672 break;
673
674 case SRE_OP_ANY_ALL:
675 /* match anything */
676 /* <ANY_ALL> */
677 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
678 if (ctx->ptr >= end)
679 RETURN_FAILURE;
680 ctx->ptr++;
681 break;
682
683 case SRE_OP_IN:
684 /* match set member (or non_member) */
685 /* <IN> <skip> <set> */
686 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200687 if (ctx->ptr >= end ||
688 !SRE(charset)(state, ctx->pattern + 1, *ctx->ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300689 RETURN_FAILURE;
690 ctx->pattern += ctx->pattern[0];
691 ctx->ptr++;
692 break;
693
694 case SRE_OP_LITERAL_IGNORE:
695 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
696 ctx->pattern, ctx->ptr, ctx->pattern[0]));
697 if (ctx->ptr >= end ||
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300698 sre_lower_ascii(*ctx->ptr) != *ctx->pattern)
699 RETURN_FAILURE;
700 ctx->pattern++;
701 ctx->ptr++;
702 break;
703
704 case SRE_OP_LITERAL_UNI_IGNORE:
705 TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n",
706 ctx->pattern, ctx->ptr, ctx->pattern[0]));
707 if (ctx->ptr >= end ||
708 sre_lower_unicode(*ctx->ptr) != *ctx->pattern)
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300709 RETURN_FAILURE;
710 ctx->pattern++;
711 ctx->ptr++;
712 break;
713
714 case SRE_OP_LITERAL_LOC_IGNORE:
715 TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
716 ctx->pattern, ctx->ptr, ctx->pattern[0]));
717 if (ctx->ptr >= end
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300718 || !char_loc_ignore(*ctx->pattern, *ctx->ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300719 RETURN_FAILURE;
720 ctx->pattern++;
721 ctx->ptr++;
722 break;
723
724 case SRE_OP_NOT_LITERAL_IGNORE:
725 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
726 ctx->pattern, ctx->ptr, *ctx->pattern));
727 if (ctx->ptr >= end ||
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300728 sre_lower_ascii(*ctx->ptr) == *ctx->pattern)
729 RETURN_FAILURE;
730 ctx->pattern++;
731 ctx->ptr++;
732 break;
733
734 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
735 TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n",
736 ctx->pattern, ctx->ptr, *ctx->pattern));
737 if (ctx->ptr >= end ||
738 sre_lower_unicode(*ctx->ptr) == *ctx->pattern)
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300739 RETURN_FAILURE;
740 ctx->pattern++;
741 ctx->ptr++;
742 break;
743
744 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
745 TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
746 ctx->pattern, ctx->ptr, *ctx->pattern));
747 if (ctx->ptr >= end
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300748 || char_loc_ignore(*ctx->pattern, *ctx->ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300749 RETURN_FAILURE;
750 ctx->pattern++;
751 ctx->ptr++;
752 break;
753
754 case SRE_OP_IN_IGNORE:
755 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
756 if (ctx->ptr >= end
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200757 || !SRE(charset)(state, ctx->pattern+1,
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300758 (SRE_CODE)sre_lower_ascii(*ctx->ptr)))
759 RETURN_FAILURE;
760 ctx->pattern += ctx->pattern[0];
761 ctx->ptr++;
762 break;
763
764 case SRE_OP_IN_UNI_IGNORE:
765 TRACE(("|%p|%p|IN_UNI_IGNORE\n", ctx->pattern, ctx->ptr));
766 if (ctx->ptr >= end
767 || !SRE(charset)(state, ctx->pattern+1,
768 (SRE_CODE)sre_lower_unicode(*ctx->ptr)))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300769 RETURN_FAILURE;
770 ctx->pattern += ctx->pattern[0];
771 ctx->ptr++;
772 break;
773
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300774 case SRE_OP_IN_LOC_IGNORE:
775 TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
776 if (ctx->ptr >= end
777 || !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
778 RETURN_FAILURE;
779 ctx->pattern += ctx->pattern[0];
780 ctx->ptr++;
781 break;
782
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300783 case SRE_OP_JUMP:
784 case SRE_OP_INFO:
785 /* jump forward */
786 /* <JUMP> <offset> */
787 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
788 ctx->ptr, ctx->pattern[0]));
789 ctx->pattern += ctx->pattern[0];
790 break;
791
792 case SRE_OP_BRANCH:
793 /* alternation */
794 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
795 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
796 LASTMARK_SAVE();
797 ctx->u.rep = state->repeat;
798 if (ctx->u.rep)
799 MARK_PUSH(ctx->lastmark);
800 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
801 if (ctx->pattern[1] == SRE_OP_LITERAL &&
802 (ctx->ptr >= end ||
803 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
804 continue;
805 if (ctx->pattern[1] == SRE_OP_IN &&
806 (ctx->ptr >= end ||
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200807 !SRE(charset)(state, ctx->pattern + 3,
808 (SRE_CODE) *ctx->ptr)))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300809 continue;
810 state->ptr = ctx->ptr;
811 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
812 if (ret) {
813 if (ctx->u.rep)
814 MARK_POP_DISCARD(ctx->lastmark);
815 RETURN_ON_ERROR(ret);
816 RETURN_SUCCESS;
817 }
818 if (ctx->u.rep)
819 MARK_POP_KEEP(ctx->lastmark);
820 LASTMARK_RESTORE();
821 }
822 if (ctx->u.rep)
823 MARK_POP_DISCARD(ctx->lastmark);
824 RETURN_FAILURE;
825
826 case SRE_OP_REPEAT_ONE:
827 /* match repeated sequence (maximizing regexp) */
828
829 /* this operator only works if the repeated item is
830 exactly one character wide, and we're not already
831 collecting backtracking points. for other cases,
832 use the MAX_REPEAT operator */
833
834 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
835
836 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
837 ctx->pattern[1], ctx->pattern[2]));
838
839 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
840 RETURN_FAILURE; /* cannot match */
841
842 state->ptr = ctx->ptr;
843
844 ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[2]);
845 RETURN_ON_ERROR(ret);
846 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
847 ctx->count = ret;
848 ctx->ptr += ctx->count;
849
850 /* when we arrive here, count contains the number of
851 matches, and ctx->ptr points to the tail of the target
852 string. check if the rest of the pattern matches,
853 and backtrack if not. */
854
855 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
856 RETURN_FAILURE;
857
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200858 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300859 ctx->ptr == state->end) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300860 /* tail is empty. we're finished */
861 state->ptr = ctx->ptr;
862 RETURN_SUCCESS;
863 }
864
865 LASTMARK_SAVE();
866
867 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
868 /* tail starts with a literal. skip positions where
869 the rest of the pattern cannot possibly match */
870 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
871 for (;;) {
872 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
873 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
874 ctx->ptr--;
875 ctx->count--;
876 }
877 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
878 break;
879 state->ptr = ctx->ptr;
880 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
881 ctx->pattern+ctx->pattern[0]);
882 if (ret) {
883 RETURN_ON_ERROR(ret);
884 RETURN_SUCCESS;
885 }
886
887 LASTMARK_RESTORE();
888
889 ctx->ptr--;
890 ctx->count--;
891 }
892
893 } else {
894 /* general case */
895 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
896 state->ptr = ctx->ptr;
897 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
898 ctx->pattern+ctx->pattern[0]);
899 if (ret) {
900 RETURN_ON_ERROR(ret);
901 RETURN_SUCCESS;
902 }
903 ctx->ptr--;
904 ctx->count--;
905 LASTMARK_RESTORE();
906 }
907 }
908 RETURN_FAILURE;
909
910 case SRE_OP_MIN_REPEAT_ONE:
911 /* match repeated sequence (minimizing regexp) */
912
913 /* this operator only works if the repeated item is
914 exactly one character wide, and we're not already
915 collecting backtracking points. for other cases,
916 use the MIN_REPEAT operator */
917
918 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
919
920 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
921 ctx->pattern[1], ctx->pattern[2]));
922
923 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
924 RETURN_FAILURE; /* cannot match */
925
926 state->ptr = ctx->ptr;
927
928 if (ctx->pattern[1] == 0)
929 ctx->count = 0;
930 else {
931 /* count using pattern min as the maximum */
932 ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[1]);
933 RETURN_ON_ERROR(ret);
934 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
935 if (ret < (Py_ssize_t) ctx->pattern[1])
936 /* didn't match minimum number of times */
937 RETURN_FAILURE;
938 /* advance past minimum matches of repeat */
939 ctx->count = ret;
940 ctx->ptr += ctx->count;
941 }
942
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200943 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300944 (!match_all || ctx->ptr == state->end)) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300945 /* tail is empty. we're finished */
946 state->ptr = ctx->ptr;
947 RETURN_SUCCESS;
948
949 } else {
950 /* general case */
951 LASTMARK_SAVE();
952 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
953 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
954 state->ptr = ctx->ptr;
955 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
956 ctx->pattern+ctx->pattern[0]);
957 if (ret) {
958 RETURN_ON_ERROR(ret);
959 RETURN_SUCCESS;
960 }
961 state->ptr = ctx->ptr;
962 ret = SRE(count)(state, ctx->pattern+3, 1);
963 RETURN_ON_ERROR(ret);
964 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
965 if (ret == 0)
966 break;
967 assert(ret == 1);
968 ctx->ptr++;
969 ctx->count++;
970 LASTMARK_RESTORE();
971 }
972 }
973 RETURN_FAILURE;
974
975 case SRE_OP_REPEAT:
976 /* create repeat context. all the hard work is done
977 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
978 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
979 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
980 ctx->pattern[1], ctx->pattern[2]));
981
982 /* install new repeat context */
983 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
984 if (!ctx->u.rep) {
985 PyErr_NoMemory();
986 RETURN_FAILURE;
987 }
988 ctx->u.rep->count = -1;
989 ctx->u.rep->pattern = ctx->pattern;
990 ctx->u.rep->prev = state->repeat;
991 ctx->u.rep->last_ptr = NULL;
992 state->repeat = ctx->u.rep;
993
994 state->ptr = ctx->ptr;
995 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
996 state->repeat = ctx->u.rep->prev;
997 PyObject_FREE(ctx->u.rep);
998
999 if (ret) {
1000 RETURN_ON_ERROR(ret);
1001 RETURN_SUCCESS;
1002 }
1003 RETURN_FAILURE;
1004
1005 case SRE_OP_MAX_UNTIL:
1006 /* maximizing repeat */
1007 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1008
1009 /* FIXME: we probably need to deal with zero-width
1010 matches in here... */
1011
1012 ctx->u.rep = state->repeat;
1013 if (!ctx->u.rep)
1014 RETURN_ERROR(SRE_ERROR_STATE);
1015
1016 state->ptr = ctx->ptr;
1017
1018 ctx->count = ctx->u.rep->count+1;
1019
1020 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1021 ctx->ptr, ctx->count));
1022
1023 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1024 /* not enough matches */
1025 ctx->u.rep->count = ctx->count;
1026 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1027 ctx->u.rep->pattern+3);
1028 if (ret) {
1029 RETURN_ON_ERROR(ret);
1030 RETURN_SUCCESS;
1031 }
1032 ctx->u.rep->count = ctx->count-1;
1033 state->ptr = ctx->ptr;
1034 RETURN_FAILURE;
1035 }
1036
1037 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
1038 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
1039 state->ptr != ctx->u.rep->last_ptr) {
1040 /* we may have enough matches, but if we can
1041 match another item, do so */
1042 ctx->u.rep->count = ctx->count;
1043 LASTMARK_SAVE();
1044 MARK_PUSH(ctx->lastmark);
1045 /* zero-width match protection */
1046 DATA_PUSH(&ctx->u.rep->last_ptr);
1047 ctx->u.rep->last_ptr = state->ptr;
1048 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1049 ctx->u.rep->pattern+3);
1050 DATA_POP(&ctx->u.rep->last_ptr);
1051 if (ret) {
1052 MARK_POP_DISCARD(ctx->lastmark);
1053 RETURN_ON_ERROR(ret);
1054 RETURN_SUCCESS;
1055 }
1056 MARK_POP(ctx->lastmark);
1057 LASTMARK_RESTORE();
1058 ctx->u.rep->count = ctx->count-1;
1059 state->ptr = ctx->ptr;
1060 }
1061
1062 /* cannot match more repeated items here. make sure the
1063 tail matches */
1064 state->repeat = ctx->u.rep->prev;
1065 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1066 RETURN_ON_SUCCESS(ret);
1067 state->repeat = ctx->u.rep;
1068 state->ptr = ctx->ptr;
1069 RETURN_FAILURE;
1070
1071 case SRE_OP_MIN_UNTIL:
1072 /* minimizing repeat */
1073 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1074
1075 ctx->u.rep = state->repeat;
1076 if (!ctx->u.rep)
1077 RETURN_ERROR(SRE_ERROR_STATE);
1078
1079 state->ptr = ctx->ptr;
1080
1081 ctx->count = ctx->u.rep->count+1;
1082
1083 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
1084 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1085
1086 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1087 /* not enough matches */
1088 ctx->u.rep->count = ctx->count;
1089 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1090 ctx->u.rep->pattern+3);
1091 if (ret) {
1092 RETURN_ON_ERROR(ret);
1093 RETURN_SUCCESS;
1094 }
1095 ctx->u.rep->count = ctx->count-1;
1096 state->ptr = ctx->ptr;
1097 RETURN_FAILURE;
1098 }
1099
1100 LASTMARK_SAVE();
1101
1102 /* see if the tail matches */
1103 state->repeat = ctx->u.rep->prev;
1104 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1105 if (ret) {
1106 RETURN_ON_ERROR(ret);
1107 RETURN_SUCCESS;
1108 }
1109
1110 state->repeat = ctx->u.rep;
1111 state->ptr = ctx->ptr;
1112
1113 LASTMARK_RESTORE();
1114
1115 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
1116 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1117 state->ptr == ctx->u.rep->last_ptr)
1118 RETURN_FAILURE;
1119
1120 ctx->u.rep->count = ctx->count;
1121 /* zero-width match protection */
1122 DATA_PUSH(&ctx->u.rep->last_ptr);
1123 ctx->u.rep->last_ptr = state->ptr;
1124 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1125 ctx->u.rep->pattern+3);
1126 DATA_POP(&ctx->u.rep->last_ptr);
1127 if (ret) {
1128 RETURN_ON_ERROR(ret);
1129 RETURN_SUCCESS;
1130 }
1131 ctx->u.rep->count = ctx->count-1;
1132 state->ptr = ctx->ptr;
1133 RETURN_FAILURE;
1134
1135 case SRE_OP_GROUPREF:
1136 /* match backreference */
1137 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1138 ctx->ptr, ctx->pattern[0]));
1139 i = ctx->pattern[0];
1140 {
1141 Py_ssize_t groupref = i+i;
1142 if (groupref >= state->lastmark) {
1143 RETURN_FAILURE;
1144 } else {
1145 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1146 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1147 if (!p || !e || e < p)
1148 RETURN_FAILURE;
1149 while (p < e) {
1150 if (ctx->ptr >= end || *ctx->ptr != *p)
1151 RETURN_FAILURE;
1152 p++;
1153 ctx->ptr++;
1154 }
1155 }
1156 }
1157 ctx->pattern++;
1158 break;
1159
1160 case SRE_OP_GROUPREF_IGNORE:
1161 /* match backreference */
1162 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1163 ctx->ptr, ctx->pattern[0]));
1164 i = ctx->pattern[0];
1165 {
1166 Py_ssize_t groupref = i+i;
1167 if (groupref >= state->lastmark) {
1168 RETURN_FAILURE;
1169 } else {
1170 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1171 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1172 if (!p || !e || e < p)
1173 RETURN_FAILURE;
1174 while (p < e) {
1175 if (ctx->ptr >= end ||
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001176 sre_lower_ascii(*ctx->ptr) != sre_lower_ascii(*p))
1177 RETURN_FAILURE;
1178 p++;
1179 ctx->ptr++;
1180 }
1181 }
1182 }
1183 ctx->pattern++;
1184 break;
1185
1186 case SRE_OP_GROUPREF_UNI_IGNORE:
1187 /* match backreference */
1188 TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", ctx->pattern,
1189 ctx->ptr, ctx->pattern[0]));
1190 i = ctx->pattern[0];
1191 {
1192 Py_ssize_t groupref = i+i;
1193 if (groupref >= state->lastmark) {
1194 RETURN_FAILURE;
1195 } else {
1196 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1197 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1198 if (!p || !e || e < p)
1199 RETURN_FAILURE;
1200 while (p < e) {
1201 if (ctx->ptr >= end ||
1202 sre_lower_unicode(*ctx->ptr) != sre_lower_unicode(*p))
1203 RETURN_FAILURE;
1204 p++;
1205 ctx->ptr++;
1206 }
1207 }
1208 }
1209 ctx->pattern++;
1210 break;
1211
1212 case SRE_OP_GROUPREF_LOC_IGNORE:
1213 /* match backreference */
1214 TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", ctx->pattern,
1215 ctx->ptr, ctx->pattern[0]));
1216 i = ctx->pattern[0];
1217 {
1218 Py_ssize_t groupref = i+i;
1219 if (groupref >= state->lastmark) {
1220 RETURN_FAILURE;
1221 } else {
1222 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1223 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1224 if (!p || !e || e < p)
1225 RETURN_FAILURE;
1226 while (p < e) {
1227 if (ctx->ptr >= end ||
1228 sre_lower_locale(*ctx->ptr) != sre_lower_locale(*p))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001229 RETURN_FAILURE;
1230 p++;
1231 ctx->ptr++;
1232 }
1233 }
1234 }
1235 ctx->pattern++;
1236 break;
1237
1238 case SRE_OP_GROUPREF_EXISTS:
1239 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1240 ctx->ptr, ctx->pattern[0]));
1241 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1242 i = ctx->pattern[0];
1243 {
1244 Py_ssize_t groupref = i+i;
1245 if (groupref >= state->lastmark) {
1246 ctx->pattern += ctx->pattern[1];
1247 break;
1248 } else {
1249 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1250 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1251 if (!p || !e || e < p) {
1252 ctx->pattern += ctx->pattern[1];
1253 break;
1254 }
1255 }
1256 }
1257 ctx->pattern += 2;
1258 break;
1259
1260 case SRE_OP_ASSERT:
1261 /* assert subpattern */
1262 /* <ASSERT> <skip> <back> <pattern> */
1263 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1264 ctx->ptr, ctx->pattern[1]));
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001265 if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1])
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001266 RETURN_FAILURE;
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001267 state->ptr = ctx->ptr - ctx->pattern[1];
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001268 DO_JUMP0(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001269 RETURN_ON_FAILURE(ret);
1270 ctx->pattern += ctx->pattern[0];
1271 break;
1272
1273 case SRE_OP_ASSERT_NOT:
1274 /* assert not subpattern */
1275 /* <ASSERT_NOT> <skip> <back> <pattern> */
1276 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1277 ctx->ptr, ctx->pattern[1]));
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001278 if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
1279 state->ptr = ctx->ptr - ctx->pattern[1];
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001280 DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001281 if (ret) {
1282 RETURN_ON_ERROR(ret);
1283 RETURN_FAILURE;
1284 }
1285 }
1286 ctx->pattern += ctx->pattern[0];
1287 break;
1288
1289 case SRE_OP_FAILURE:
1290 /* immediate failure */
1291 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1292 RETURN_FAILURE;
1293
1294 default:
1295 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1296 ctx->pattern[-1]));
1297 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1298 }
1299 }
1300
1301exit:
1302 ctx_pos = ctx->last_ctx_pos;
1303 jump = ctx->jump;
1304 DATA_POP_DISCARD(ctx);
1305 if (ctx_pos == -1)
1306 return ret;
1307 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
1308
1309 switch (jump) {
1310 case JUMP_MAX_UNTIL_2:
1311 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1312 goto jump_max_until_2;
1313 case JUMP_MAX_UNTIL_3:
1314 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1315 goto jump_max_until_3;
1316 case JUMP_MIN_UNTIL_2:
1317 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1318 goto jump_min_until_2;
1319 case JUMP_MIN_UNTIL_3:
1320 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1321 goto jump_min_until_3;
1322 case JUMP_BRANCH:
1323 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1324 goto jump_branch;
1325 case JUMP_MAX_UNTIL_1:
1326 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1327 goto jump_max_until_1;
1328 case JUMP_MIN_UNTIL_1:
1329 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1330 goto jump_min_until_1;
1331 case JUMP_REPEAT:
1332 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1333 goto jump_repeat;
1334 case JUMP_REPEAT_ONE_1:
1335 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1336 goto jump_repeat_one_1;
1337 case JUMP_REPEAT_ONE_2:
1338 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1339 goto jump_repeat_one_2;
1340 case JUMP_MIN_REPEAT_ONE:
1341 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1342 goto jump_min_repeat_one;
1343 case JUMP_ASSERT:
1344 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1345 goto jump_assert;
1346 case JUMP_ASSERT_NOT:
1347 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1348 goto jump_assert_not;
1349 case JUMP_NONE:
1350 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1351 ctx->ptr, ret));
1352 break;
1353 }
1354
1355 return ret; /* should never get here */
1356}
1357
1358LOCAL(Py_ssize_t)
1359SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
1360{
1361 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1362 SRE_CHAR* end = (SRE_CHAR *)state->end;
1363 Py_ssize_t status = 0;
1364 Py_ssize_t prefix_len = 0;
1365 Py_ssize_t prefix_skip = 0;
1366 SRE_CODE* prefix = NULL;
1367 SRE_CODE* charset = NULL;
1368 SRE_CODE* overlap = NULL;
1369 int flags = 0;
1370
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001371 if (ptr > end)
1372 return 0;
1373
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001374 if (pattern[0] == SRE_OP_INFO) {
1375 /* optimization info block */
1376 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1377
1378 flags = pattern[2];
1379
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001380 if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1381 TRACE(("reject (got %u chars, need %u)\n",
1382 (unsigned int)(end - ptr), pattern[3]));
1383 return 0;
1384 }
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001385 if (pattern[3] > 1) {
1386 /* adjust end point (but make sure we leave at least one
1387 character in there, so literal search will work) */
1388 end -= pattern[3] - 1;
1389 if (end <= ptr)
1390 end = ptr;
1391 }
1392
1393 if (flags & SRE_INFO_PREFIX) {
1394 /* pattern starts with a known prefix */
1395 /* <length> <skip> <prefix data> <overlap data> */
1396 prefix_len = pattern[5];
1397 prefix_skip = pattern[6];
1398 prefix = pattern + 7;
1399 overlap = prefix + prefix_len - 1;
1400 } else if (flags & SRE_INFO_CHARSET)
1401 /* pattern starts with a character from a known set */
1402 /* <charset> */
1403 charset = pattern + 5;
1404
1405 pattern += 1 + pattern[1];
1406 }
1407
1408 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1409 prefix, prefix_len, prefix_skip));
1410 TRACE(("charset = %p\n", charset));
1411
Serhiy Storchaka66dc4642015-06-21 14:06:55 +03001412 if (prefix_len == 1) {
1413 /* pattern starts with a literal character */
1414 SRE_CHAR c = (SRE_CHAR) prefix[0];
1415#if SIZEOF_SRE_CHAR < 4
1416 if ((SRE_CODE) c != prefix[0])
1417 return 0; /* literal can't match: doesn't fit in char width */
1418#endif
1419 end = (SRE_CHAR *)state->end;
1420 while (ptr < end) {
1421 while (*ptr != c) {
1422 if (++ptr >= end)
1423 return 0;
1424 }
1425 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1426 state->start = ptr;
1427 state->ptr = ptr + prefix_skip;
1428 if (flags & SRE_INFO_LITERAL)
1429 return 1; /* we got all of it */
1430 status = SRE(match)(state, pattern + 2*prefix_skip, 0);
1431 if (status != 0)
1432 return status;
1433 ++ptr;
1434 }
1435 return 0;
1436 }
1437
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001438 if (prefix_len > 1) {
1439 /* pattern starts with a known prefix. use the overlap
1440 table to skip forward as fast as we possibly can */
1441 Py_ssize_t i = 0;
1442
1443 end = (SRE_CHAR *)state->end;
1444 if (prefix_len > end - ptr)
1445 return 0;
1446#if SIZEOF_SRE_CHAR < 4
1447 for (i = 0; i < prefix_len; i++)
1448 if ((SRE_CODE)(SRE_CHAR) prefix[i] != prefix[i])
1449 return 0; /* literal can't match: doesn't fit in char width */
1450#endif
1451 while (ptr < end) {
1452 SRE_CHAR c = (SRE_CHAR) prefix[0];
1453 while (*ptr++ != c) {
1454 if (ptr >= end)
1455 return 0;
1456 }
1457 if (ptr >= end)
1458 return 0;
1459
1460 i = 1;
1461 do {
1462 if (*ptr == (SRE_CHAR) prefix[i]) {
1463 if (++i != prefix_len) {
1464 if (++ptr >= end)
1465 return 0;
1466 continue;
1467 }
1468 /* found a potential match */
1469 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1470 state->start = ptr - (prefix_len - 1);
1471 state->ptr = ptr - (prefix_len - prefix_skip - 1);
1472 if (flags & SRE_INFO_LITERAL)
1473 return 1; /* we got all of it */
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001474 status = SRE(match)(state, pattern + 2*prefix_skip, 0);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001475 if (status != 0)
1476 return status;
1477 /* close but no cigar -- try again */
1478 if (++ptr >= end)
1479 return 0;
1480 }
1481 i = overlap[i];
1482 } while (i != 0);
1483 }
1484 return 0;
1485 }
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001486
Serhiy Storchaka66dc4642015-06-21 14:06:55 +03001487 if (charset) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001488 /* pattern starts with a character from a known set */
1489 end = (SRE_CHAR *)state->end;
1490 for (;;) {
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +02001491 while (ptr < end && !SRE(charset)(state, charset, *ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001492 ptr++;
1493 if (ptr >= end)
1494 return 0;
1495 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1496 state->start = ptr;
1497 state->ptr = ptr;
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001498 status = SRE(match)(state, pattern, 0);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001499 if (status != 0)
1500 break;
1501 ptr++;
1502 }
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001503 } else {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001504 /* general case */
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001505 assert(ptr <= end);
1506 while (1) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001507 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001508 state->start = state->ptr = ptr;
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001509 status = SRE(match)(state, pattern, 0);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001510 if (status != 0 || ptr >= end)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001511 break;
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001512 ptr++;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001513 }
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001514 }
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001515
1516 return status;
1517}
1518
1519#undef SRE_CHAR
1520#undef SIZEOF_SRE_CHAR
1521#undef SRE
1522
1523/* vim:ts=4:sw=4:et
1524*/