blob: 437ab43f434a62ab5b3fdd892c52c1df24b40f32 [file] [log] [blame]
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001/*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
7 *
8 * See the _sre.c file for information on usage and redistribution.
9 */
10
11/* String matching engine */
12
13/* This file is included three times, with different character settings */
14
15LOCAL(int)
16SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
17{
18 /* check if pointer is at given position */
19
20 Py_ssize_t thisp, thatp;
21
22 switch (at) {
23
24 case SRE_AT_BEGINNING:
25 case SRE_AT_BEGINNING_STRING:
26 return ((void*) ptr == state->beginning);
27
28 case SRE_AT_BEGINNING_LINE:
29 return ((void*) ptr == state->beginning ||
30 SRE_IS_LINEBREAK((int) ptr[-1]));
31
32 case SRE_AT_END:
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +030033 return (((SRE_CHAR *)state->end - ptr == 1 &&
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +030034 SRE_IS_LINEBREAK((int) ptr[0])) ||
35 ((void*) ptr == state->end));
36
37 case SRE_AT_END_LINE:
38 return ((void*) ptr == state->end ||
39 SRE_IS_LINEBREAK((int) ptr[0]));
40
41 case SRE_AT_END_STRING:
42 return ((void*) ptr == state->end);
43
44 case SRE_AT_BOUNDARY:
45 if (state->beginning == state->end)
46 return 0;
47 thatp = ((void*) ptr > state->beginning) ?
48 SRE_IS_WORD((int) ptr[-1]) : 0;
49 thisp = ((void*) ptr < state->end) ?
50 SRE_IS_WORD((int) ptr[0]) : 0;
51 return thisp != thatp;
52
53 case SRE_AT_NON_BOUNDARY:
54 if (state->beginning == state->end)
55 return 0;
56 thatp = ((void*) ptr > state->beginning) ?
57 SRE_IS_WORD((int) ptr[-1]) : 0;
58 thisp = ((void*) ptr < state->end) ?
59 SRE_IS_WORD((int) ptr[0]) : 0;
60 return thisp == thatp;
61
62 case SRE_AT_LOC_BOUNDARY:
63 if (state->beginning == state->end)
64 return 0;
65 thatp = ((void*) ptr > state->beginning) ?
66 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
67 thisp = ((void*) ptr < state->end) ?
68 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
69 return thisp != thatp;
70
71 case SRE_AT_LOC_NON_BOUNDARY:
72 if (state->beginning == state->end)
73 return 0;
74 thatp = ((void*) ptr > state->beginning) ?
75 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
76 thisp = ((void*) ptr < state->end) ?
77 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
78 return thisp == thatp;
79
80 case SRE_AT_UNI_BOUNDARY:
81 if (state->beginning == state->end)
82 return 0;
83 thatp = ((void*) ptr > state->beginning) ?
84 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
85 thisp = ((void*) ptr < state->end) ?
86 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
87 return thisp != thatp;
88
89 case SRE_AT_UNI_NON_BOUNDARY:
90 if (state->beginning == state->end)
91 return 0;
92 thatp = ((void*) ptr > state->beginning) ?
93 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
94 thisp = ((void*) ptr < state->end) ?
95 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
96 return thisp == thatp;
97
98 }
99
100 return 0;
101}
102
103LOCAL(int)
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200104SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300105{
106 /* check if character is a member of the given set */
107
108 int ok = 1;
109
110 for (;;) {
111 switch (*set++) {
112
113 case SRE_OP_FAILURE:
114 return !ok;
115
116 case SRE_OP_LITERAL:
117 /* <LITERAL> <code> */
118 if (ch == set[0])
119 return ok;
120 set++;
121 break;
122
123 case SRE_OP_CATEGORY:
124 /* <CATEGORY> <code> */
125 if (sre_category(set[0], (int) ch))
126 return ok;
127 set++;
128 break;
129
130 case SRE_OP_CHARSET:
131 /* <CHARSET> <bitmap> */
132 if (ch < 256 &&
133 (set[ch/SRE_CODE_BITS] & (1u << (ch & (SRE_CODE_BITS-1)))))
134 return ok;
135 set += 256/SRE_CODE_BITS;
136 break;
137
138 case SRE_OP_RANGE:
139 /* <RANGE> <lower> <upper> */
140 if (set[0] <= ch && ch <= set[1])
141 return ok;
142 set += 2;
143 break;
144
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300145 case SRE_OP_RANGE_UNI_IGNORE:
146 /* <RANGE_UNI_IGNORE> <lower> <upper> */
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200147 {
148 SRE_CODE uch;
149 /* ch is already lower cased */
150 if (set[0] <= ch && ch <= set[1])
151 return ok;
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300152 uch = sre_upper_unicode(ch);
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200153 if (set[0] <= uch && uch <= set[1])
154 return ok;
155 set += 2;
156 break;
157 }
158
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300159 case SRE_OP_NEGATE:
160 ok = !ok;
161 break;
162
163 case SRE_OP_BIGCHARSET:
164 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
165 {
166 Py_ssize_t count, block;
167 count = *(set++);
168
169 if (ch < 0x10000u)
170 block = ((unsigned char*)set)[ch >> 8];
171 else
172 block = -1;
173 set += 256/sizeof(SRE_CODE);
174 if (block >=0 &&
175 (set[(block * 256 + (ch & 255))/SRE_CODE_BITS] &
176 (1u << (ch & (SRE_CODE_BITS-1)))))
177 return ok;
178 set += count * (256/SRE_CODE_BITS);
179 break;
180 }
181
182 default:
183 /* internal error -- there's not much we can do about it
184 here, so let's just pretend it didn't match... */
185 return 0;
186 }
187 }
188}
189
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300190LOCAL(int)
191SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
192{
193 SRE_CODE lo, up;
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300194 lo = sre_lower_locale(ch);
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300195 if (SRE(charset)(state, set, lo))
196 return 1;
197
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300198 up = sre_upper_locale(ch);
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300199 return up != lo && SRE(charset)(state, set, up);
200}
201
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200202LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300203
204LOCAL(Py_ssize_t)
205SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
206{
207 SRE_CODE chr;
208 SRE_CHAR c;
209 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
210 SRE_CHAR* end = (SRE_CHAR *)state->end;
211 Py_ssize_t i;
212
213 /* adjust end */
214 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
215 end = ptr + maxcount;
216
217 switch (pattern[0]) {
218
219 case SRE_OP_IN:
220 /* repeated set */
221 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200222 while (ptr < end && SRE(charset)(state, pattern + 2, *ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300223 ptr++;
224 break;
225
226 case SRE_OP_ANY:
227 /* repeated dot wildcard. */
228 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
229 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
230 ptr++;
231 break;
232
233 case SRE_OP_ANY_ALL:
234 /* repeated dot wildcard. skip to the end of the target
235 string, and backtrack from there */
236 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
237 ptr = end;
238 break;
239
240 case SRE_OP_LITERAL:
241 /* repeated literal */
242 chr = pattern[1];
243 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
244 c = (SRE_CHAR) chr;
245#if SIZEOF_SRE_CHAR < 4
246 if ((SRE_CODE) c != chr)
247 ; /* literal can't match: doesn't fit in char width */
248 else
249#endif
250 while (ptr < end && *ptr == c)
251 ptr++;
252 break;
253
254 case SRE_OP_LITERAL_IGNORE:
255 /* repeated literal */
256 chr = pattern[1];
257 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300258 while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr)
259 ptr++;
260 break;
261
262 case SRE_OP_LITERAL_UNI_IGNORE:
263 /* repeated literal */
264 chr = pattern[1];
265 TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
266 while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300267 ptr++;
268 break;
269
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300270 case SRE_OP_LITERAL_LOC_IGNORE:
271 /* repeated literal */
272 chr = pattern[1];
273 TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300274 while (ptr < end && char_loc_ignore(chr, *ptr))
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300275 ptr++;
276 break;
277
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300278 case SRE_OP_NOT_LITERAL:
279 /* repeated non-literal */
280 chr = pattern[1];
281 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
282 c = (SRE_CHAR) chr;
283#if SIZEOF_SRE_CHAR < 4
284 if ((SRE_CODE) c != chr)
285 ptr = end; /* literal can't match: doesn't fit in char width */
286 else
287#endif
288 while (ptr < end && *ptr != c)
289 ptr++;
290 break;
291
292 case SRE_OP_NOT_LITERAL_IGNORE:
293 /* repeated non-literal */
294 chr = pattern[1];
295 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300296 while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr)
297 ptr++;
298 break;
299
300 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
301 /* repeated non-literal */
302 chr = pattern[1];
303 TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
304 while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300305 ptr++;
306 break;
307
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300308 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
309 /* repeated non-literal */
310 chr = pattern[1];
311 TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300312 while (ptr < end && !char_loc_ignore(chr, *ptr))
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300313 ptr++;
314 break;
315
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300316 default:
317 /* repeated single character pattern */
318 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
319 while ((SRE_CHAR*) state->ptr < end) {
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300320 i = SRE(match)(state, pattern, 0);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300321 if (i < 0)
322 return i;
323 if (!i)
324 break;
325 }
326 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
327 (SRE_CHAR*) state->ptr - ptr));
328 return (SRE_CHAR*) state->ptr - ptr;
329 }
330
331 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
332 ptr - (SRE_CHAR*) state->ptr));
333 return ptr - (SRE_CHAR*) state->ptr;
334}
335
336#if 0 /* not used in this release */
337LOCAL(int)
338SRE(info)(SRE_STATE* state, SRE_CODE* pattern)
339{
340 /* check if an SRE_OP_INFO block matches at the current position.
341 returns the number of SRE_CODE objects to skip if successful, 0
342 if no match */
343
344 SRE_CHAR* end = (SRE_CHAR*) state->end;
345 SRE_CHAR* ptr = (SRE_CHAR*) state->ptr;
346 Py_ssize_t i;
347
348 /* check minimal length */
349 if (pattern[3] && end - ptr < pattern[3])
350 return 0;
351
352 /* check known prefix */
353 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
354 /* <length> <skip> <prefix data> <overlap data> */
355 for (i = 0; i < pattern[5]; i++)
356 if ((SRE_CODE) ptr[i] != pattern[7 + i])
357 return 0;
358 return pattern[0] + 2 * pattern[6];
359 }
360 return pattern[0];
361}
362#endif
363
364/* The macros below should be used to protect recursive SRE(match)()
365 * calls that *failed* and do *not* return immediately (IOW, those
366 * that will backtrack). Explaining:
367 *
368 * - Recursive SRE(match)() returned true: that's usually a success
369 * (besides atypical cases like ASSERT_NOT), therefore there's no
370 * reason to restore lastmark;
371 *
372 * - Recursive SRE(match)() returned false but the current SRE(match)()
373 * is returning to the caller: If the current SRE(match)() is the
374 * top function of the recursion, returning false will be a matching
375 * failure, and it doesn't matter where lastmark is pointing to.
376 * If it's *not* the top function, it will be a recursive SRE(match)()
377 * failure by itself, and the calling SRE(match)() will have to deal
378 * with the failure by the same rules explained here (it will restore
379 * lastmark by itself if necessary);
380 *
381 * - Recursive SRE(match)() returned false, and will continue the
382 * outside 'for' loop: must be protected when breaking, since the next
383 * OP could potentially depend on lastmark;
384 *
385 * - Recursive SRE(match)() returned false, and will be called again
386 * inside a local for/while loop: must be protected between each
387 * loop iteration, since the recursive SRE(match)() could do anything,
388 * and could potentially depend on lastmark.
389 *
390 * For more information, check the discussion at SF patch #712900.
391 */
392#define LASTMARK_SAVE() \
393 do { \
394 ctx->lastmark = state->lastmark; \
395 ctx->lastindex = state->lastindex; \
396 } while (0)
397#define LASTMARK_RESTORE() \
398 do { \
399 state->lastmark = ctx->lastmark; \
400 state->lastindex = ctx->lastindex; \
401 } while (0)
402
403#define RETURN_ERROR(i) do { return i; } while(0)
404#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
405#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
406
407#define RETURN_ON_ERROR(i) \
408 do { if (i < 0) RETURN_ERROR(i); } while (0)
409#define RETURN_ON_SUCCESS(i) \
410 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
411#define RETURN_ON_FAILURE(i) \
412 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
413
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300414#define DATA_STACK_ALLOC(state, type, ptr) \
415do { \
416 alloc_pos = state->data_stack_base; \
417 TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
418 "(%" PY_FORMAT_SIZE_T "d)\n", \
Serhiy Storchaka12b25382015-11-05 17:43:42 +0200419 Py_STRINGIFY(type), alloc_pos, sizeof(type))); \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300420 if (sizeof(type) > state->data_stack_size - alloc_pos) { \
421 int j = data_stack_grow(state, sizeof(type)); \
422 if (j < 0) return j; \
423 if (ctx_pos != -1) \
424 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \
425 } \
426 ptr = (type*)(state->data_stack+alloc_pos); \
427 state->data_stack_base += sizeof(type); \
428} while (0)
429
430#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
431do { \
Serhiy Storchaka12b25382015-11-05 17:43:42 +0200432 TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", Py_STRINGIFY(type), pos)); \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300433 ptr = (type*)(state->data_stack+pos); \
434} while (0)
435
436#define DATA_STACK_PUSH(state, data, size) \
437do { \
438 TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
439 "(%" PY_FORMAT_SIZE_T "d)\n", \
440 data, state->data_stack_base, size)); \
441 if (size > state->data_stack_size - state->data_stack_base) { \
442 int j = data_stack_grow(state, size); \
443 if (j < 0) return j; \
444 if (ctx_pos != -1) \
445 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \
446 } \
447 memcpy(state->data_stack+state->data_stack_base, data, size); \
448 state->data_stack_base += size; \
449} while (0)
450
451#define DATA_STACK_POP(state, data, size, discard) \
452do { \
453 TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
454 "(%" PY_FORMAT_SIZE_T "d)\n", \
455 data, state->data_stack_base-size, size)); \
456 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
457 if (discard) \
458 state->data_stack_base -= size; \
459} while (0)
460
461#define DATA_STACK_POP_DISCARD(state, size) \
462do { \
463 TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
464 "(%" PY_FORMAT_SIZE_T "d)\n", \
465 state->data_stack_base-size, size)); \
466 state->data_stack_base -= size; \
467} while(0)
468
469#define DATA_PUSH(x) \
470 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
471#define DATA_POP(x) \
472 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
473#define DATA_POP_DISCARD(x) \
474 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
475#define DATA_ALLOC(t,p) \
476 DATA_STACK_ALLOC(state, t, p)
477#define DATA_LOOKUP_AT(t,p,pos) \
478 DATA_STACK_LOOKUP_AT(state,t,p,pos)
479
480#define MARK_PUSH(lastmark) \
481 do if (lastmark > 0) { \
482 i = lastmark; /* ctx->lastmark may change if reallocated */ \
483 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
484 } while (0)
485#define MARK_POP(lastmark) \
486 do if (lastmark > 0) { \
487 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
488 } while (0)
489#define MARK_POP_KEEP(lastmark) \
490 do if (lastmark > 0) { \
491 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
492 } while (0)
493#define MARK_POP_DISCARD(lastmark) \
494 do if (lastmark > 0) { \
495 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
496 } while (0)
497
498#define JUMP_NONE 0
499#define JUMP_MAX_UNTIL_1 1
500#define JUMP_MAX_UNTIL_2 2
501#define JUMP_MAX_UNTIL_3 3
502#define JUMP_MIN_UNTIL_1 4
503#define JUMP_MIN_UNTIL_2 5
504#define JUMP_MIN_UNTIL_3 6
505#define JUMP_REPEAT 7
506#define JUMP_REPEAT_ONE_1 8
507#define JUMP_REPEAT_ONE_2 9
508#define JUMP_MIN_REPEAT_ONE 10
509#define JUMP_BRANCH 11
510#define JUMP_ASSERT 12
511#define JUMP_ASSERT_NOT 13
512
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200513#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300514 DATA_ALLOC(SRE(match_context), nextctx); \
515 nextctx->last_ctx_pos = ctx_pos; \
516 nextctx->jump = jumpvalue; \
517 nextctx->pattern = nextpattern; \
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200518 nextctx->toplevel = toplevel_; \
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300519 ctx_pos = alloc_pos; \
520 ctx = nextctx; \
521 goto entrance; \
522 jumplabel: \
523 while (0) /* gcc doesn't like labels at end of scopes */ \
524
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200525#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200526 DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel)
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200527
528#define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \
529 DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
530
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300531typedef struct {
532 Py_ssize_t last_ctx_pos;
533 Py_ssize_t jump;
534 SRE_CHAR* ptr;
535 SRE_CODE* pattern;
536 Py_ssize_t count;
537 Py_ssize_t lastmark;
538 Py_ssize_t lastindex;
539 union {
540 SRE_CODE chr;
541 SRE_REPEAT* rep;
542 } u;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200543 int toplevel;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300544} SRE(match_context);
545
546/* check if string matches the given pattern. returns <0 for
547 error, 0 for failure, and 1 for success */
548LOCAL(Py_ssize_t)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200549SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel)
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300550{
551 SRE_CHAR* end = (SRE_CHAR *)state->end;
552 Py_ssize_t alloc_pos, ctx_pos = -1;
553 Py_ssize_t i, ret = 0;
554 Py_ssize_t jump;
555 unsigned int sigcount=0;
556
557 SRE(match_context)* ctx;
558 SRE(match_context)* nextctx;
559
560 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
561
562 DATA_ALLOC(SRE(match_context), ctx);
563 ctx->last_ctx_pos = -1;
564 ctx->jump = JUMP_NONE;
565 ctx->pattern = pattern;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200566 ctx->toplevel = toplevel;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300567 ctx_pos = alloc_pos;
568
569entrance:
570
571 ctx->ptr = (SRE_CHAR *)state->ptr;
572
573 if (ctx->pattern[0] == SRE_OP_INFO) {
574 /* optimization info block */
575 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Benjamin Petersonca470632016-09-06 13:47:26 -0700576 if (ctx->pattern[3] && (uintptr_t)(end - ctx->ptr) < ctx->pattern[3]) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300577 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
578 "need %" PY_FORMAT_SIZE_T "d)\n",
579 end - ctx->ptr, (Py_ssize_t) ctx->pattern[3]));
580 RETURN_FAILURE;
581 }
582 ctx->pattern += ctx->pattern[1] + 1;
583 }
584
585 for (;;) {
586 ++sigcount;
587 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
588 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
589
590 switch (*ctx->pattern++) {
591
592 case SRE_OP_MARK:
593 /* set mark */
594 /* <MARK> <gid> */
595 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
596 ctx->ptr, ctx->pattern[0]));
597 i = ctx->pattern[0];
598 if (i & 1)
599 state->lastindex = i/2 + 1;
600 if (i > state->lastmark) {
601 /* state->lastmark is the highest valid index in the
602 state->mark array. If it is increased by more than 1,
603 the intervening marks must be set to NULL to signal
604 that these marks have not been encountered. */
605 Py_ssize_t j = state->lastmark + 1;
606 while (j < i)
607 state->mark[j++] = NULL;
608 state->lastmark = i;
609 }
610 state->mark[i] = ctx->ptr;
611 ctx->pattern++;
612 break;
613
614 case SRE_OP_LITERAL:
615 /* match literal string */
616 /* <LITERAL> <code> */
617 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
618 ctx->ptr, *ctx->pattern));
619 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
620 RETURN_FAILURE;
621 ctx->pattern++;
622 ctx->ptr++;
623 break;
624
625 case SRE_OP_NOT_LITERAL:
626 /* match anything that is not literal character */
627 /* <NOT_LITERAL> <code> */
628 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
629 ctx->ptr, *ctx->pattern));
630 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
631 RETURN_FAILURE;
632 ctx->pattern++;
633 ctx->ptr++;
634 break;
635
636 case SRE_OP_SUCCESS:
637 /* end of pattern */
638 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200639 if (ctx->toplevel &&
640 ((state->match_all && ctx->ptr != state->end) ||
641 (state->must_advance && ctx->ptr == state->start)))
642 {
643 RETURN_FAILURE;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200644 }
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200645 state->ptr = ctx->ptr;
646 RETURN_SUCCESS;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300647
648 case SRE_OP_AT:
649 /* match at given position */
650 /* <AT> <code> */
651 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
652 if (!SRE(at)(state, ctx->ptr, *ctx->pattern))
653 RETURN_FAILURE;
654 ctx->pattern++;
655 break;
656
657 case SRE_OP_CATEGORY:
658 /* match at given category */
659 /* <CATEGORY> <code> */
660 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
661 ctx->ptr, *ctx->pattern));
662 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
663 RETURN_FAILURE;
664 ctx->pattern++;
665 ctx->ptr++;
666 break;
667
668 case SRE_OP_ANY:
669 /* match anything (except a newline) */
670 /* <ANY> */
671 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
672 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
673 RETURN_FAILURE;
674 ctx->ptr++;
675 break;
676
677 case SRE_OP_ANY_ALL:
678 /* match anything */
679 /* <ANY_ALL> */
680 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
681 if (ctx->ptr >= end)
682 RETURN_FAILURE;
683 ctx->ptr++;
684 break;
685
686 case SRE_OP_IN:
687 /* match set member (or non_member) */
688 /* <IN> <skip> <set> */
689 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200690 if (ctx->ptr >= end ||
691 !SRE(charset)(state, ctx->pattern + 1, *ctx->ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300692 RETURN_FAILURE;
693 ctx->pattern += ctx->pattern[0];
694 ctx->ptr++;
695 break;
696
697 case SRE_OP_LITERAL_IGNORE:
698 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
699 ctx->pattern, ctx->ptr, ctx->pattern[0]));
700 if (ctx->ptr >= end ||
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300701 sre_lower_ascii(*ctx->ptr) != *ctx->pattern)
702 RETURN_FAILURE;
703 ctx->pattern++;
704 ctx->ptr++;
705 break;
706
707 case SRE_OP_LITERAL_UNI_IGNORE:
708 TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n",
709 ctx->pattern, ctx->ptr, ctx->pattern[0]));
710 if (ctx->ptr >= end ||
711 sre_lower_unicode(*ctx->ptr) != *ctx->pattern)
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300712 RETURN_FAILURE;
713 ctx->pattern++;
714 ctx->ptr++;
715 break;
716
717 case SRE_OP_LITERAL_LOC_IGNORE:
718 TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
719 ctx->pattern, ctx->ptr, ctx->pattern[0]));
720 if (ctx->ptr >= end
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300721 || !char_loc_ignore(*ctx->pattern, *ctx->ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300722 RETURN_FAILURE;
723 ctx->pattern++;
724 ctx->ptr++;
725 break;
726
727 case SRE_OP_NOT_LITERAL_IGNORE:
728 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
729 ctx->pattern, ctx->ptr, *ctx->pattern));
730 if (ctx->ptr >= end ||
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300731 sre_lower_ascii(*ctx->ptr) == *ctx->pattern)
732 RETURN_FAILURE;
733 ctx->pattern++;
734 ctx->ptr++;
735 break;
736
737 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
738 TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n",
739 ctx->pattern, ctx->ptr, *ctx->pattern));
740 if (ctx->ptr >= end ||
741 sre_lower_unicode(*ctx->ptr) == *ctx->pattern)
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300742 RETURN_FAILURE;
743 ctx->pattern++;
744 ctx->ptr++;
745 break;
746
747 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
748 TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
749 ctx->pattern, ctx->ptr, *ctx->pattern));
750 if (ctx->ptr >= end
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300751 || char_loc_ignore(*ctx->pattern, *ctx->ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300752 RETURN_FAILURE;
753 ctx->pattern++;
754 ctx->ptr++;
755 break;
756
757 case SRE_OP_IN_IGNORE:
758 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
759 if (ctx->ptr >= end
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200760 || !SRE(charset)(state, ctx->pattern+1,
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300761 (SRE_CODE)sre_lower_ascii(*ctx->ptr)))
762 RETURN_FAILURE;
763 ctx->pattern += ctx->pattern[0];
764 ctx->ptr++;
765 break;
766
767 case SRE_OP_IN_UNI_IGNORE:
768 TRACE(("|%p|%p|IN_UNI_IGNORE\n", ctx->pattern, ctx->ptr));
769 if (ctx->ptr >= end
770 || !SRE(charset)(state, ctx->pattern+1,
771 (SRE_CODE)sre_lower_unicode(*ctx->ptr)))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300772 RETURN_FAILURE;
773 ctx->pattern += ctx->pattern[0];
774 ctx->ptr++;
775 break;
776
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300777 case SRE_OP_IN_LOC_IGNORE:
778 TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
779 if (ctx->ptr >= end
780 || !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
781 RETURN_FAILURE;
782 ctx->pattern += ctx->pattern[0];
783 ctx->ptr++;
784 break;
785
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300786 case SRE_OP_JUMP:
787 case SRE_OP_INFO:
788 /* jump forward */
789 /* <JUMP> <offset> */
790 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
791 ctx->ptr, ctx->pattern[0]));
792 ctx->pattern += ctx->pattern[0];
793 break;
794
795 case SRE_OP_BRANCH:
796 /* alternation */
797 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
798 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
799 LASTMARK_SAVE();
800 ctx->u.rep = state->repeat;
801 if (ctx->u.rep)
802 MARK_PUSH(ctx->lastmark);
803 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
804 if (ctx->pattern[1] == SRE_OP_LITERAL &&
805 (ctx->ptr >= end ||
806 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
807 continue;
808 if (ctx->pattern[1] == SRE_OP_IN &&
809 (ctx->ptr >= end ||
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200810 !SRE(charset)(state, ctx->pattern + 3,
811 (SRE_CODE) *ctx->ptr)))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300812 continue;
813 state->ptr = ctx->ptr;
814 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
815 if (ret) {
816 if (ctx->u.rep)
817 MARK_POP_DISCARD(ctx->lastmark);
818 RETURN_ON_ERROR(ret);
819 RETURN_SUCCESS;
820 }
821 if (ctx->u.rep)
822 MARK_POP_KEEP(ctx->lastmark);
823 LASTMARK_RESTORE();
824 }
825 if (ctx->u.rep)
826 MARK_POP_DISCARD(ctx->lastmark);
827 RETURN_FAILURE;
828
829 case SRE_OP_REPEAT_ONE:
830 /* match repeated sequence (maximizing regexp) */
831
832 /* this operator only works if the repeated item is
833 exactly one character wide, and we're not already
834 collecting backtracking points. for other cases,
835 use the MAX_REPEAT operator */
836
837 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
838
839 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
840 ctx->pattern[1], ctx->pattern[2]));
841
842 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
843 RETURN_FAILURE; /* cannot match */
844
845 state->ptr = ctx->ptr;
846
847 ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[2]);
848 RETURN_ON_ERROR(ret);
849 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
850 ctx->count = ret;
851 ctx->ptr += ctx->count;
852
853 /* when we arrive here, count contains the number of
854 matches, and ctx->ptr points to the tail of the target
855 string. check if the rest of the pattern matches,
856 and backtrack if not. */
857
858 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
859 RETURN_FAILURE;
860
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200861 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200862 ctx->ptr == state->end &&
863 !(ctx->toplevel && state->must_advance && ctx->ptr == state->start))
864 {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300865 /* tail is empty. we're finished */
866 state->ptr = ctx->ptr;
867 RETURN_SUCCESS;
868 }
869
870 LASTMARK_SAVE();
871
872 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
873 /* tail starts with a literal. skip positions where
874 the rest of the pattern cannot possibly match */
875 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
876 for (;;) {
877 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
878 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
879 ctx->ptr--;
880 ctx->count--;
881 }
882 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
883 break;
884 state->ptr = ctx->ptr;
885 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
886 ctx->pattern+ctx->pattern[0]);
887 if (ret) {
888 RETURN_ON_ERROR(ret);
889 RETURN_SUCCESS;
890 }
891
892 LASTMARK_RESTORE();
893
894 ctx->ptr--;
895 ctx->count--;
896 }
897
898 } else {
899 /* general case */
900 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
901 state->ptr = ctx->ptr;
902 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
903 ctx->pattern+ctx->pattern[0]);
904 if (ret) {
905 RETURN_ON_ERROR(ret);
906 RETURN_SUCCESS;
907 }
908 ctx->ptr--;
909 ctx->count--;
910 LASTMARK_RESTORE();
911 }
912 }
913 RETURN_FAILURE;
914
915 case SRE_OP_MIN_REPEAT_ONE:
916 /* match repeated sequence (minimizing regexp) */
917
918 /* this operator only works if the repeated item is
919 exactly one character wide, and we're not already
920 collecting backtracking points. for other cases,
921 use the MIN_REPEAT operator */
922
923 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
924
925 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
926 ctx->pattern[1], ctx->pattern[2]));
927
928 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
929 RETURN_FAILURE; /* cannot match */
930
931 state->ptr = ctx->ptr;
932
933 if (ctx->pattern[1] == 0)
934 ctx->count = 0;
935 else {
936 /* count using pattern min as the maximum */
937 ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[1]);
938 RETURN_ON_ERROR(ret);
939 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
940 if (ret < (Py_ssize_t) ctx->pattern[1])
941 /* didn't match minimum number of times */
942 RETURN_FAILURE;
943 /* advance past minimum matches of repeat */
944 ctx->count = ret;
945 ctx->ptr += ctx->count;
946 }
947
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200948 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200949 !(ctx->toplevel &&
950 ((state->match_all && ctx->ptr != state->end) ||
951 (state->must_advance && ctx->ptr == state->start))))
952 {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300953 /* tail is empty. we're finished */
954 state->ptr = ctx->ptr;
955 RETURN_SUCCESS;
956
957 } else {
958 /* general case */
959 LASTMARK_SAVE();
960 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
961 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
962 state->ptr = ctx->ptr;
963 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
964 ctx->pattern+ctx->pattern[0]);
965 if (ret) {
966 RETURN_ON_ERROR(ret);
967 RETURN_SUCCESS;
968 }
969 state->ptr = ctx->ptr;
970 ret = SRE(count)(state, ctx->pattern+3, 1);
971 RETURN_ON_ERROR(ret);
972 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
973 if (ret == 0)
974 break;
975 assert(ret == 1);
976 ctx->ptr++;
977 ctx->count++;
978 LASTMARK_RESTORE();
979 }
980 }
981 RETURN_FAILURE;
982
983 case SRE_OP_REPEAT:
984 /* create repeat context. all the hard work is done
985 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
986 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
987 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
988 ctx->pattern[1], ctx->pattern[2]));
989
990 /* install new repeat context */
991 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
992 if (!ctx->u.rep) {
993 PyErr_NoMemory();
994 RETURN_FAILURE;
995 }
996 ctx->u.rep->count = -1;
997 ctx->u.rep->pattern = ctx->pattern;
998 ctx->u.rep->prev = state->repeat;
999 ctx->u.rep->last_ptr = NULL;
1000 state->repeat = ctx->u.rep;
1001
1002 state->ptr = ctx->ptr;
1003 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1004 state->repeat = ctx->u.rep->prev;
1005 PyObject_FREE(ctx->u.rep);
1006
1007 if (ret) {
1008 RETURN_ON_ERROR(ret);
1009 RETURN_SUCCESS;
1010 }
1011 RETURN_FAILURE;
1012
1013 case SRE_OP_MAX_UNTIL:
1014 /* maximizing repeat */
1015 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1016
1017 /* FIXME: we probably need to deal with zero-width
1018 matches in here... */
1019
1020 ctx->u.rep = state->repeat;
1021 if (!ctx->u.rep)
1022 RETURN_ERROR(SRE_ERROR_STATE);
1023
1024 state->ptr = ctx->ptr;
1025
1026 ctx->count = ctx->u.rep->count+1;
1027
1028 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1029 ctx->ptr, ctx->count));
1030
1031 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1032 /* not enough matches */
1033 ctx->u.rep->count = ctx->count;
1034 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1035 ctx->u.rep->pattern+3);
1036 if (ret) {
1037 RETURN_ON_ERROR(ret);
1038 RETURN_SUCCESS;
1039 }
1040 ctx->u.rep->count = ctx->count-1;
1041 state->ptr = ctx->ptr;
1042 RETURN_FAILURE;
1043 }
1044
1045 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
1046 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
1047 state->ptr != ctx->u.rep->last_ptr) {
1048 /* we may have enough matches, but if we can
1049 match another item, do so */
1050 ctx->u.rep->count = ctx->count;
1051 LASTMARK_SAVE();
1052 MARK_PUSH(ctx->lastmark);
1053 /* zero-width match protection */
1054 DATA_PUSH(&ctx->u.rep->last_ptr);
1055 ctx->u.rep->last_ptr = state->ptr;
1056 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1057 ctx->u.rep->pattern+3);
1058 DATA_POP(&ctx->u.rep->last_ptr);
1059 if (ret) {
1060 MARK_POP_DISCARD(ctx->lastmark);
1061 RETURN_ON_ERROR(ret);
1062 RETURN_SUCCESS;
1063 }
1064 MARK_POP(ctx->lastmark);
1065 LASTMARK_RESTORE();
1066 ctx->u.rep->count = ctx->count-1;
1067 state->ptr = ctx->ptr;
1068 }
1069
1070 /* cannot match more repeated items here. make sure the
1071 tail matches */
1072 state->repeat = ctx->u.rep->prev;
1073 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1074 RETURN_ON_SUCCESS(ret);
1075 state->repeat = ctx->u.rep;
1076 state->ptr = ctx->ptr;
1077 RETURN_FAILURE;
1078
1079 case SRE_OP_MIN_UNTIL:
1080 /* minimizing repeat */
1081 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1082
1083 ctx->u.rep = state->repeat;
1084 if (!ctx->u.rep)
1085 RETURN_ERROR(SRE_ERROR_STATE);
1086
1087 state->ptr = ctx->ptr;
1088
1089 ctx->count = ctx->u.rep->count+1;
1090
1091 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
1092 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1093
1094 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1095 /* not enough matches */
1096 ctx->u.rep->count = ctx->count;
1097 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1098 ctx->u.rep->pattern+3);
1099 if (ret) {
1100 RETURN_ON_ERROR(ret);
1101 RETURN_SUCCESS;
1102 }
1103 ctx->u.rep->count = ctx->count-1;
1104 state->ptr = ctx->ptr;
1105 RETURN_FAILURE;
1106 }
1107
1108 LASTMARK_SAVE();
1109
1110 /* see if the tail matches */
1111 state->repeat = ctx->u.rep->prev;
1112 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1113 if (ret) {
1114 RETURN_ON_ERROR(ret);
1115 RETURN_SUCCESS;
1116 }
1117
1118 state->repeat = ctx->u.rep;
1119 state->ptr = ctx->ptr;
1120
1121 LASTMARK_RESTORE();
1122
1123 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
1124 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1125 state->ptr == ctx->u.rep->last_ptr)
1126 RETURN_FAILURE;
1127
1128 ctx->u.rep->count = ctx->count;
1129 /* zero-width match protection */
1130 DATA_PUSH(&ctx->u.rep->last_ptr);
1131 ctx->u.rep->last_ptr = state->ptr;
1132 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1133 ctx->u.rep->pattern+3);
1134 DATA_POP(&ctx->u.rep->last_ptr);
1135 if (ret) {
1136 RETURN_ON_ERROR(ret);
1137 RETURN_SUCCESS;
1138 }
1139 ctx->u.rep->count = ctx->count-1;
1140 state->ptr = ctx->ptr;
1141 RETURN_FAILURE;
1142
1143 case SRE_OP_GROUPREF:
1144 /* match backreference */
1145 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1146 ctx->ptr, ctx->pattern[0]));
1147 i = ctx->pattern[0];
1148 {
1149 Py_ssize_t groupref = i+i;
1150 if (groupref >= state->lastmark) {
1151 RETURN_FAILURE;
1152 } else {
1153 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1154 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1155 if (!p || !e || e < p)
1156 RETURN_FAILURE;
1157 while (p < e) {
1158 if (ctx->ptr >= end || *ctx->ptr != *p)
1159 RETURN_FAILURE;
1160 p++;
1161 ctx->ptr++;
1162 }
1163 }
1164 }
1165 ctx->pattern++;
1166 break;
1167
1168 case SRE_OP_GROUPREF_IGNORE:
1169 /* match backreference */
1170 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1171 ctx->ptr, ctx->pattern[0]));
1172 i = ctx->pattern[0];
1173 {
1174 Py_ssize_t groupref = i+i;
1175 if (groupref >= state->lastmark) {
1176 RETURN_FAILURE;
1177 } else {
1178 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1179 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1180 if (!p || !e || e < p)
1181 RETURN_FAILURE;
1182 while (p < e) {
1183 if (ctx->ptr >= end ||
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001184 sre_lower_ascii(*ctx->ptr) != sre_lower_ascii(*p))
1185 RETURN_FAILURE;
1186 p++;
1187 ctx->ptr++;
1188 }
1189 }
1190 }
1191 ctx->pattern++;
1192 break;
1193
1194 case SRE_OP_GROUPREF_UNI_IGNORE:
1195 /* match backreference */
1196 TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", ctx->pattern,
1197 ctx->ptr, ctx->pattern[0]));
1198 i = ctx->pattern[0];
1199 {
1200 Py_ssize_t groupref = i+i;
1201 if (groupref >= state->lastmark) {
1202 RETURN_FAILURE;
1203 } else {
1204 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1205 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1206 if (!p || !e || e < p)
1207 RETURN_FAILURE;
1208 while (p < e) {
1209 if (ctx->ptr >= end ||
1210 sre_lower_unicode(*ctx->ptr) != sre_lower_unicode(*p))
1211 RETURN_FAILURE;
1212 p++;
1213 ctx->ptr++;
1214 }
1215 }
1216 }
1217 ctx->pattern++;
1218 break;
1219
1220 case SRE_OP_GROUPREF_LOC_IGNORE:
1221 /* match backreference */
1222 TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", ctx->pattern,
1223 ctx->ptr, ctx->pattern[0]));
1224 i = ctx->pattern[0];
1225 {
1226 Py_ssize_t groupref = i+i;
1227 if (groupref >= state->lastmark) {
1228 RETURN_FAILURE;
1229 } else {
1230 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1231 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1232 if (!p || !e || e < p)
1233 RETURN_FAILURE;
1234 while (p < e) {
1235 if (ctx->ptr >= end ||
1236 sre_lower_locale(*ctx->ptr) != sre_lower_locale(*p))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001237 RETURN_FAILURE;
1238 p++;
1239 ctx->ptr++;
1240 }
1241 }
1242 }
1243 ctx->pattern++;
1244 break;
1245
1246 case SRE_OP_GROUPREF_EXISTS:
1247 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1248 ctx->ptr, ctx->pattern[0]));
1249 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1250 i = ctx->pattern[0];
1251 {
1252 Py_ssize_t groupref = i+i;
1253 if (groupref >= state->lastmark) {
1254 ctx->pattern += ctx->pattern[1];
1255 break;
1256 } else {
1257 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1258 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1259 if (!p || !e || e < p) {
1260 ctx->pattern += ctx->pattern[1];
1261 break;
1262 }
1263 }
1264 }
1265 ctx->pattern += 2;
1266 break;
1267
1268 case SRE_OP_ASSERT:
1269 /* assert subpattern */
1270 /* <ASSERT> <skip> <back> <pattern> */
1271 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1272 ctx->ptr, ctx->pattern[1]));
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001273 if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1])
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001274 RETURN_FAILURE;
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001275 state->ptr = ctx->ptr - ctx->pattern[1];
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001276 DO_JUMP0(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001277 RETURN_ON_FAILURE(ret);
1278 ctx->pattern += ctx->pattern[0];
1279 break;
1280
1281 case SRE_OP_ASSERT_NOT:
1282 /* assert not subpattern */
1283 /* <ASSERT_NOT> <skip> <back> <pattern> */
1284 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1285 ctx->ptr, ctx->pattern[1]));
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001286 if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
1287 state->ptr = ctx->ptr - ctx->pattern[1];
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001288 DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001289 if (ret) {
1290 RETURN_ON_ERROR(ret);
1291 RETURN_FAILURE;
1292 }
1293 }
1294 ctx->pattern += ctx->pattern[0];
1295 break;
1296
1297 case SRE_OP_FAILURE:
1298 /* immediate failure */
1299 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1300 RETURN_FAILURE;
1301
1302 default:
1303 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1304 ctx->pattern[-1]));
1305 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1306 }
1307 }
1308
1309exit:
1310 ctx_pos = ctx->last_ctx_pos;
1311 jump = ctx->jump;
1312 DATA_POP_DISCARD(ctx);
1313 if (ctx_pos == -1)
1314 return ret;
1315 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
1316
1317 switch (jump) {
1318 case JUMP_MAX_UNTIL_2:
1319 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1320 goto jump_max_until_2;
1321 case JUMP_MAX_UNTIL_3:
1322 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1323 goto jump_max_until_3;
1324 case JUMP_MIN_UNTIL_2:
1325 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1326 goto jump_min_until_2;
1327 case JUMP_MIN_UNTIL_3:
1328 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1329 goto jump_min_until_3;
1330 case JUMP_BRANCH:
1331 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1332 goto jump_branch;
1333 case JUMP_MAX_UNTIL_1:
1334 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1335 goto jump_max_until_1;
1336 case JUMP_MIN_UNTIL_1:
1337 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1338 goto jump_min_until_1;
1339 case JUMP_REPEAT:
1340 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1341 goto jump_repeat;
1342 case JUMP_REPEAT_ONE_1:
1343 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1344 goto jump_repeat_one_1;
1345 case JUMP_REPEAT_ONE_2:
1346 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1347 goto jump_repeat_one_2;
1348 case JUMP_MIN_REPEAT_ONE:
1349 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1350 goto jump_min_repeat_one;
1351 case JUMP_ASSERT:
1352 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1353 goto jump_assert;
1354 case JUMP_ASSERT_NOT:
1355 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1356 goto jump_assert_not;
1357 case JUMP_NONE:
1358 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1359 ctx->ptr, ret));
1360 break;
1361 }
1362
1363 return ret; /* should never get here */
1364}
1365
animalize4a7f44a2019-02-18 21:26:37 +08001366/* need to reset capturing groups between two SRE(match) callings in loops */
1367#define RESET_CAPTURE_GROUP() \
1368 do { state->lastmark = state->lastindex = -1; } while (0)
1369
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001370LOCAL(Py_ssize_t)
1371SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
1372{
1373 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1374 SRE_CHAR* end = (SRE_CHAR *)state->end;
1375 Py_ssize_t status = 0;
1376 Py_ssize_t prefix_len = 0;
1377 Py_ssize_t prefix_skip = 0;
1378 SRE_CODE* prefix = NULL;
1379 SRE_CODE* charset = NULL;
1380 SRE_CODE* overlap = NULL;
1381 int flags = 0;
1382
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001383 if (ptr > end)
1384 return 0;
1385
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001386 if (pattern[0] == SRE_OP_INFO) {
1387 /* optimization info block */
1388 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1389
1390 flags = pattern[2];
1391
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001392 if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1393 TRACE(("reject (got %u chars, need %u)\n",
1394 (unsigned int)(end - ptr), pattern[3]));
1395 return 0;
1396 }
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001397 if (pattern[3] > 1) {
1398 /* adjust end point (but make sure we leave at least one
1399 character in there, so literal search will work) */
1400 end -= pattern[3] - 1;
1401 if (end <= ptr)
1402 end = ptr;
1403 }
1404
1405 if (flags & SRE_INFO_PREFIX) {
1406 /* pattern starts with a known prefix */
1407 /* <length> <skip> <prefix data> <overlap data> */
1408 prefix_len = pattern[5];
1409 prefix_skip = pattern[6];
1410 prefix = pattern + 7;
1411 overlap = prefix + prefix_len - 1;
1412 } else if (flags & SRE_INFO_CHARSET)
1413 /* pattern starts with a character from a known set */
1414 /* <charset> */
1415 charset = pattern + 5;
1416
1417 pattern += 1 + pattern[1];
1418 }
1419
1420 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1421 prefix, prefix_len, prefix_skip));
1422 TRACE(("charset = %p\n", charset));
1423
Serhiy Storchaka66dc4642015-06-21 14:06:55 +03001424 if (prefix_len == 1) {
1425 /* pattern starts with a literal character */
1426 SRE_CHAR c = (SRE_CHAR) prefix[0];
1427#if SIZEOF_SRE_CHAR < 4
1428 if ((SRE_CODE) c != prefix[0])
1429 return 0; /* literal can't match: doesn't fit in char width */
1430#endif
1431 end = (SRE_CHAR *)state->end;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001432 state->must_advance = 0;
Serhiy Storchaka66dc4642015-06-21 14:06:55 +03001433 while (ptr < end) {
1434 while (*ptr != c) {
1435 if (++ptr >= end)
1436 return 0;
1437 }
1438 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1439 state->start = ptr;
1440 state->ptr = ptr + prefix_skip;
1441 if (flags & SRE_INFO_LITERAL)
1442 return 1; /* we got all of it */
1443 status = SRE(match)(state, pattern + 2*prefix_skip, 0);
1444 if (status != 0)
1445 return status;
1446 ++ptr;
animalize4a7f44a2019-02-18 21:26:37 +08001447 RESET_CAPTURE_GROUP();
Serhiy Storchaka66dc4642015-06-21 14:06:55 +03001448 }
1449 return 0;
1450 }
1451
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001452 if (prefix_len > 1) {
1453 /* pattern starts with a known prefix. use the overlap
1454 table to skip forward as fast as we possibly can */
1455 Py_ssize_t i = 0;
1456
1457 end = (SRE_CHAR *)state->end;
1458 if (prefix_len > end - ptr)
1459 return 0;
1460#if SIZEOF_SRE_CHAR < 4
1461 for (i = 0; i < prefix_len; i++)
1462 if ((SRE_CODE)(SRE_CHAR) prefix[i] != prefix[i])
1463 return 0; /* literal can't match: doesn't fit in char width */
1464#endif
1465 while (ptr < end) {
1466 SRE_CHAR c = (SRE_CHAR) prefix[0];
1467 while (*ptr++ != c) {
1468 if (ptr >= end)
1469 return 0;
1470 }
1471 if (ptr >= end)
1472 return 0;
1473
1474 i = 1;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001475 state->must_advance = 0;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001476 do {
1477 if (*ptr == (SRE_CHAR) prefix[i]) {
1478 if (++i != prefix_len) {
1479 if (++ptr >= end)
1480 return 0;
1481 continue;
1482 }
1483 /* found a potential match */
1484 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1485 state->start = ptr - (prefix_len - 1);
1486 state->ptr = ptr - (prefix_len - prefix_skip - 1);
1487 if (flags & SRE_INFO_LITERAL)
1488 return 1; /* we got all of it */
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001489 status = SRE(match)(state, pattern + 2*prefix_skip, 0);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001490 if (status != 0)
1491 return status;
1492 /* close but no cigar -- try again */
1493 if (++ptr >= end)
1494 return 0;
animalize4a7f44a2019-02-18 21:26:37 +08001495 RESET_CAPTURE_GROUP();
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001496 }
1497 i = overlap[i];
1498 } while (i != 0);
1499 }
1500 return 0;
1501 }
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001502
Serhiy Storchaka66dc4642015-06-21 14:06:55 +03001503 if (charset) {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001504 /* pattern starts with a character from a known set */
1505 end = (SRE_CHAR *)state->end;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001506 state->must_advance = 0;
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001507 for (;;) {
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +02001508 while (ptr < end && !SRE(charset)(state, charset, *ptr))
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001509 ptr++;
1510 if (ptr >= end)
1511 return 0;
1512 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1513 state->start = ptr;
1514 state->ptr = ptr;
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001515 status = SRE(match)(state, pattern, 0);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001516 if (status != 0)
1517 break;
1518 ptr++;
animalize4a7f44a2019-02-18 21:26:37 +08001519 RESET_CAPTURE_GROUP();
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001520 }
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001521 } else {
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001522 /* general case */
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001523 assert(ptr <= end);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001524 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1525 state->start = state->ptr = ptr;
1526 status = SRE(match)(state, pattern, 1);
1527 state->must_advance = 0;
1528 while (status == 0 && ptr < end) {
1529 ptr++;
animalize4a7f44a2019-02-18 21:26:37 +08001530 RESET_CAPTURE_GROUP();
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001531 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001532 state->start = state->ptr = ptr;
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001533 status = SRE(match)(state, pattern, 0);
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001534 }
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001535 }
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +03001536
1537 return status;
1538}
1539
1540#undef SRE_CHAR
1541#undef SIZEOF_SRE_CHAR
1542#undef SRE
1543
1544/* vim:ts=4:sw=4:et
1545*/