blob: 9eec0355cfb8ca964d5222a237828668c892bae3 [file] [log] [blame]
Guido van Rossumb700df92000-03-31 14:59:30 +00001/* -*- Mode: C; tab-width: 4 -*-
2 *
3 * Secret Labs' Regular Expression Engine
4 * $Id$
5 *
6 * simple regular expression matching engine
7 *
8 * partial history:
9 * 99-10-24 fl created (bits and pieces from the template matcher)
10 * 99-11-13 fl added categories, branching, and more (0.2)
11 * 99-11-16 fl some tweaks to compile on non-Windows platforms
12 * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
13 * 99-02-28 fl tons of changes (not all to the better ;-) (0.4)
14 * 99-03-06 fl first alpha, sort of (0.5)
15 * 99-03-14 fl removed most compatibility stuff (0.6)
16 *
17 * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
18 *
19 * This code can only be used for 1.6 alpha testing. All other use
20 * require explicit permission from Secret Labs AB.
21 *
22 * Portions of this engine have been developed in cooperation with
23 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
24 * other compatibility work.
25 */
26
27#ifndef SRE_RECURSIVE
28
29char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
30
31#include "Python.h"
32
33#include "sre.h"
34
35#include "unicodeobject.h"
36
37#if defined(HAVE_LIMITS_H)
38#include <limits.h>
39#else
40#define INT_MAX 2147483647
41#endif
42
43#include <ctype.h> /* temporary hack */
44
45/* defining this one enables tracing */
46#undef DEBUG
47
48#ifdef WIN32 /* FIXME: <fl> don't assume Windows == MSVC */
49#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
50/* fastest possible local call under MSVC */
51#define LOCAL(type) static __inline type __fastcall
52#else
53#define LOCAL(type) static type
54#endif
55
56/* error codes */
57#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
58#define SRE_ERROR_MEMORY -9 /* out of memory */
59
60#ifdef DEBUG
61#define TRACE(v) printf v
62#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
63#else
64#define TRACE(v)
65#endif
66
67#define SRE_CODE unsigned short /* unsigned short or larger */
68
69typedef struct {
70 /* string pointers */
71 void* ptr; /* current position (also end of current slice) */
72 void* beginning; /* start of original string */
73 void* start; /* start of current slice */
74 void* end; /* end of original string */
75 /* character size */
76 int charsize;
77 /* registers */
78 int marks;
79 void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
80 /* FIXME */
81 /* backtracking stack */
82 void** stack;
83 int stacksize;
84 int stackbase;
85} SRE_STATE;
86
87#if 1 /* FIXME: <fl> fix this one! */
88#define SRE_TO_LOWER Py_UNICODE_TOLOWER
89#define SRE_IS_DIGIT Py_UNICODE_ISDIGIT
90#define SRE_IS_SPACE Py_UNICODE_ISSPACE
91#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
92#else
93#define SRE_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
94#define SRE_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
95#define SRE_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
96#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
97#endif
98
99#define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
100
101LOCAL(int)
102sre_category(SRE_CODE category, unsigned int ch)
103{
104 switch (category) {
105 case 'd':
106 return SRE_IS_DIGIT(ch);
107 case 'D':
108 return !SRE_IS_DIGIT(ch);
109 case 's':
110 return SRE_IS_SPACE(ch);
111 case 'S':
112 return !SRE_IS_SPACE(ch);
113 case 'w':
114 return SRE_IS_WORD(ch);
115 case 'W':
116 return !SRE_IS_WORD(ch);
117 }
118 return 0;
119}
120
121/* helpers */
122
123LOCAL(int)
124_stack_free(SRE_STATE* state)
125{
126 if (state->stack) {
127 TRACE(("release stack\n"));
128 free(state->stack);
129 state->stack = NULL;
130 }
131 state->stacksize = 0;
132 return 0;
133}
134
135static int /* shouldn't be LOCAL */
136_stack_extend(SRE_STATE* state, int lo, int hi)
137{
138 void** stack;
139 int stacksize;
140
141 /* grow the stack to a suitable size; we need at least lo entries,
142 at most hi entries. if for some reason hi is lower than lo, lo
143 wins */
144
145 stacksize = state->stacksize;
146
147 if (stacksize == 0) {
148 /* create new stack */
149 stacksize = 512;
150 if (stacksize < lo)
151 stacksize = lo;
152 else if (stacksize > hi)
153 stacksize = hi;
154 TRACE(("allocate stack %d\n", stacksize));
155 stack = malloc(sizeof(void*) * stacksize);
156 } else {
157 /* grow the stack (typically by a factor of two) */
158 while (stacksize < lo)
159 stacksize = 2 * stacksize;
160 /* FIXME: <fl> could trim size if it's larger than lo, and
161 much larger than hi */
162 TRACE(("grow stack to %d\n", stacksize));
163 stack = realloc(state->stack, sizeof(void*) * stacksize);
164 }
165
166 if (!stack) {
167 _stack_free(state);
168 return SRE_ERROR_MEMORY;
169 }
170
171 state->stack = stack;
172 state->stacksize = stacksize;
173
174 return 0;
175}
176
177/* set things up for the 8-bit version */
178
179#define SRE_CHAR unsigned char
180#define SRE_AT sre_at
181#define SRE_MEMBER sre_member
182#define SRE_MATCH sre_match
183#define SRE_SEARCH sre_search
184#define SRE_RECURSIVE
185
186#include "_sre.c"
187
188#undef SRE_RECURSIVE
189#undef SRE_SEARCH
190#undef SRE_MATCH
191#undef SRE_MEMBER
192#undef SRE_AT
193#undef SRE_CHAR
194
195/* set things up for the 16-bit unicode version */
196
197#define SRE_CHAR Py_UNICODE
198#define SRE_AT sre_uat
199#define SRE_MEMBER sre_umember
200#define SRE_MATCH sre_umatch
201#define SRE_SEARCH sre_usearch
202
203#endif /* SRE_RECURSIVE */
204
205/* -------------------------------------------------------------------- */
206/* String matching engine */
207
208/* the following section is compiled twice, with different character
209 settings */
210
211LOCAL(int)
212SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
213{
214 /* check if pointer is at given position. return 1 if so, 0
215 otherwise */
216
217 int this, that;
218
219 switch (at) {
220 case 'a':
221 /* beginning */
Guido van Rossum29530882000-04-10 17:06:55 +0000222 return ((void*) ptr == state->beginning);
Guido van Rossumb700df92000-03-31 14:59:30 +0000223 case 'z':
224 /* end */
Guido van Rossum29530882000-04-10 17:06:55 +0000225 return ((void*) ptr == state->end);
Guido van Rossumb700df92000-03-31 14:59:30 +0000226 case 'b':
227 /* word boundary */
228 if (state->beginning == state->end)
229 return 0;
230 that = ((void*) ptr > state->beginning) ?
231 SRE_IS_WORD((int) ptr[-1]) : 0;
232 this = ((void*) ptr < state->end) ?
233 SRE_IS_WORD((int) ptr[0]) : 0;
234 return this != that;
235 case 'B':
236 /* word non-boundary */
237 if (state->beginning == state->end)
238 return 0;
239 that = ((void*) ptr > state->beginning) ?
240 SRE_IS_WORD((int) ptr[-1]) : 0;
241 this = ((void*) ptr < state->end) ?
242 SRE_IS_WORD((int) ptr[0]) : 0;
243 return this == that;
244 }
245
246 return 0;
247}
248
249LOCAL(int)
250SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
251{
252 /* check if character is a member of the given set. return 1 if
253 so, 0 otherwise */
254
255 int ok = 1;
256
257 for (;;) {
258 switch (*set++) {
259
260 case SRE_OP_NEGATE:
261 ok = !ok;
262 break;
263
264 case SRE_OP_FAILURE:
265 return !ok;
266
267 case SRE_OP_LITERAL:
268 if (ch == (SRE_CHAR) set[0])
269 return ok;
270 set++;
271 break;
272
273 case SRE_OP_RANGE:
274 if ((SRE_CHAR) set[0] <= ch && ch <= (SRE_CHAR) set[1])
275 return ok;
276 set += 2;
277 break;
278
279 case SRE_OP_CATEGORY:
280 if (sre_category(set[0], (int) ch))
281 return ok;
282 set += 1;
283 break;
284
285 default:
286 /* FIXME: internal error */
287 return 0;
288 }
289 }
290}
291
292LOCAL(int)
293SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
294{
295 /* check if string matches the given pattern. returns -1 for
296 error, 0 for failure, and 1 for success */
297
298 SRE_CHAR* end = state->end;
299 SRE_CHAR* ptr = state->ptr;
300 int stacksize;
301 int stackbase;
302 int i, count;
303
304 for (;;) {
305
306 TRACE(("[%p]\n", pattern));
307
308 switch (*pattern++) {
309
310 case SRE_OP_FAILURE:
311 /* immediate failure */
312 TRACE(("%8d: failure\n", PTR(ptr)));
313 return 0;
314
315 case SRE_OP_SUCCESS:
316 /* end of pattern */
317 TRACE(("%8d: success\n", PTR(ptr)));
318 state->ptr = ptr;
319 return 1;
320
321 case SRE_OP_AT:
322 /* match at given position */
323 TRACE(("%8d: match at \\%c\n", PTR(ptr), *pattern));
324 if (!SRE_AT(state, ptr, *pattern))
325 return 0;
326 pattern++;
327 break;
328
329 case SRE_OP_LITERAL:
330 /* match literal character */
331 /* args: <code> */
332 TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) *pattern));
333 if (ptr >= end || *ptr != (SRE_CHAR) *pattern)
334 return 0;
335 pattern++;
336 ptr++;
337 break;
338
339 case SRE_OP_NOT_LITERAL:
340 /* match anything that is not literal character */
341 /* args: <code> */
342 TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) *pattern));
343 if (ptr >= end || *ptr == (SRE_CHAR) *pattern)
344 return 0;
345 pattern++;
346 ptr++;
347 break;
348
349 case SRE_OP_ANY:
350 /* match anything */
351 TRACE(("%8d: any\n", PTR(ptr)));
352 if (ptr >= end)
353 return 0;
354 ptr++;
355 break;
356
357 case SRE_OP_IN:
358 /* match set member (or non_member) */
359 /* args: <skip> <set> */
360 TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
361 if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
362 return 0;
363 pattern += pattern[0];
364 ptr++;
365 break;
366
367 case SRE_OP_GROUP:
368 /* match backreference */
369 i = pattern[0];
370 {
371 /* FIXME: optimize size! */
372 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
373 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
374 if (!p || !e || e < p)
375 return 0;
376 while (p < e) {
377 if (ptr >= end || *ptr != *p)
378 return 0;
379 p++; ptr++;
380 }
381 }
382 pattern++;
383 break;
384
385 case SRE_OP_LITERAL_IGNORE:
386 TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern));
387 if (ptr >= end || SRE_TO_LOWER(*ptr) != (SRE_CHAR) *pattern)
388 return 0;
389 pattern++;
390 ptr++;
391 break;
392
393 case SRE_OP_NOT_LITERAL_IGNORE:
394 TRACE(("%8d: literal not lower(%c)\n", PTR(ptr),
395 (SRE_CHAR) *pattern));
396 if (ptr >= end || SRE_TO_LOWER(*ptr) == (SRE_CHAR) *pattern)
397 return 0;
398 pattern++;
399 ptr++;
400 break;
401
402 case SRE_OP_IN_IGNORE:
403 TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
404 if (ptr >= end
405 || !SRE_MEMBER(pattern+1, (SRE_CHAR) SRE_TO_LOWER(*ptr)))
406 return 0;
407 pattern += pattern[0];
408 ptr++;
409 break;
410
411 case SRE_OP_MARK:
412 /* set mark */
413 /* args: <mark> */
414 TRACE(("%8d: set mark(%d)\n", PTR(ptr), pattern[0]));
415 state->mark[pattern[0]] = ptr;
416 pattern++;
417 break;
418
419 case SRE_OP_JUMP:
420 /* jump forward */
421 /* args: <skip> */
422 TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
423 pattern += pattern[0];
424 break;
425
426 case SRE_OP_CALL:
427 /* match subpattern, without backtracking */
428 /* args: <skip> <pattern> */
429 TRACE(("%8d: match subpattern\n", PTR(ptr)));
430 state->ptr = ptr;
431 if (!SRE_MATCH(state, pattern + 1))
432 return 0;
433 pattern += pattern[0];
434 ptr = state->ptr;
435 break;
436
437 case SRE_OP_MAX_REPEAT_ONE:
438
439 /* match repeated sequence (maximizing regexp). this
440 variant only works if the repeated item is exactly one
441 character wide, and we're not already collecting
442 backtracking points. for other cases, use the
443 MAX_REPEAT operator instead */
444
445 /* args: <skip> <min> <max> <step> */
446
447 TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
448 pattern[1], pattern[2]));
449
450 count = 0;
451
452 if (pattern[3] == SRE_OP_ANY) {
453 /* repeated wildcard. skip to the end of the target
454 string, and backtrack from there */
455 /* FIXME: must look for line endings */
456 if (ptr + pattern[1] > end)
457 return 0; /* cannot match */
458 count = pattern[2];
459 if (count > end - ptr)
460 count = end - ptr;
461 ptr += count;
462
463 } else if (pattern[3] == SRE_OP_LITERAL) {
464 /* repeated literal */
465 SRE_CHAR chr = (SRE_CHAR) pattern[4];
466 while (count < (int) pattern[2]) {
467 if (ptr >= end || *ptr != chr)
468 break;
469 ptr++;
470 count++;
471 }
472
473 } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
474 /* repeated literal */
475 SRE_CHAR chr = (SRE_CHAR) pattern[4];
476 while (count < (int) pattern[2]) {
477 if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) != chr)
478 break;
479 ptr++;
480 count++;
481 }
482
483 } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
484 /* repeated non-literal */
485 SRE_CHAR chr = (SRE_CHAR) pattern[4];
486 while (count < (int) pattern[2]) {
487 if (ptr >= end || *ptr == chr)
488 break;
489 ptr++;
490 count++;
491 }
492
493 } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
494 /* repeated non-literal */
495 SRE_CHAR chr = (SRE_CHAR) pattern[4];
496 while (count < (int) pattern[2]) {
497 if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) == chr)
498 break;
499 ptr++;
500 count++;
501 }
502
503 } else if (pattern[3] == SRE_OP_IN) {
504 /* repeated set */
505 while (count < (int) pattern[2]) {
506 if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
507 break;
508 ptr++;
509 count++;
510 }
511
512 } else {
513 /* repeated single character pattern */
514 state->ptr = ptr;
515 while (count < (int) pattern[2]) {
516 i = SRE_MATCH(state, pattern + 3);
517 if (i < 0)
518 return i;
519 if (i == 0)
520 break;
521 count++;
522 }
523 state->ptr = ptr;
524 ptr += count;
525 }
526
527 /* when we arrive here, count contains the number of
528 matches, and ptr points to the tail of the target
529 string. check if the rest of the pattern matches, and
530 backtrack if not. */
531
532 /* FIXME: <fl> this is a mess. fix it! */
533
534 TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
535
536 if (count < (int) pattern[1])
537 return 0;
538
539 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
540 /* tail is empty. we're finished */
541 TRACE(("%8d: tail is empty\n", PTR(ptr)));
542 state->ptr = ptr;
543 return 1;
544
545 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
546 /* tail starts with a literal. we can speed things up
547 by skipping positions where the rest of the pattern
548 cannot possibly match */
549 SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1];
550 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
551 for (;;) {
552 TRACE(("%8d: scan for tail match\n", PTR(ptr)));
553 while (count >= (int) pattern[1] &&
554 (ptr >= end || *ptr != chr)) {
555 ptr--;
556 count--;
557 }
558 TRACE(("%8d: check tail\n", PTR(ptr)));
559 if (count < (int) pattern[1])
560 break;
561 state->ptr = ptr;
562 i = SRE_MATCH(state, pattern + pattern[0]);
563 if (i > 0) {
564 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
565 return 1;
566 }
567 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
568 ptr--;
569 count--;
570 }
571
572 } else {
573 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
574 while (count >= (int) pattern[1]) {
575 state->ptr = ptr;
576 i = SRE_MATCH(state, pattern + pattern[0]);
577 if (i > 0) {
578 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
579 return 1;
580 }
581 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
582 ptr--;
583 count--;
584 }
585 }
586 return 0; /* failure! */
587
588/* ----------------------------------------------------------------------- */
589/* FIXME: the following section is just plain broken */
590
591 case SRE_OP_MAX_REPEAT:
592 /* match repeated sequence (maximizing regexp). repeated
593 group should end with a MAX_UNTIL code */
594
595 TRACE(("%8d: max repeat %d %d\n", PTR(ptr),
596 pattern[1], pattern[2]));
597
598 count = 0;
599 state->ptr = ptr;
600
601 /* FIXME: <fl> umm. what about matching the minimum
602 number of items before starting to collect backtracking
603 positions? */
604
605 stackbase = state->stackbase;
606
607 while (count < (int) pattern[2]) {
608 /* store current position on the stack */
609 TRACE(("%8d: push mark at index %d\n", PTR(ptr), count));
610 if (stackbase + count >= state->stacksize) {
611 i = _stack_extend(state, stackbase + count + 1,
612 stackbase + pattern[2]);
613 if (i < 0)
614 return i;
615 }
616 state->stack[stackbase + count] = ptr;
617 /* check if we can match another item */
618 state->stackbase += count + 1;
619 i = SRE_MATCH(state, pattern + 3);
620 state->stackbase = stackbase; /* rewind */
621 if (i != 2)
622 break;
623 if (state->ptr == ptr) {
624 /* if the match was successful but empty, set the
625 count to max and terminate the scanning loop */
626 stacksize = count; /* actual size of stack */
627 count = (int) pattern[2];
628 goto check_tail; /* FIXME: <fl> eliminate goto */
629 }
630 count++;
631 ptr = state->ptr;
632
633 }
634
635 stacksize = count; /* actual number of entries on the stack */
636
637 check_tail:
638
639 /* when we get here, count is the number of matches,
640 stacksize is the number of match points on the stack
641 (usually same as count, but it might be smaller) and
642 ptr points to the tail. */
643
644 if (count < (int) pattern[1])
645 return 0;
646
647 /* make sure that rest of the expression matches. if it
648 doesn't, backtrack */
649
650 TRACE(("%8d: repeat %d found (stack size = %d)\n", PTR(ptr),
651 count, stacksize + 1));
652
653 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
654
655 /* hope for the best */
656 state->ptr = ptr;
657 state->stackbase += stacksize + 1;
658 i = SRE_MATCH(state, pattern + pattern[0]);
659 state->stackbase = stackbase;
660 if (i > 0) {
661 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
662 return 1;
663 }
664
665 /* backtrack! */
666 while (count >= (int) pattern[1]) {
667 ptr = state->stack[stackbase + (count < stacksize ? count : stacksize)];
668 state->ptr = ptr;
669 count--;
670 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
671 state->stackbase += stacksize + 1;
672 i = SRE_MATCH(state, pattern + pattern[0]);
673 state->stackbase = stackbase;
674 if (i > 0) {
675 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
676 return 1;
677 }
678 }
679 return 0; /* failure! */
680
681 case SRE_OP_MAX_UNTIL:
682 /* match repeated sequence (maximizing regexp). repeated
683 group should end with a MAX_UNTIL code */
684
685 TRACE(("%8d: max until\n", PTR(ptr)));
686 state->ptr = ptr;
687 return 2; /* always succeeds, for now... */
688
689/* end of totally broken section */
690/* ----------------------------------------------------------------------- */
691
692 case SRE_OP_MIN_REPEAT:
693 /* match repeated sequence (minimizing regexp) */
694 TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
695 pattern[1], pattern[2]));
696 count = 0;
697 state->ptr = ptr;
698 /* match minimum number of items */
699 while (count < (int) pattern[1]) {
700 i = SRE_MATCH(state, pattern + 3);
701 if (i <= 0)
702 return i;
703 count++;
704 }
705 /* move forward until the tail matches. */
706 while (count <= (int) pattern[2]) {
707 ptr = state->ptr;
708 i = SRE_MATCH(state, pattern + pattern[0]);
709 if (i > 0) {
710 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
711 return 1;
712 }
713 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
714 state->ptr = ptr; /* backtrack */
715 i = SRE_MATCH(state, pattern + 3);
716 if (i <= 0)
717 return i;
718 count++;
719 }
720 return 0; /* failure! */
721
722 case SRE_OP_MIN_UNTIL:
723 /* end of repeat group */
724 TRACE(("%8d: min until\n", PTR(ptr)));
725 state->ptr = ptr;
726 return 2; /* always succeeds, for now... */
727
728 case SRE_OP_BRANCH:
729 /* match one of several subpatterns */
730 /* format: <branch> <size> <head> ... <null> <tail> */
731 TRACE(("%8d: branch\n", PTR(ptr)));
732 while (*pattern) {
733 if (pattern[1] != SRE_OP_LITERAL ||
734 (ptr < end && *ptr == (SRE_CHAR) pattern[2])) {
735 TRACE(("%8d: branch check\n", PTR(ptr)));
736 state->ptr = ptr;
737 i = SRE_MATCH(state, pattern + 1);
738 if (i > 0) {
739 TRACE(("%8d: branch succeeded\n", PTR(ptr)));
740 return 1;
741 }
742 }
743 pattern += *pattern;
744 }
745 TRACE(("%8d: branch failed\n", PTR(ptr)));
746 return 0; /* failure! */
747
748 case SRE_OP_REPEAT:
749 /* TEMPLATE: match repeated sequence (no backtracking) */
750 /* format: <repeat> <skip> <min> <max> */
751 TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
752 count = 0;
753 state->ptr = ptr;
754 while (count < (int) pattern[2]) {
755 i = SRE_MATCH(state, pattern + 3);
756 if (i <= 0)
757 break;
758 count++;
759 }
760 if (count <= (int) pattern[1])
761 return 0;
762 TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
763 pattern += pattern[0];
764 ptr = state->ptr;
765 break;
766
767 default:
768 return SRE_ERROR_ILLEGAL;
769 }
770 }
771}
772
773LOCAL(int)
774SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
775{
776 SRE_CHAR* ptr = state->start;
777 SRE_CHAR* end = state->end;
778 int status = 0;
779
780 /* FIXME: <fl> add IGNORE cases (or implement full ASSERT support? */
781
782 if (pattern[0] == SRE_OP_LITERAL) {
783 /* pattern starts with a literal */
784 SRE_CHAR chr = (SRE_CHAR) pattern[1];
785 for (;;) {
786 while (ptr < end && *ptr != chr)
787 ptr++;
788 if (ptr == end)
789 return 0;
790 TRACE(("%8d: search found literal\n", PTR(ptr)));
791 state->start = ptr;
792 state->ptr = ++ptr;
793 status = SRE_MATCH(state, pattern + 2);
794 if (status != 0)
795 break;
796 }
797
798 } else if (pattern[0] == SRE_OP_IN) {
799 /* pattern starts with a set */
800 for (;;) {
801 /* format: <in> <skip> <data> */
802 while (ptr < end && !SRE_MEMBER(pattern + 2, *ptr))
803 ptr++;
804 if (ptr == end)
805 return 0;
806 TRACE(("%8d: search found set\n", PTR(ptr)));
807 state->start = ptr;
808 state->ptr = ++ptr;
809 status = SRE_MATCH(state, pattern + pattern[1] + 1);
810 if (status != 0)
811 break;
812 }
813
814 } else
815 while (ptr <= end) {
816 TRACE(("%8d: search\n", PTR(ptr)));
817 state->start = state->ptr = ptr++;
818 status = SRE_MATCH(state, pattern);
819 if (status != 0)
820 break;
821 }
822
823 return status;
824}
825
826#ifndef SRE_RECURSIVE
827
828/* -------------------------------------------------------------------- */
829/* factories and destructors */
830
831/* see sre.h for object declarations */
832
833staticforward PyTypeObject Pattern_Type;
834staticforward PyTypeObject Match_Type;
835
836static PyObject *
837_compile(PyObject* self_, PyObject* args)
838{
839 /* "compile" pattern descriptor to pattern object */
840
841 PatternObject* self;
842
843 PyObject* pattern;
844 PyObject* code;
845 int groups = 0;
846 PyObject* groupindex = NULL;
847 if (!PyArg_ParseTuple(args, "OO!|iO", &pattern,
848 &PyString_Type, &code, &groups, &groupindex))
849 return NULL;
850
851 self = PyObject_NEW(PatternObject, &Pattern_Type);
852 if (self == NULL)
853 return NULL;
854
855 Py_INCREF(pattern);
856 self->pattern = pattern;
857
858 Py_INCREF(code);
859 self->code = code;
860
861 self->groups = groups;
862
863 Py_XINCREF(groupindex);
864 self->groupindex = groupindex;
865
866 return (PyObject*) self;
867}
868
869static PyObject *
870_getcodesize(PyObject* self_, PyObject* args)
871{
872 return Py_BuildValue("i", sizeof(SRE_CODE));
873}
874
875static PyObject*
876_pattern_new_match(PatternObject* pattern, SRE_STATE* state,
877 PyObject* string, int status)
878{
879 /* create match object (from state object) */
880
881 MatchObject* match;
882 int i, j;
883
884 TRACE(("status = %d\n", status));
885
886 if (status > 0) {
887
888 /* create match object (with room for extra group marks) */
889 match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
890 if (match == NULL)
891 return NULL;
892
893 Py_INCREF(pattern);
894 match->pattern = pattern;
895
896 Py_INCREF(string);
897 match->string = string;
898
899 match->groups = pattern->groups+1;
900
901 /* group zero */
902 match->mark[0] = ((char*) state->start -
903 (char*) state->beginning) / state->charsize;
904 match->mark[1] = ((char*) state->ptr -
905 (char*) state->beginning) / state->charsize;
906
907 /* fill in the rest of the groups */
908 for (i = j = 0; i < pattern->groups; i++, j+=2)
909 if (state->mark[j] != NULL && state->mark[j+1] != NULL) {
910 match->mark[j+2] = ((char*) state->mark[j] -
911 (char*) state->beginning) / state->charsize;
912 match->mark[j+3] = ((char*) state->mark[j+1] -
913 (char*) state->beginning) / state->charsize;
914 } else
915 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
916
917 return (PyObject*) match;
918
919 } else if (status < 0) {
920
921 /* internal error */
922 PyErr_SetString(
923 PyExc_RuntimeError, "internal error in regular expression engine"
924 );
925 return NULL;
926
927 }
928
929 Py_INCREF(Py_None);
930 return Py_None;
931}
932
933/* -------------------------------------------------------------------- */
934/* pattern methods */
935
936LOCAL(PyObject*)
937_setup(SRE_STATE* state, PyObject* args)
938{
939 /* prepare state object */
940
941 PyBufferProcs *buffer;
942 int i, count;
943 void* ptr;
944
945 PyObject* string;
946 int start = 0;
947 int end = INT_MAX;
948 if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
949 return NULL;
950
951 /* get pointer to string buffer */
952 buffer = string->ob_type->tp_as_buffer;
953 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
954 buffer->bf_getsegcount(string, NULL) != 1) {
955 PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
956 return NULL;
957 }
958
959 /* determine buffer size */
960 count = buffer->bf_getreadbuffer(string, 0, &ptr);
961 if (count < 0) {
962 /* sanity check */
963 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
964 return NULL;
965 }
966
967 /* determine character size */
968 state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
969
970 count /= state->charsize;
971
972 /* adjust boundaries */
973 if (start < 0)
974 start = 0;
975 else if (start > count)
976 start = count;
977
978 if (end < 0)
979 end = 0;
980 else if (end > count)
981 end = count;
982
983 state->beginning = ptr;
984
985 state->start = (void*) ((char*) ptr + start * state->charsize);
986 state->end = (void*) ((char*) ptr + end * state->charsize);
987
988 /* FIXME: dynamic! */
989 for (i = 0; i < 64; i++)
990 state->mark[i] = NULL;
991
992 state->stack = NULL;
993 state->stackbase = 0;
994 state->stacksize = 0;
995
996 return string;
997}
998
999static void
1000_pattern_dealloc(PatternObject* self)
1001{
1002 Py_XDECREF(self->code);
1003 Py_XDECREF(self->pattern);
1004 Py_XDECREF(self->groupindex);
1005 PyMem_DEL(self);
1006}
1007
1008static PyObject*
1009_pattern_match(PatternObject* self, PyObject* args)
1010{
1011 SRE_STATE state;
1012 PyObject* string;
1013 int status;
1014
1015 string = _setup(&state, args);
1016 if (!string)
1017 return NULL;
1018
1019 state.ptr = state.start;
1020
1021 if (state.charsize == 1) {
1022 status = sre_match(&state, PatternObject_GetCode(self));
1023 } else {
1024 status = sre_umatch(&state, PatternObject_GetCode(self));
1025 }
1026
1027 _stack_free(&state);
1028
1029 return _pattern_new_match(self, &state, string, status);
1030}
1031
1032static PyObject*
1033_pattern_search(PatternObject* self, PyObject* args)
1034{
1035 SRE_STATE state;
1036 PyObject* string;
1037 int status;
1038
1039 string = _setup(&state, args);
1040 if (!string)
1041 return NULL;
1042
1043 if (state.charsize == 1) {
1044 status = sre_search(&state, PatternObject_GetCode(self));
1045 } else {
1046 status = sre_usearch(&state, PatternObject_GetCode(self));
1047 }
1048
1049 _stack_free(&state);
1050
1051 return _pattern_new_match(self, &state, string, status);
1052}
1053
1054static PyObject*
1055_pattern_findall(PatternObject* self, PyObject* args)
1056{
1057 /* FIXME: not sure about the semantics here. this is good enough
1058 for SXP, though... */
1059
1060 SRE_STATE state;
1061 PyObject* string;
1062 PyObject* list;
1063 int status;
1064
1065 string = _setup(&state, args);
1066 if (!string)
1067 return NULL;
1068
1069 list = PyList_New(0);
1070
1071 while (state.start < state.end) {
1072
1073 PyObject* item;
1074
1075 state.ptr = state.start;
1076
1077 if (state.charsize == 1) {
1078 status = sre_match(&state, PatternObject_GetCode(self));
1079 } else {
1080 status = sre_umatch(&state, PatternObject_GetCode(self));
1081 }
1082
1083 if (status >= 0) {
1084
1085 if (status == 0)
1086 state.ptr = (void*) ((char*) state.start + 1);
1087
1088 item = PySequence_GetSlice(
1089 string,
1090 ((char*) state.start - (char*) state.beginning),
1091 ((char*) state.ptr - (char*) state.beginning));
1092 if (!item)
1093 goto error;
1094 if (PyList_Append(list, item) < 0)
1095 goto error;
1096
1097 state.start = state.ptr;
1098
1099 } else {
1100
1101 /* internal error */
1102 PyErr_SetString(
1103 PyExc_RuntimeError,
1104 "internal error in regular expression engine"
1105 );
1106 goto error;
1107
1108 }
1109 }
1110
1111 _stack_free(&state);
1112
1113 return list;
1114
1115error:
1116 _stack_free(&state);
1117 return NULL;
1118
1119}
1120
1121static PyMethodDef _pattern_methods[] = {
1122 {"match", (PyCFunction) _pattern_match, 1},
1123 {"search", (PyCFunction) _pattern_search, 1},
1124 {"findall", (PyCFunction) _pattern_findall, 1},
1125 {NULL, NULL}
1126};
1127
1128static PyObject*
1129_pattern_getattr(PatternObject* self, char* name)
1130{
1131 PyObject* res;
1132
1133 res = Py_FindMethod(_pattern_methods, (PyObject*) self, name);
1134
1135 if (res)
1136 return res;
1137
1138 PyErr_Clear();
1139
1140 /* attributes */
1141 if (!strcmp(name, "pattern")) {
1142 Py_INCREF(self->pattern);
1143 return self->pattern;
1144 }
1145
1146 PyErr_SetString(PyExc_AttributeError, name);
1147 return NULL;
1148}
1149
1150statichere PyTypeObject Pattern_Type = {
1151 PyObject_HEAD_INIT(NULL)
1152 0, "Pattern", sizeof(PatternObject), 0,
1153 (destructor)_pattern_dealloc, /*tp_dealloc*/
1154 0, /*tp_print*/
1155 (getattrfunc)_pattern_getattr, /*tp_getattr*/
1156};
1157
1158/* -------------------------------------------------------------------- */
1159/* match methods */
1160
1161static void
1162_match_dealloc(MatchObject* self)
1163{
1164 Py_XDECREF(self->string);
1165 Py_DECREF(self->pattern);
1166 PyMem_DEL(self);
1167}
1168
1169static PyObject*
1170getslice_by_index(MatchObject* self, int index)
1171{
1172 if (index < 0 || index >= self->groups) {
1173 /* raise IndexError if we were given a bad group number */
1174 PyErr_SetString(
1175 PyExc_IndexError,
1176 "no such group"
1177 );
1178 return NULL;
1179 }
1180
1181 if (self->string == Py_None || self->mark[index+index] < 0) {
1182 /* return None if the string or group is undefined */
1183 Py_INCREF(Py_None);
1184 return Py_None;
1185 }
1186
1187 return PySequence_GetSlice(
1188 self->string, self->mark[index+index], self->mark[index+index+1]
1189 );
1190}
1191
1192static PyObject*
1193getslice(MatchObject* self, PyObject* index)
1194{
1195 if (!PyInt_Check(index) && self->pattern->groupindex != NULL) {
1196 /* FIXME: resource leak? */
1197 index = PyObject_GetItem(self->pattern->groupindex, index);
1198 if (!index)
1199 return NULL;
1200 }
1201
1202 if (PyInt_Check(index))
1203 return getslice_by_index(self, (int) PyInt_AS_LONG(index));
1204
1205 return getslice_by_index(self, -1); /* signal error */
1206}
1207
1208static PyObject*
1209_match_group(MatchObject* self, PyObject* args)
1210{
1211 PyObject* result;
1212 int i, size;
1213
1214 size = PyTuple_GET_SIZE(args);
1215
1216 switch (size) {
1217 case 0:
1218 result = getslice(self, Py_False); /* force error */
1219 break;
1220 case 1:
1221 result = getslice(self, PyTuple_GET_ITEM(args, 0));
1222 break;
1223 default:
1224 /* fetch multiple items */
1225 result = PyTuple_New(size);
1226 if (!result)
1227 return NULL;
1228 for (i = 0; i < size; i++) {
1229 PyObject* item = getslice(self, PyTuple_GET_ITEM(args, i));
1230 if (!item) {
1231 Py_DECREF(result);
1232 return NULL;
1233 }
1234 PyTuple_SET_ITEM(result, i, item);
1235 }
1236 break;
1237 }
1238 return result;
1239}
1240
1241static PyObject*
1242_match_groups(MatchObject* self, PyObject* args)
1243{
1244 PyObject* result;
1245 int index;
1246
1247 result = PyTuple_New(self->groups-1);
1248 if (!result)
1249 return NULL;
1250
1251 for (index = 1; index < self->groups; index++) {
1252 PyObject* item;
1253 /* FIXME: <fl> handle default! */
1254 item = getslice_by_index(self, index);
1255 if (!item) {
1256 Py_DECREF(result);
1257 return NULL;
1258 }
1259 PyTuple_SET_ITEM(result, index-1, item);
1260 }
1261
1262 return result;
1263}
1264
1265static PyObject*
1266_match_groupdict(MatchObject* self, PyObject* args)
1267{
1268 PyObject* result;
1269 PyObject* keys;
1270 int index;
1271
1272 result = PyDict_New();
1273 if (!result)
1274 return NULL;
1275 if (!self->pattern->groupindex)
1276 return result;
1277
1278 keys = PyMapping_Keys(self->pattern->groupindex);
1279 if (!keys)
1280 return NULL;
1281
1282 for (index = 0; index < PySequence_Length(keys); index++) {
1283 PyObject* key;
1284 PyObject* item;
1285 key = PySequence_GetItem(keys, index);
1286 if (!key) {
1287 Py_DECREF(keys);
1288 Py_DECREF(result);
1289 return NULL;
1290 }
1291 item = getslice(self, key);
1292 if (!item) {
1293 Py_DECREF(key);
1294 Py_DECREF(keys);
1295 Py_DECREF(result);
1296 return NULL;
1297 }
1298 /* FIXME: <fl> this can fail, right? */
1299 PyDict_SetItem(result, key, item);
1300 }
1301
1302 Py_DECREF(keys);
1303
1304 return result;
1305}
1306
1307static PyObject*
1308_match_start(MatchObject* self, PyObject* args)
1309{
1310 int index = 0;
1311 if (!PyArg_ParseTuple(args, "|i", &index))
1312 return NULL;
1313
1314 if (index < 0 || index >= self->groups) {
1315 PyErr_SetString(
1316 PyExc_IndexError,
1317 "no such group"
1318 );
1319 return NULL;
1320 }
1321
1322 if (self->mark[index*2] < 0) {
1323 Py_INCREF(Py_None);
1324 return Py_None;
1325 }
1326
1327 return Py_BuildValue("i", self->mark[index*2]);
1328}
1329
1330static PyObject*
1331_match_end(MatchObject* self, PyObject* args)
1332{
1333 int index = 0;
1334 if (!PyArg_ParseTuple(args, "|i", &index))
1335 return NULL;
1336
1337 if (index < 0 || index >= self->groups) {
1338 PyErr_SetString(
1339 PyExc_IndexError,
1340 "no such group"
1341 );
1342 return NULL;
1343 }
1344
1345 if (self->mark[index*2] < 0) {
1346 Py_INCREF(Py_None);
1347 return Py_None;
1348 }
1349
1350 return Py_BuildValue("i", self->mark[index*2+1]);
1351}
1352
1353static PyObject*
1354_match_span(MatchObject* self, PyObject* args)
1355{
1356 int index = 0;
1357 if (!PyArg_ParseTuple(args, "|i", &index))
1358 return NULL;
1359
1360 if (index < 0 || index >= self->groups) {
1361 PyErr_SetString(
1362 PyExc_IndexError,
1363 "no such group"
1364 );
1365 return NULL;
1366 }
1367
1368 if (self->mark[index*2] < 0) {
1369 Py_INCREF(Py_None);
1370 return Py_None;
1371 }
1372
1373 return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
1374}
1375
1376static PyMethodDef _match_methods[] = {
1377 {"group", (PyCFunction) _match_group, 1},
1378 {"start", (PyCFunction) _match_start, 1},
1379 {"end", (PyCFunction) _match_end, 1},
1380 {"span", (PyCFunction) _match_span, 1},
1381 {"groups", (PyCFunction) _match_groups, 1},
1382 {"groupdict", (PyCFunction) _match_groupdict, 1},
1383 {NULL, NULL}
1384};
1385
1386static PyObject*
1387_match_getattr(MatchObject* self, char* name)
1388{
1389 PyObject* res;
1390
1391 res = Py_FindMethod(_match_methods, (PyObject*) self, name);
1392 if (res)
1393 return res;
1394
1395 PyErr_Clear();
1396
1397 /* attributes! */
1398 if (!strcmp(name, "string")) {
1399 Py_INCREF(self->string);
1400 return self->string;
1401 }
1402 if (!strcmp(name, "regs"))
1403 /* FIXME: should return the whole list! */
1404 return Py_BuildValue("((i,i))", self->mark[0], self->mark[1]);
1405 if (!strcmp(name, "re")) {
1406 Py_INCREF(self->pattern);
1407 return (PyObject*) self->pattern;
1408 }
1409 if (!strcmp(name, "groupindex") && self->pattern->groupindex) {
1410 Py_INCREF(self->pattern->groupindex);
1411 return self->pattern->groupindex;
1412 }
1413 if (!strcmp(name, "pos"))
1414 return Py_BuildValue("i", 0); /* FIXME */
1415 if (!strcmp(name, "endpos"))
1416 return Py_BuildValue("i", 0); /* FIXME */
1417
1418 PyErr_SetString(PyExc_AttributeError, name);
1419 return NULL;
1420}
1421
1422/* FIXME: implement setattr("string", None) as a special case (to
1423 detach the associated string, if any */
1424
1425statichere PyTypeObject Match_Type = {
1426 PyObject_HEAD_INIT(NULL)
1427 0, "Match",
1428 sizeof(MatchObject), /* size of basic object */
1429 sizeof(int), /* space for group item */
1430 (destructor)_match_dealloc, /*tp_dealloc*/
1431 0, /*tp_print*/
1432 (getattrfunc)_match_getattr, /*tp_getattr*/
1433};
1434
1435static PyMethodDef _functions[] = {
1436 {"compile", _compile, 1},
1437 {"getcodesize", _getcodesize, 1},
1438 {NULL, NULL}
1439};
1440
1441void
1442#ifdef WIN32
1443__declspec(dllexport)
1444#endif
1445init_sre()
1446{
1447 /* Patch object types */
1448 Pattern_Type.ob_type = Match_Type.ob_type = &PyType_Type;
1449
1450 Py_InitModule("_sre", _functions);
1451}
1452
1453#endif