blob: 8d468444b574ae2eeca56a7210a5ae4bcaec8ded [file] [log] [blame]
Guido van Rossumb700df92000-03-31 14:59:30 +00001/* -*- Mode: C; tab-width: 4 -*-
2 *
3 * Secret Labs' Regular Expression Engine
4 * $Id$
5 *
6 * simple regular expression matching engine
7 *
8 * partial history:
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00009 * 99-10-24 fl created (based on the template matcher)
Guido van Rossumb700df92000-03-31 14:59:30 +000010 * 99-11-13 fl added categories, branching, and more (0.2)
11 * 99-11-16 fl some tweaks to compile on non-Windows platforms
12 * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
13 * 99-02-28 fl tons of changes (not all to the better ;-) (0.4)
14 * 99-03-06 fl first alpha, sort of (0.5)
15 * 99-03-14 fl removed most compatibility stuff (0.6)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000016 * 99-05-10 fl towards third alpha (0.8.2)
17 * 99-05-13 fl added experimental cursor stuff (0.8.3)
18 * 99-05-27 fl final bug hunt (0.8.4)
Guido van Rossumb700df92000-03-31 14:59:30 +000019 *
20 * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
21 *
22 * This code can only be used for 1.6 alpha testing. All other use
23 * require explicit permission from Secret Labs AB.
24 *
25 * Portions of this engine have been developed in cooperation with
26 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
27 * other compatibility work.
28 */
29
30#ifndef SRE_RECURSIVE
31
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000032char copyright[] = " SRE 0.8.4 Copyright (c) 1997-2000 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000033
34#include "Python.h"
35
36#include "sre.h"
37
38#include "unicodeobject.h"
39
40#if defined(HAVE_LIMITS_H)
41#include <limits.h>
42#else
43#define INT_MAX 2147483647
44#endif
45
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000046#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000047
48/* defining this one enables tracing */
49#undef DEBUG
50
51#ifdef WIN32 /* FIXME: <fl> don't assume Windows == MSVC */
52#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
53/* fastest possible local call under MSVC */
54#define LOCAL(type) static __inline type __fastcall
55#else
56#define LOCAL(type) static type
57#endif
58
59/* error codes */
60#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
61#define SRE_ERROR_MEMORY -9 /* out of memory */
62
63#ifdef DEBUG
64#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000065#else
66#define TRACE(v)
67#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000068#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
Guido van Rossumb700df92000-03-31 14:59:30 +000069
70#define SRE_CODE unsigned short /* unsigned short or larger */
71
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000072/* -------------------------------------------------------------------- */
73/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000074
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000075/* unicode character predicates */
76#define SRE_TO_LOWER(ch) Py_UNICODE_TOLOWER((Py_UNICODE)(ch))
77#define SRE_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
78#define SRE_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
79#define SRE_IS_LINEBREAK(ch) ((ch) == '\n')
80/* #define SRE_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) */
Guido van Rossumb700df92000-03-31 14:59:30 +000081#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +000082#define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
83
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000084/* locale-specific character predicates */
85#define SRE_LOC_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
86#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
87#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
88#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
89#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
90#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
91
Guido van Rossumb700df92000-03-31 14:59:30 +000092LOCAL(int)
93sre_category(SRE_CODE category, unsigned int ch)
94{
95 switch (category) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000096 case SRE_CATEGORY_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +000097 return SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000098 case SRE_CATEGORY_NOT_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +000099 return !SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000100 case SRE_CATEGORY_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000101 return SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000102 case SRE_CATEGORY_NOT_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000103 return !SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000104 case SRE_CATEGORY_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000105 return SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000106 case SRE_CATEGORY_NOT_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000107 return !SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000108 case SRE_CATEGORY_LINEBREAK:
109 return SRE_IS_LINEBREAK(ch);
110 case SRE_CATEGORY_NOT_LINEBREAK:
111 return !SRE_IS_LINEBREAK(ch);
112 case SRE_CATEGORY_LOC_DIGIT:
113 return SRE_LOC_IS_DIGIT(ch);
114 case SRE_CATEGORY_LOC_NOT_DIGIT:
115 return !SRE_LOC_IS_DIGIT(ch);
116 case SRE_CATEGORY_LOC_SPACE:
117 return SRE_LOC_IS_SPACE(ch);
118 case SRE_CATEGORY_LOC_NOT_SPACE:
119 return !SRE_LOC_IS_SPACE(ch);
120 case SRE_CATEGORY_LOC_WORD:
121 return SRE_LOC_IS_WORD(ch);
122 case SRE_CATEGORY_LOC_NOT_WORD:
123 return !SRE_LOC_IS_WORD(ch);
124 case SRE_CATEGORY_LOC_LINEBREAK:
125 return SRE_LOC_IS_LINEBREAK(ch);
126 case SRE_CATEGORY_LOC_NOT_LINEBREAK:
127 return !SRE_LOC_IS_LINEBREAK(ch);
Guido van Rossumb700df92000-03-31 14:59:30 +0000128 }
129 return 0;
130}
131
132/* helpers */
133
134LOCAL(int)
135_stack_free(SRE_STATE* state)
136{
137 if (state->stack) {
138 TRACE(("release stack\n"));
139 free(state->stack);
140 state->stack = NULL;
141 }
142 state->stacksize = 0;
143 return 0;
144}
145
146static int /* shouldn't be LOCAL */
147_stack_extend(SRE_STATE* state, int lo, int hi)
148{
149 void** stack;
150 int stacksize;
151
152 /* grow the stack to a suitable size; we need at least lo entries,
153 at most hi entries. if for some reason hi is lower than lo, lo
154 wins */
155
156 stacksize = state->stacksize;
157
158 if (stacksize == 0) {
159 /* create new stack */
160 stacksize = 512;
161 if (stacksize < lo)
162 stacksize = lo;
163 else if (stacksize > hi)
164 stacksize = hi;
165 TRACE(("allocate stack %d\n", stacksize));
166 stack = malloc(sizeof(void*) * stacksize);
167 } else {
168 /* grow the stack (typically by a factor of two) */
169 while (stacksize < lo)
170 stacksize = 2 * stacksize;
171 /* FIXME: <fl> could trim size if it's larger than lo, and
172 much larger than hi */
173 TRACE(("grow stack to %d\n", stacksize));
174 stack = realloc(state->stack, sizeof(void*) * stacksize);
175 }
176
177 if (!stack) {
178 _stack_free(state);
179 return SRE_ERROR_MEMORY;
180 }
181
182 state->stack = stack;
183 state->stacksize = stacksize;
184
185 return 0;
186}
187
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000188/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000189
190#define SRE_CHAR unsigned char
191#define SRE_AT sre_at
192#define SRE_MEMBER sre_member
193#define SRE_MATCH sre_match
194#define SRE_SEARCH sre_search
195#define SRE_RECURSIVE
196
197#include "_sre.c"
198
199#undef SRE_RECURSIVE
200#undef SRE_SEARCH
201#undef SRE_MATCH
202#undef SRE_MEMBER
203#undef SRE_AT
204#undef SRE_CHAR
205
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000206/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000207
208#define SRE_CHAR Py_UNICODE
209#define SRE_AT sre_uat
210#define SRE_MEMBER sre_umember
211#define SRE_MATCH sre_umatch
212#define SRE_SEARCH sre_usearch
213
214#endif /* SRE_RECURSIVE */
215
216/* -------------------------------------------------------------------- */
217/* String matching engine */
218
219/* the following section is compiled twice, with different character
220 settings */
221
222LOCAL(int)
223SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
224{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000225 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000226
227 int this, that;
228
229 switch (at) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000230 case SRE_AT_BEGINNING:
Guido van Rossum29530882000-04-10 17:06:55 +0000231 return ((void*) ptr == state->beginning);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000232 case SRE_AT_BEGINNING_LINE:
233 return ((void*) ptr == state->beginning ||
234 SRE_IS_LINEBREAK((int) ptr[-1]));
235 case SRE_AT_END:
Guido van Rossum29530882000-04-10 17:06:55 +0000236 return ((void*) ptr == state->end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000237 case SRE_AT_END_LINE:
238 return ((void*) ptr == state->end ||
239 SRE_IS_LINEBREAK((int) ptr[0]));
240 case SRE_AT_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000241 if (state->beginning == state->end)
242 return 0;
243 that = ((void*) ptr > state->beginning) ?
244 SRE_IS_WORD((int) ptr[-1]) : 0;
245 this = ((void*) ptr < state->end) ?
246 SRE_IS_WORD((int) ptr[0]) : 0;
247 return this != that;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000248 case SRE_AT_NON_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000249 if (state->beginning == state->end)
250 return 0;
251 that = ((void*) ptr > state->beginning) ?
252 SRE_IS_WORD((int) ptr[-1]) : 0;
253 this = ((void*) ptr < state->end) ?
254 SRE_IS_WORD((int) ptr[0]) : 0;
255 return this == that;
256 }
257
258 return 0;
259}
260
261LOCAL(int)
262SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
263{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000264 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000265
266 int ok = 1;
267
268 for (;;) {
269 switch (*set++) {
270
271 case SRE_OP_NEGATE:
272 ok = !ok;
273 break;
274
275 case SRE_OP_FAILURE:
276 return !ok;
277
278 case SRE_OP_LITERAL:
279 if (ch == (SRE_CHAR) set[0])
280 return ok;
281 set++;
282 break;
283
284 case SRE_OP_RANGE:
285 if ((SRE_CHAR) set[0] <= ch && ch <= (SRE_CHAR) set[1])
286 return ok;
287 set += 2;
288 break;
289
290 case SRE_OP_CATEGORY:
291 if (sre_category(set[0], (int) ch))
292 return ok;
293 set += 1;
294 break;
295
296 default:
297 /* FIXME: internal error */
298 return 0;
299 }
300 }
301}
302
303LOCAL(int)
304SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
305{
306 /* check if string matches the given pattern. returns -1 for
307 error, 0 for failure, and 1 for success */
308
309 SRE_CHAR* end = state->end;
310 SRE_CHAR* ptr = state->ptr;
311 int stacksize;
312 int stackbase;
313 int i, count;
314
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000315 /* FIXME: this is one ugly hack */
316 void* *mark = NULL;
317 void* mark_data[64];
Guido van Rossumb700df92000-03-31 14:59:30 +0000318
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000319 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000320
321 switch (*pattern++) {
322
323 case SRE_OP_FAILURE:
324 /* immediate failure */
325 TRACE(("%8d: failure\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000326 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000327
328 case SRE_OP_SUCCESS:
329 /* end of pattern */
330 TRACE(("%8d: success\n", PTR(ptr)));
331 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000332 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000333
334 case SRE_OP_AT:
335 /* match at given position */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000336 /* args: <at> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000337 TRACE(("%8d: match at \\%c\n", PTR(ptr), *pattern));
338 if (!SRE_AT(state, ptr, *pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000339 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000340 pattern++;
341 break;
342
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000343 case SRE_OP_CATEGORY:
344 /* match at given category */
345 /* args: <category> */
346 TRACE(("%8d: category match at \\%c\n", PTR(ptr), *pattern));
347 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
348 goto failure;
349 pattern++;
350 ptr++;
351 break;
352
Guido van Rossumb700df92000-03-31 14:59:30 +0000353 case SRE_OP_LITERAL:
354 /* match literal character */
355 /* args: <code> */
356 TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) *pattern));
357 if (ptr >= end || *ptr != (SRE_CHAR) *pattern)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000358 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000359 pattern++;
360 ptr++;
361 break;
362
363 case SRE_OP_NOT_LITERAL:
364 /* match anything that is not literal character */
365 /* args: <code> */
366 TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) *pattern));
367 if (ptr >= end || *ptr == (SRE_CHAR) *pattern)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000368 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000369 pattern++;
370 ptr++;
371 break;
372
373 case SRE_OP_ANY:
374 /* match anything */
375 TRACE(("%8d: any\n", PTR(ptr)));
376 if (ptr >= end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000377 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000378 ptr++;
379 break;
380
381 case SRE_OP_IN:
382 /* match set member (or non_member) */
383 /* args: <skip> <set> */
384 TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
385 if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000386 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000387 pattern += pattern[0];
388 ptr++;
389 break;
390
391 case SRE_OP_GROUP:
392 /* match backreference */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000393 TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000394 i = pattern[0];
395 {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000396 /* FIXME: optimize! */
Guido van Rossumb700df92000-03-31 14:59:30 +0000397 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
398 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000399 TRACE(("%8d: group %p %p\n", PTR(ptr), p, e));
Guido van Rossumb700df92000-03-31 14:59:30 +0000400 if (!p || !e || e < p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000401 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000402 while (p < e) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000403 TRACE(("%8d: group test %c %c\n", PTR(ptr), *ptr, *p));
Guido van Rossumb700df92000-03-31 14:59:30 +0000404 if (ptr >= end || *ptr != *p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000405 goto failure;
406 p++; ptr++;
407 }
408 }
409 pattern++;
410 break;
411
412 case SRE_OP_GROUP_IGNORE:
413 /* match backreference */
414 TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
415 i = pattern[0];
416 {
417 /* FIXME: optimize! */
418 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
419 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
420 TRACE(("%8d: group %p %p\n", PTR(ptr), p, e));
421 if (!p || !e || e < p)
422 goto failure;
423 while (p < e) {
424 TRACE(("%8d: group test %c %c\n", PTR(ptr), *ptr, *p));
425 if (ptr >= end || SRE_TO_LOWER(*ptr) != SRE_TO_LOWER(*p))
426 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000427 p++; ptr++;
428 }
429 }
430 pattern++;
431 break;
432
433 case SRE_OP_LITERAL_IGNORE:
434 TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern));
435 if (ptr >= end || SRE_TO_LOWER(*ptr) != (SRE_CHAR) *pattern)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000436 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000437 pattern++;
438 ptr++;
439 break;
440
441 case SRE_OP_NOT_LITERAL_IGNORE:
442 TRACE(("%8d: literal not lower(%c)\n", PTR(ptr),
443 (SRE_CHAR) *pattern));
444 if (ptr >= end || SRE_TO_LOWER(*ptr) == (SRE_CHAR) *pattern)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000445 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000446 pattern++;
447 ptr++;
448 break;
449
450 case SRE_OP_IN_IGNORE:
451 TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
452 if (ptr >= end
453 || !SRE_MEMBER(pattern+1, (SRE_CHAR) SRE_TO_LOWER(*ptr)))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000454 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000455 pattern += pattern[0];
456 ptr++;
457 break;
458
459 case SRE_OP_MARK:
460 /* set mark */
461 /* args: <mark> */
462 TRACE(("%8d: set mark(%d)\n", PTR(ptr), pattern[0]));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000463 if (!mark) {
464 mark = mark_data;
465 memcpy(mark, state->mark, sizeof(state->mark));
466 }
467 state->mark[pattern[0]] = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000468 pattern++;
469 break;
470
471 case SRE_OP_JUMP:
472 /* jump forward */
473 /* args: <skip> */
474 TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
475 pattern += pattern[0];
476 break;
477
478 case SRE_OP_CALL:
479 /* match subpattern, without backtracking */
480 /* args: <skip> <pattern> */
481 TRACE(("%8d: match subpattern\n", PTR(ptr)));
482 state->ptr = ptr;
483 if (!SRE_MATCH(state, pattern + 1))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000484 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000485 pattern += pattern[0];
486 ptr = state->ptr;
487 break;
488
489 case SRE_OP_MAX_REPEAT_ONE:
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000490 /* match repeated sequence (maximizing regexp) */
491 /* this variant only works if the repeated item is exactly
492 one character wide, and we're not already collecting
493 backtracking points. for other cases, use the
Guido van Rossumb700df92000-03-31 14:59:30 +0000494 MAX_REPEAT operator instead */
Guido van Rossumb700df92000-03-31 14:59:30 +0000495 /* args: <skip> <min> <max> <step> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000496 TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
497 pattern[1], pattern[2]));
498
499 count = 0;
500
501 if (pattern[3] == SRE_OP_ANY) {
502 /* repeated wildcard. skip to the end of the target
503 string, and backtrack from there */
504 /* FIXME: must look for line endings */
505 if (ptr + pattern[1] > end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000506 goto failure; /* cannot match */
Guido van Rossumb700df92000-03-31 14:59:30 +0000507 count = pattern[2];
508 if (count > end - ptr)
509 count = end - ptr;
510 ptr += count;
511
512 } else if (pattern[3] == SRE_OP_LITERAL) {
513 /* repeated literal */
514 SRE_CHAR chr = (SRE_CHAR) pattern[4];
515 while (count < (int) pattern[2]) {
516 if (ptr >= end || *ptr != chr)
517 break;
518 ptr++;
519 count++;
520 }
521
522 } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
523 /* repeated literal */
524 SRE_CHAR chr = (SRE_CHAR) pattern[4];
525 while (count < (int) pattern[2]) {
526 if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) != chr)
527 break;
528 ptr++;
529 count++;
530 }
531
532 } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
533 /* repeated non-literal */
534 SRE_CHAR chr = (SRE_CHAR) pattern[4];
535 while (count < (int) pattern[2]) {
536 if (ptr >= end || *ptr == chr)
537 break;
538 ptr++;
539 count++;
540 }
541
542 } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
543 /* repeated non-literal */
544 SRE_CHAR chr = (SRE_CHAR) pattern[4];
545 while (count < (int) pattern[2]) {
546 if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) == chr)
547 break;
548 ptr++;
549 count++;
550 }
551
552 } else if (pattern[3] == SRE_OP_IN) {
553 /* repeated set */
554 while (count < (int) pattern[2]) {
555 if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
556 break;
557 ptr++;
558 count++;
559 }
560
561 } else {
562 /* repeated single character pattern */
563 state->ptr = ptr;
564 while (count < (int) pattern[2]) {
565 i = SRE_MATCH(state, pattern + 3);
566 if (i < 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000567 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000568 if (i == 0)
569 break;
570 count++;
571 }
572 state->ptr = ptr;
573 ptr += count;
574 }
575
576 /* when we arrive here, count contains the number of
577 matches, and ptr points to the tail of the target
578 string. check if the rest of the pattern matches, and
579 backtrack if not. */
580
Guido van Rossumb700df92000-03-31 14:59:30 +0000581 TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
582
583 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000584 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000585
586 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
587 /* tail is empty. we're finished */
588 TRACE(("%8d: tail is empty\n", PTR(ptr)));
589 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000590 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000591
592 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000593 /* tail starts with a literal. skip positions where
594 the rest of the pattern cannot possibly match */
Guido van Rossumb700df92000-03-31 14:59:30 +0000595 SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1];
596 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
597 for (;;) {
598 TRACE(("%8d: scan for tail match\n", PTR(ptr)));
599 while (count >= (int) pattern[1] &&
600 (ptr >= end || *ptr != chr)) {
601 ptr--;
602 count--;
603 }
604 TRACE(("%8d: check tail\n", PTR(ptr)));
605 if (count < (int) pattern[1])
606 break;
607 state->ptr = ptr;
608 i = SRE_MATCH(state, pattern + pattern[0]);
609 if (i > 0) {
610 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000611 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000612 }
613 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
614 ptr--;
615 count--;
616 }
617
618 } else {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000619 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +0000620 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
621 while (count >= (int) pattern[1]) {
622 state->ptr = ptr;
623 i = SRE_MATCH(state, pattern + pattern[0]);
624 if (i > 0) {
625 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000626 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000627 }
628 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
629 ptr--;
630 count--;
631 }
632 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000633 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000634
635 case SRE_OP_MAX_REPEAT:
636 /* match repeated sequence (maximizing regexp). repeated
637 group should end with a MAX_UNTIL code */
638
639 TRACE(("%8d: max repeat %d %d\n", PTR(ptr),
640 pattern[1], pattern[2]));
641
642 count = 0;
643 state->ptr = ptr;
644
645 /* FIXME: <fl> umm. what about matching the minimum
646 number of items before starting to collect backtracking
647 positions? */
648
649 stackbase = state->stackbase;
650
651 while (count < (int) pattern[2]) {
652 /* store current position on the stack */
653 TRACE(("%8d: push mark at index %d\n", PTR(ptr), count));
654 if (stackbase + count >= state->stacksize) {
655 i = _stack_extend(state, stackbase + count + 1,
656 stackbase + pattern[2]);
657 if (i < 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000658 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000659 }
660 state->stack[stackbase + count] = ptr;
661 /* check if we can match another item */
662 state->stackbase += count + 1;
663 i = SRE_MATCH(state, pattern + 3);
664 state->stackbase = stackbase; /* rewind */
665 if (i != 2)
666 break;
667 if (state->ptr == ptr) {
668 /* if the match was successful but empty, set the
669 count to max and terminate the scanning loop */
670 stacksize = count; /* actual size of stack */
671 count = (int) pattern[2];
672 goto check_tail; /* FIXME: <fl> eliminate goto */
673 }
674 count++;
675 ptr = state->ptr;
676
677 }
678
679 stacksize = count; /* actual number of entries on the stack */
680
681 check_tail:
682
683 /* when we get here, count is the number of matches,
684 stacksize is the number of match points on the stack
685 (usually same as count, but it might be smaller) and
686 ptr points to the tail. */
687
688 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000689 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000690
691 /* make sure that rest of the expression matches. if it
692 doesn't, backtrack */
693
694 TRACE(("%8d: repeat %d found (stack size = %d)\n", PTR(ptr),
695 count, stacksize + 1));
696
697 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
698
699 /* hope for the best */
700 state->ptr = ptr;
701 state->stackbase += stacksize + 1;
702 i = SRE_MATCH(state, pattern + pattern[0]);
703 state->stackbase = stackbase;
704 if (i > 0) {
705 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000706 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000707 }
708
709 /* backtrack! */
710 while (count >= (int) pattern[1]) {
711 ptr = state->stack[stackbase + (count < stacksize ? count : stacksize)];
712 state->ptr = ptr;
713 count--;
714 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
715 state->stackbase += stacksize + 1;
716 i = SRE_MATCH(state, pattern + pattern[0]);
717 state->stackbase = stackbase;
718 if (i > 0) {
719 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000720 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000721 }
722 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000723 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000724
725 case SRE_OP_MAX_UNTIL:
726 /* match repeated sequence (maximizing regexp). repeated
727 group should end with a MAX_UNTIL code */
728
729 TRACE(("%8d: max until\n", PTR(ptr)));
730 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000731 goto success; /* always succeeds, for now... */
Guido van Rossumb700df92000-03-31 14:59:30 +0000732
733 case SRE_OP_MIN_REPEAT:
734 /* match repeated sequence (minimizing regexp) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000735 /* FIXME: HERE BE BUGS! */
Guido van Rossumb700df92000-03-31 14:59:30 +0000736 TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
737 pattern[1], pattern[2]));
738 count = 0;
739 state->ptr = ptr;
740 /* match minimum number of items */
741 while (count < (int) pattern[1]) {
742 i = SRE_MATCH(state, pattern + 3);
743 if (i <= 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000744 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000745 count++;
746 }
747 /* move forward until the tail matches. */
748 while (count <= (int) pattern[2]) {
749 ptr = state->ptr;
750 i = SRE_MATCH(state, pattern + pattern[0]);
751 if (i > 0) {
752 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000753 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000754 }
755 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
756 state->ptr = ptr; /* backtrack */
757 i = SRE_MATCH(state, pattern + 3);
758 if (i <= 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000759 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000760 count++;
761 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000762 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000763
764 case SRE_OP_MIN_UNTIL:
765 /* end of repeat group */
766 TRACE(("%8d: min until\n", PTR(ptr)));
767 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000768 goto success; /* always succeeds, for now... */
Guido van Rossumb700df92000-03-31 14:59:30 +0000769
770 case SRE_OP_BRANCH:
771 /* match one of several subpatterns */
772 /* format: <branch> <size> <head> ... <null> <tail> */
773 TRACE(("%8d: branch\n", PTR(ptr)));
774 while (*pattern) {
775 if (pattern[1] != SRE_OP_LITERAL ||
776 (ptr < end && *ptr == (SRE_CHAR) pattern[2])) {
777 TRACE(("%8d: branch check\n", PTR(ptr)));
778 state->ptr = ptr;
779 i = SRE_MATCH(state, pattern + 1);
780 if (i > 0) {
781 TRACE(("%8d: branch succeeded\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000782 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000783 }
784 }
785 pattern += *pattern;
786 }
787 TRACE(("%8d: branch failed\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000788 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000789
790 case SRE_OP_REPEAT:
791 /* TEMPLATE: match repeated sequence (no backtracking) */
792 /* format: <repeat> <skip> <min> <max> */
793 TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
794 count = 0;
795 state->ptr = ptr;
796 while (count < (int) pattern[2]) {
797 i = SRE_MATCH(state, pattern + 3);
798 if (i <= 0)
799 break;
800 count++;
801 }
802 if (count <= (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000803 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000804 TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
805 pattern += pattern[0];
806 ptr = state->ptr;
807 break;
808
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000809 default:
Guido van Rossumb700df92000-03-31 14:59:30 +0000810 return SRE_ERROR_ILLEGAL;
811 }
812 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000813
814 failure:
815 if (mark)
816 memcpy(state->mark, mark, sizeof(state->mark));
817 return 0;
818
819 success:
820 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000821}
822
823LOCAL(int)
824SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
825{
826 SRE_CHAR* ptr = state->start;
827 SRE_CHAR* end = state->end;
828 int status = 0;
829
830 /* FIXME: <fl> add IGNORE cases (or implement full ASSERT support? */
831
832 if (pattern[0] == SRE_OP_LITERAL) {
833 /* pattern starts with a literal */
834 SRE_CHAR chr = (SRE_CHAR) pattern[1];
835 for (;;) {
836 while (ptr < end && *ptr != chr)
837 ptr++;
838 if (ptr == end)
839 return 0;
840 TRACE(("%8d: search found literal\n", PTR(ptr)));
841 state->start = ptr;
842 state->ptr = ++ptr;
843 status = SRE_MATCH(state, pattern + 2);
844 if (status != 0)
845 break;
846 }
847
848 } else if (pattern[0] == SRE_OP_IN) {
849 /* pattern starts with a set */
850 for (;;) {
851 /* format: <in> <skip> <data> */
852 while (ptr < end && !SRE_MEMBER(pattern + 2, *ptr))
853 ptr++;
854 if (ptr == end)
855 return 0;
856 TRACE(("%8d: search found set\n", PTR(ptr)));
857 state->start = ptr;
858 state->ptr = ++ptr;
859 status = SRE_MATCH(state, pattern + pattern[1] + 1);
860 if (status != 0)
861 break;
862 }
863
864 } else
865 while (ptr <= end) {
866 TRACE(("%8d: search\n", PTR(ptr)));
867 state->start = state->ptr = ptr++;
868 status = SRE_MATCH(state, pattern);
869 if (status != 0)
870 break;
871 }
872
873 return status;
874}
875
876#ifndef SRE_RECURSIVE
877
878/* -------------------------------------------------------------------- */
879/* factories and destructors */
880
881/* see sre.h for object declarations */
882
883staticforward PyTypeObject Pattern_Type;
884staticforward PyTypeObject Match_Type;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000885staticforward PyTypeObject Cursor_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +0000886
887static PyObject *
888_compile(PyObject* self_, PyObject* args)
889{
890 /* "compile" pattern descriptor to pattern object */
891
892 PatternObject* self;
893
894 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000895 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000896 PyObject* code;
897 int groups = 0;
898 PyObject* groupindex = NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000899 if (!PyArg_ParseTuple(args, "OiO!|iO", &pattern, &flags,
900 &PyString_Type, &code,
901 &groups, &groupindex))
Guido van Rossumb700df92000-03-31 14:59:30 +0000902 return NULL;
903
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000904 self = PyObject_NEW(PatternObject, &Pattern_Type);
Guido van Rossumb700df92000-03-31 14:59:30 +0000905 if (self == NULL)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000906
Guido van Rossumb700df92000-03-31 14:59:30 +0000907 return NULL;
908
909 Py_INCREF(pattern);
910 self->pattern = pattern;
911
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000912 self->flags = flags;
913
Guido van Rossumb700df92000-03-31 14:59:30 +0000914 Py_INCREF(code);
915 self->code = code;
916
917 self->groups = groups;
918
919 Py_XINCREF(groupindex);
920 self->groupindex = groupindex;
921
922 return (PyObject*) self;
923}
924
925static PyObject *
926_getcodesize(PyObject* self_, PyObject* args)
927{
928 return Py_BuildValue("i", sizeof(SRE_CODE));
929}
930
Guido van Rossumb700df92000-03-31 14:59:30 +0000931LOCAL(PyObject*)
932_setup(SRE_STATE* state, PyObject* args)
933{
934 /* prepare state object */
935
936 PyBufferProcs *buffer;
937 int i, count;
938 void* ptr;
939
940 PyObject* string;
941 int start = 0;
942 int end = INT_MAX;
943 if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
944 return NULL;
945
946 /* get pointer to string buffer */
947 buffer = string->ob_type->tp_as_buffer;
948 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
949 buffer->bf_getsegcount(string, NULL) != 1) {
950 PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
951 return NULL;
952 }
953
954 /* determine buffer size */
955 count = buffer->bf_getreadbuffer(string, 0, &ptr);
956 if (count < 0) {
957 /* sanity check */
958 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
959 return NULL;
960 }
961
962 /* determine character size */
963 state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
964
965 count /= state->charsize;
966
967 /* adjust boundaries */
968 if (start < 0)
969 start = 0;
970 else if (start > count)
971 start = count;
972
973 if (end < 0)
974 end = 0;
975 else if (end > count)
976 end = count;
977
978 state->beginning = ptr;
979
980 state->start = (void*) ((char*) ptr + start * state->charsize);
981 state->end = (void*) ((char*) ptr + end * state->charsize);
982
983 /* FIXME: dynamic! */
984 for (i = 0; i < 64; i++)
985 state->mark[i] = NULL;
986
987 state->stack = NULL;
988 state->stackbase = 0;
989 state->stacksize = 0;
990
991 return string;
992}
993
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000994static PyObject*
995_pattern_new_match(PatternObject* pattern, SRE_STATE* state,
996 PyObject* string, int status)
997{
998 /* create match object (from state object) */
999
1000 MatchObject* match;
1001 int i, j;
1002
1003 TRACE(("status = %d\n", status));
1004
1005 if (status > 0) {
1006
1007 /* create match object (with room for extra group marks) */
1008 match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
1009 if (match == NULL)
1010 return NULL;
1011
1012 Py_INCREF(pattern);
1013 match->pattern = pattern;
1014
1015 Py_INCREF(string);
1016 match->string = string;
1017
1018 match->groups = pattern->groups+1;
1019
1020 /* group zero */
1021 match->mark[0] = ((char*) state->start -
1022 (char*) state->beginning) / state->charsize;
1023 match->mark[1] = ((char*) state->ptr -
1024 (char*) state->beginning) / state->charsize;
1025
1026 /* fill in the rest of the groups */
1027 for (i = j = 0; i < pattern->groups; i++, j+=2)
1028 if (state->mark[j] != NULL && state->mark[j+1] != NULL) {
1029 match->mark[j+2] = ((char*) state->mark[j] -
1030 (char*) state->beginning) / state->charsize;
1031 match->mark[j+3] = ((char*) state->mark[j+1] -
1032 (char*) state->beginning) / state->charsize;
1033 } else
1034 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1035
1036 return (PyObject*) match;
1037
1038 } else if (status < 0) {
1039
1040 /* internal error */
1041 PyErr_SetString(
1042 PyExc_RuntimeError, "internal error in regular expression engine"
1043 );
1044 return NULL;
1045
1046 }
1047
1048 Py_INCREF(Py_None);
1049 return Py_None;
1050}
1051
1052static PyObject*
1053_pattern_cursor(PyObject* pattern, PyObject* args)
1054{
1055 /* create search state object */
1056
1057 CursorObject* self;
1058 PyObject* string;
1059
1060 /* create match object (with room for extra group marks) */
1061 self = PyObject_NEW(CursorObject, &Cursor_Type);
1062 if (self == NULL)
1063 return NULL;
1064
1065 string = _setup(&self->state, args);
1066 if (!string) {
1067 /* FIXME: dealloc cursor object */
1068 return NULL;
1069 }
1070
1071 Py_INCREF(pattern);
1072 self->pattern = pattern;
1073
1074 Py_INCREF(string);
1075 self->string = string;
1076
1077 return (PyObject*) self;
1078}
1079
Guido van Rossumb700df92000-03-31 14:59:30 +00001080static void
1081_pattern_dealloc(PatternObject* self)
1082{
1083 Py_XDECREF(self->code);
1084 Py_XDECREF(self->pattern);
1085 Py_XDECREF(self->groupindex);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001086 PyMem_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001087}
1088
1089static PyObject*
1090_pattern_match(PatternObject* self, PyObject* args)
1091{
1092 SRE_STATE state;
1093 PyObject* string;
1094 int status;
1095
1096 string = _setup(&state, args);
1097 if (!string)
1098 return NULL;
1099
1100 state.ptr = state.start;
1101
1102 if (state.charsize == 1) {
1103 status = sre_match(&state, PatternObject_GetCode(self));
1104 } else {
1105 status = sre_umatch(&state, PatternObject_GetCode(self));
1106 }
1107
1108 _stack_free(&state);
1109
1110 return _pattern_new_match(self, &state, string, status);
1111}
1112
1113static PyObject*
1114_pattern_search(PatternObject* self, PyObject* args)
1115{
1116 SRE_STATE state;
1117 PyObject* string;
1118 int status;
1119
1120 string = _setup(&state, args);
1121 if (!string)
1122 return NULL;
1123
1124 if (state.charsize == 1) {
1125 status = sre_search(&state, PatternObject_GetCode(self));
1126 } else {
1127 status = sre_usearch(&state, PatternObject_GetCode(self));
1128 }
1129
1130 _stack_free(&state);
1131
1132 return _pattern_new_match(self, &state, string, status);
1133}
1134
1135static PyObject*
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001136call(char* function, PyObject* args)
1137{
1138 PyObject* name;
1139 PyObject* module;
1140 PyObject* func;
1141 PyObject* result;
1142
1143 name = PyString_FromString("sre");
1144 if (!name)
1145 return NULL;
1146 module = PyImport_Import(name);
1147 Py_DECREF(name);
1148 if (!module)
1149 return NULL;
1150 func = PyObject_GetAttrString(module, function);
1151 Py_DECREF(module);
1152 if (!func)
1153 return NULL;
1154 result = PyObject_CallObject(func, args);
1155 Py_DECREF(func);
1156 Py_DECREF(args);
1157 return result;
1158}
1159
1160static PyObject*
1161_pattern_sub(PatternObject* self, PyObject* args)
1162{
1163 PyObject* template;
1164 PyObject* string;
1165 PyObject* count;
1166 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1167 return NULL;
1168
1169 /* delegate to Python code */
1170 return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
1171}
1172
1173static PyObject*
1174_pattern_subn(PatternObject* self, PyObject* args)
1175{
1176 PyObject* template;
1177 PyObject* string;
1178 PyObject* count;
1179 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1180 return NULL;
1181
1182 /* delegate to Python code */
1183 return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
1184}
1185
1186static PyObject*
1187_pattern_split(PatternObject* self, PyObject* args)
1188{
1189 PyObject* string;
1190 PyObject* maxsplit;
1191 if (!PyArg_ParseTuple(args, "OO", &string, &maxsplit))
1192 return NULL;
1193
1194 /* delegate to Python code */
1195 return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
1196}
1197
1198static PyObject*
Guido van Rossumb700df92000-03-31 14:59:30 +00001199_pattern_findall(PatternObject* self, PyObject* args)
1200{
Guido van Rossumb700df92000-03-31 14:59:30 +00001201 SRE_STATE state;
1202 PyObject* string;
1203 PyObject* list;
1204 int status;
1205
1206 string = _setup(&state, args);
1207 if (!string)
1208 return NULL;
1209
1210 list = PyList_New(0);
1211
1212 while (state.start < state.end) {
1213
1214 PyObject* item;
1215
1216 state.ptr = state.start;
1217
1218 if (state.charsize == 1) {
1219 status = sre_match(&state, PatternObject_GetCode(self));
1220 } else {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001221 status = sre_umatch(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +00001222 }
1223
1224 if (status >= 0) {
1225
1226 if (status == 0)
1227 state.ptr = (void*) ((char*) state.start + 1);
1228
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001229 /* FIXME: if one group is defined, slice that group
1230 instead. if multiple groups are defined, add tuple
1231 containing all slices */
1232
Guido van Rossumb700df92000-03-31 14:59:30 +00001233 item = PySequence_GetSlice(
1234 string,
1235 ((char*) state.start - (char*) state.beginning),
1236 ((char*) state.ptr - (char*) state.beginning));
1237 if (!item)
1238 goto error;
1239 if (PyList_Append(list, item) < 0)
1240 goto error;
1241
1242 state.start = state.ptr;
1243
1244 } else {
1245
1246 /* internal error */
1247 PyErr_SetString(
1248 PyExc_RuntimeError,
1249 "internal error in regular expression engine"
1250 );
1251 goto error;
1252
1253 }
1254 }
1255
1256 _stack_free(&state);
1257
1258 return list;
1259
1260error:
1261 _stack_free(&state);
1262 return NULL;
1263
1264}
1265
1266static PyMethodDef _pattern_methods[] = {
1267 {"match", (PyCFunction) _pattern_match, 1},
1268 {"search", (PyCFunction) _pattern_search, 1},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001269 {"sub", (PyCFunction) _pattern_sub, 1},
1270 {"subn", (PyCFunction) _pattern_subn, 1},
1271 {"split", (PyCFunction) _pattern_split, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001272 {"findall", (PyCFunction) _pattern_findall, 1},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001273 /* experimental */
1274 {"cursor", (PyCFunction) _pattern_cursor, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001275 {NULL, NULL}
1276};
1277
1278static PyObject*
1279_pattern_getattr(PatternObject* self, char* name)
1280{
1281 PyObject* res;
1282
1283 res = Py_FindMethod(_pattern_methods, (PyObject*) self, name);
1284
1285 if (res)
1286 return res;
1287
1288 PyErr_Clear();
1289
1290 /* attributes */
1291 if (!strcmp(name, "pattern")) {
1292 Py_INCREF(self->pattern);
1293 return self->pattern;
1294 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001295
1296 if (!strcmp(name, "flags"))
1297 return Py_BuildValue("i", self->flags);
1298
1299 if (!strcmp(name, "groupindex") && self->groupindex) {
1300 Py_INCREF(self->groupindex);
1301 return self->groupindex;
1302 }
1303
Guido van Rossumb700df92000-03-31 14:59:30 +00001304 PyErr_SetString(PyExc_AttributeError, name);
1305 return NULL;
1306}
1307
1308statichere PyTypeObject Pattern_Type = {
1309 PyObject_HEAD_INIT(NULL)
1310 0, "Pattern", sizeof(PatternObject), 0,
1311 (destructor)_pattern_dealloc, /*tp_dealloc*/
1312 0, /*tp_print*/
1313 (getattrfunc)_pattern_getattr, /*tp_getattr*/
1314};
1315
1316/* -------------------------------------------------------------------- */
1317/* match methods */
1318
1319static void
1320_match_dealloc(MatchObject* self)
1321{
1322 Py_XDECREF(self->string);
1323 Py_DECREF(self->pattern);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001324 PyMem_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001325}
1326
1327static PyObject*
1328getslice_by_index(MatchObject* self, int index)
1329{
1330 if (index < 0 || index >= self->groups) {
1331 /* raise IndexError if we were given a bad group number */
1332 PyErr_SetString(
1333 PyExc_IndexError,
1334 "no such group"
1335 );
1336 return NULL;
1337 }
1338
1339 if (self->string == Py_None || self->mark[index+index] < 0) {
1340 /* return None if the string or group is undefined */
1341 Py_INCREF(Py_None);
1342 return Py_None;
1343 }
1344
1345 return PySequence_GetSlice(
1346 self->string, self->mark[index+index], self->mark[index+index+1]
1347 );
1348}
1349
1350static PyObject*
1351getslice(MatchObject* self, PyObject* index)
1352{
1353 if (!PyInt_Check(index) && self->pattern->groupindex != NULL) {
1354 /* FIXME: resource leak? */
1355 index = PyObject_GetItem(self->pattern->groupindex, index);
1356 if (!index)
1357 return NULL;
1358 }
1359
1360 if (PyInt_Check(index))
1361 return getslice_by_index(self, (int) PyInt_AS_LONG(index));
1362
1363 return getslice_by_index(self, -1); /* signal error */
1364}
1365
1366static PyObject*
1367_match_group(MatchObject* self, PyObject* args)
1368{
1369 PyObject* result;
1370 int i, size;
1371
1372 size = PyTuple_GET_SIZE(args);
1373
1374 switch (size) {
1375 case 0:
1376 result = getslice(self, Py_False); /* force error */
1377 break;
1378 case 1:
1379 result = getslice(self, PyTuple_GET_ITEM(args, 0));
1380 break;
1381 default:
1382 /* fetch multiple items */
1383 result = PyTuple_New(size);
1384 if (!result)
1385 return NULL;
1386 for (i = 0; i < size; i++) {
1387 PyObject* item = getslice(self, PyTuple_GET_ITEM(args, i));
1388 if (!item) {
1389 Py_DECREF(result);
1390 return NULL;
1391 }
1392 PyTuple_SET_ITEM(result, i, item);
1393 }
1394 break;
1395 }
1396 return result;
1397}
1398
1399static PyObject*
1400_match_groups(MatchObject* self, PyObject* args)
1401{
1402 PyObject* result;
1403 int index;
1404
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001405 /* FIXME: <fl> handle default value! */
1406
Guido van Rossumb700df92000-03-31 14:59:30 +00001407 result = PyTuple_New(self->groups-1);
1408 if (!result)
1409 return NULL;
1410
1411 for (index = 1; index < self->groups; index++) {
1412 PyObject* item;
1413 /* FIXME: <fl> handle default! */
1414 item = getslice_by_index(self, index);
1415 if (!item) {
1416 Py_DECREF(result);
1417 return NULL;
1418 }
1419 PyTuple_SET_ITEM(result, index-1, item);
1420 }
1421
1422 return result;
1423}
1424
1425static PyObject*
1426_match_groupdict(MatchObject* self, PyObject* args)
1427{
1428 PyObject* result;
1429 PyObject* keys;
1430 int index;
1431
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001432 /* FIXME: <fl> handle default value! */
1433
Guido van Rossumb700df92000-03-31 14:59:30 +00001434 result = PyDict_New();
1435 if (!result)
1436 return NULL;
1437 if (!self->pattern->groupindex)
1438 return result;
1439
1440 keys = PyMapping_Keys(self->pattern->groupindex);
1441 if (!keys)
1442 return NULL;
1443
1444 for (index = 0; index < PySequence_Length(keys); index++) {
1445 PyObject* key;
1446 PyObject* item;
1447 key = PySequence_GetItem(keys, index);
1448 if (!key) {
1449 Py_DECREF(keys);
1450 Py_DECREF(result);
1451 return NULL;
1452 }
1453 item = getslice(self, key);
1454 if (!item) {
1455 Py_DECREF(key);
1456 Py_DECREF(keys);
1457 Py_DECREF(result);
1458 return NULL;
1459 }
1460 /* FIXME: <fl> this can fail, right? */
1461 PyDict_SetItem(result, key, item);
1462 }
1463
1464 Py_DECREF(keys);
1465
1466 return result;
1467}
1468
1469static PyObject*
1470_match_start(MatchObject* self, PyObject* args)
1471{
1472 int index = 0;
1473 if (!PyArg_ParseTuple(args, "|i", &index))
1474 return NULL;
1475
1476 if (index < 0 || index >= self->groups) {
1477 PyErr_SetString(
1478 PyExc_IndexError,
1479 "no such group"
1480 );
1481 return NULL;
1482 }
1483
1484 if (self->mark[index*2] < 0) {
1485 Py_INCREF(Py_None);
1486 return Py_None;
1487 }
1488
1489 return Py_BuildValue("i", self->mark[index*2]);
1490}
1491
1492static PyObject*
1493_match_end(MatchObject* self, PyObject* args)
1494{
1495 int index = 0;
1496 if (!PyArg_ParseTuple(args, "|i", &index))
1497 return NULL;
1498
1499 if (index < 0 || index >= self->groups) {
1500 PyErr_SetString(
1501 PyExc_IndexError,
1502 "no such group"
1503 );
1504 return NULL;
1505 }
1506
1507 if (self->mark[index*2] < 0) {
1508 Py_INCREF(Py_None);
1509 return Py_None;
1510 }
1511
1512 return Py_BuildValue("i", self->mark[index*2+1]);
1513}
1514
1515static PyObject*
1516_match_span(MatchObject* self, PyObject* args)
1517{
1518 int index = 0;
1519 if (!PyArg_ParseTuple(args, "|i", &index))
1520 return NULL;
1521
1522 if (index < 0 || index >= self->groups) {
1523 PyErr_SetString(
1524 PyExc_IndexError,
1525 "no such group"
1526 );
1527 return NULL;
1528 }
1529
1530 if (self->mark[index*2] < 0) {
1531 Py_INCREF(Py_None);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001532 Py_INCREF(Py_None);
1533 return Py_BuildValue("OO", Py_None, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001534 }
1535
1536 return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
1537}
1538
1539static PyMethodDef _match_methods[] = {
1540 {"group", (PyCFunction) _match_group, 1},
1541 {"start", (PyCFunction) _match_start, 1},
1542 {"end", (PyCFunction) _match_end, 1},
1543 {"span", (PyCFunction) _match_span, 1},
1544 {"groups", (PyCFunction) _match_groups, 1},
1545 {"groupdict", (PyCFunction) _match_groupdict, 1},
1546 {NULL, NULL}
1547};
1548
1549static PyObject*
1550_match_getattr(MatchObject* self, char* name)
1551{
1552 PyObject* res;
1553
1554 res = Py_FindMethod(_match_methods, (PyObject*) self, name);
1555 if (res)
1556 return res;
1557
1558 PyErr_Clear();
1559
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001560 /* attributes */
Guido van Rossumb700df92000-03-31 14:59:30 +00001561 if (!strcmp(name, "string")) {
1562 Py_INCREF(self->string);
1563 return self->string;
1564 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001565
Guido van Rossumb700df92000-03-31 14:59:30 +00001566 if (!strcmp(name, "re")) {
1567 Py_INCREF(self->pattern);
1568 return (PyObject*) self->pattern;
1569 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001570
Guido van Rossumb700df92000-03-31 14:59:30 +00001571 if (!strcmp(name, "pos"))
1572 return Py_BuildValue("i", 0); /* FIXME */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001573
Guido van Rossumb700df92000-03-31 14:59:30 +00001574 if (!strcmp(name, "endpos"))
1575 return Py_BuildValue("i", 0); /* FIXME */
1576
1577 PyErr_SetString(PyExc_AttributeError, name);
1578 return NULL;
1579}
1580
1581/* FIXME: implement setattr("string", None) as a special case (to
1582 detach the associated string, if any */
1583
1584statichere PyTypeObject Match_Type = {
1585 PyObject_HEAD_INIT(NULL)
1586 0, "Match",
1587 sizeof(MatchObject), /* size of basic object */
1588 sizeof(int), /* space for group item */
1589 (destructor)_match_dealloc, /*tp_dealloc*/
1590 0, /*tp_print*/
1591 (getattrfunc)_match_getattr, /*tp_getattr*/
1592};
1593
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001594/* -------------------------------------------------------------------- */
1595/* cursor methods (experimental) */
1596
1597static void
1598_cursor_dealloc(CursorObject* self)
1599{
1600 _stack_free(&self->state);
1601 Py_DECREF(self->string);
1602 Py_DECREF(self->pattern);
1603 PyMem_DEL(self);
1604}
1605
1606static PyObject*
1607_cursor_match(CursorObject* self, PyObject* args)
1608{
1609 SRE_STATE* state = &self->state;
1610 PyObject* match;
1611 int status;
1612
1613 state->ptr = state->start;
1614
1615 if (state->charsize == 1) {
1616 status = sre_match(state, PatternObject_GetCode(self->pattern));
1617 } else {
1618 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
1619 }
1620
1621 match = _pattern_new_match((PatternObject*) self->pattern,
1622 state, self->string, status);
1623
1624 if (status >= 0)
1625 state->start = state->ptr;
1626 else
1627 state->start = (char*) state->ptr + state->charsize;
1628
1629 return match;
1630}
1631
1632
1633static PyObject*
1634_cursor_search(CursorObject* self, PyObject* args)
1635{
1636 SRE_STATE* state = &self->state;
1637 PyObject* match;
1638 int status;
1639
1640 state->ptr = state->start;
1641
1642 if (state->charsize == 1) {
1643 status = sre_search(state, PatternObject_GetCode(self->pattern));
1644 } else {
1645 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
1646 }
1647
1648 match = _pattern_new_match((PatternObject*) self->pattern,
1649 state, self->string, status);
1650
1651 if (status >= 0)
1652 state->start = state->ptr;
1653
1654 return match;
1655}
1656
1657static PyMethodDef _cursor_methods[] = {
1658 {"match", (PyCFunction) _cursor_match, 0},
1659 {"search", (PyCFunction) _cursor_search, 0},
1660 {NULL, NULL}
1661};
1662
1663static PyObject*
1664_cursor_getattr(CursorObject* self, char* name)
1665{
1666 PyObject* res;
1667
1668 res = Py_FindMethod(_cursor_methods, (PyObject*) self, name);
1669 if (res)
1670 return res;
1671
1672 PyErr_Clear();
1673
1674 /* attributes */
1675 if (!strcmp(name, "pattern")) {
1676 Py_INCREF(self->pattern);
1677 return self->pattern;
1678 }
1679
1680 PyErr_SetString(PyExc_AttributeError, name);
1681 return NULL;
1682}
1683
1684statichere PyTypeObject Cursor_Type = {
1685 PyObject_HEAD_INIT(NULL)
1686 0, "Cursor",
1687 sizeof(CursorObject), /* size of basic object */
1688 0,
1689 (destructor)_cursor_dealloc, /*tp_dealloc*/
1690 0, /*tp_print*/
1691 (getattrfunc)_cursor_getattr, /*tp_getattr*/
1692};
1693
Guido van Rossumb700df92000-03-31 14:59:30 +00001694static PyMethodDef _functions[] = {
1695 {"compile", _compile, 1},
1696 {"getcodesize", _getcodesize, 1},
1697 {NULL, NULL}
1698};
1699
1700void
1701#ifdef WIN32
1702__declspec(dllexport)
1703#endif
1704init_sre()
1705{
1706 /* Patch object types */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001707 Pattern_Type.ob_type = Match_Type.ob_type =
1708 Cursor_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001709
1710 Py_InitModule("_sre", _functions);
1711}
1712
1713#endif