/* -*- Mode: C; tab-width: 4 -*-
 *
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 99-10-24 fl created (based on existing template matcher code)
 * 99-11-13 fl added categories, branching, and more (0.2)
 * 99-11-16 fl some tweaks to compile on non-Windows platforms
 * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
 * 00-02-28 fl tons of changes (not all to the better ;-) (0.4)
 * 00-03-06 fl first alpha, sort of (0.5)
 * 00-03-14 fl removed most compatibility stuff (0.6)
 * 00-05-10 fl towards third alpha (0.8.2)
 * 00-05-13 fl added experimental scanner stuff (0.8.3)
 * 00-05-27 fl final bug hunt (0.8.4)
 * 00-06-21 fl less bugs, more taste (0.8.5)
 * 00-06-25 fl major changes to better deal with nested repeats (0.9)
 * 00-06-28 fl fixed findall (0.9.1)
 * 00-06-29 fl fixed split, added more scanner features (0.9.2)
 * 00-06-30 fl added fast search optimization (0.9.3)
 * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
 * 00-07-02 fl added charset optimizations, etc (0.9.5)
 *
 * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI. Hewlett-Packard provided funding for 2.0 integration and
 * other compatibility work.
 */
32
33#ifndef SRE_RECURSIVE
34
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +000035char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000036
37#include "Python.h"
38
39#include "sre.h"
40
Guido van Rossumb700df92000-03-31 14:59:30 +000041#if defined(HAVE_LIMITS_H)
42#include <limits.h>
43#else
44#define INT_MAX 2147483647
45#endif
46
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000047#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000048
Fredrik Lundh436c3d582000-06-29 08:58:44 +000049/* name of this module, minus the leading underscore */
50#define MODULE "sre"
51
Guido van Rossumb700df92000-03-31 14:59:30 +000052/* defining this one enables tracing */
53#undef DEBUG
54
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055#if PY_VERSION_HEX >= 0x01060000
Fredrik Lundh22d25462000-07-01 17:50:59 +000056/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000057#define HAVE_UNICODE
58#endif
59
Fredrik Lundh29c08be2000-06-29 23:33:12 +000060/* optional features */
61#define USE_FAST_SEARCH
62
Fredrik Lundh80946112000-06-29 18:03:25 +000063#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000064#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
65/* fastest possible local call under MSVC */
66#define LOCAL(type) static __inline type __fastcall
67#else
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define LOCAL(type) static inline type
Guido van Rossumb700df92000-03-31 14:59:30 +000069#endif
70
71/* error codes */
72#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
73#define SRE_ERROR_MEMORY -9 /* out of memory */
74
Fredrik Lundh436c3d582000-06-29 08:58:44 +000075#if defined(DEBUG)
Guido van Rossumb700df92000-03-31 14:59:30 +000076#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000077#else
78#define TRACE(v)
79#endif
80
/* offset of ptr from state->beginning, counted in SRE_CHAR units.  in
   the code visible here this is only passed to TRACE, where it is
   printed with %d, so the pointer difference (a ptrdiff_t) is converted
   to int explicitly to keep the format and the argument type in sync */
#define PTR(ptr) ((int) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning))
Guido van Rossumb700df92000-03-31 14:59:30 +000082
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000083/* -------------------------------------------------------------------- */
84/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000085
Fredrik Lundh436c3d582000-06-29 08:58:44 +000086/* default character predicates (run sre_chars.py to regenerate tables) */
87
/* property bits stored in sre_char_info (one bit per character class) */
#define SRE_DIGIT_MASK 0x01
#define SRE_SPACE_MASK 0x02
#define SRE_LINEBREAK_MASK 0x04
#define SRE_ALNUM_MASK 0x08
#define SRE_WORD_MASK 0x10

/* property bits for the 128 7-bit character codes, sixteen codes per
   row.  2 = space, 6 = space + linebreak, 16 = word, 24 = alnum + word,
   25 = digit + alnum + word */
static char sre_char_info[128] = {
    /*   0..15  */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /*  16..31  */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  32..47  */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  48..63  */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /*  64..79  */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80..95  */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /*  96..111 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112..127 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};
101
/* lower-case mapping for the 128 7-bit character codes, sixteen codes
   per row: 'A'..'Z' (65..90) map to 'a'..'z' (97..122), every other
   code maps to itself */
static char sre_char_lower[128] = {
    /*   0..15  */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /*  16..31  */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /*  32..47  */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /*  48..63  */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /*  64..79  */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    /*  80..95  */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95,
    /*  96..111 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    /* 112..127 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127
};

/* default lower-casing: codes below 128 are looked up in
   sre_char_lower, all other codes are returned unchanged */
static unsigned int sre_lower(unsigned int ch)
{
    if (ch >= 128)
        return ch;
    return sre_char_lower[ch];
}
116
/* each predicate yields the matching sre_char_info mask bit (non-zero)
   when the character has the property, and 0 otherwise; codes of 128
   and above never have any of these properties */
#define SRE_IS_DIGIT(ch)\
    ((ch) >= 128 ? 0 : (sre_char_info[(ch)] & SRE_DIGIT_MASK))
#define SRE_IS_SPACE(ch)\
    ((ch) >= 128 ? 0 : (sre_char_info[(ch)] & SRE_SPACE_MASK))
#define SRE_IS_LINEBREAK(ch)\
    ((ch) >= 128 ? 0 : (sre_char_info[(ch)] & SRE_LINEBREAK_MASK))
#define SRE_IS_ALNUM(ch)\
    ((ch) >= 128 ? 0 : (sre_char_info[(ch)] & SRE_ALNUM_MASK))
#define SRE_IS_WORD(ch)\
    ((ch) >= 128 ? 0 : (sre_char_info[(ch)] & SRE_WORD_MASK))
Guido van Rossumb700df92000-03-31 14:59:30 +0000127
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000128/* locale-specific character predicates */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000129
/* locale-aware lower-casing: codes below 256 go through the C library's
   tolower(), all other codes are returned unchanged */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int) tolower((int) ch);
}

/* the <ctype.h> classifiers are only consulted for codes below 256;
   anything larger is never a digit, space or alphanumeric */
#define SRE_LOC_IS_DIGIT(ch) ((ch) >= 256 ? 0 : isdigit((ch)))
#define SRE_LOC_IS_SPACE(ch) ((ch) >= 256 ? 0 : isspace((ch)))
#define SRE_LOC_IS_LINEBREAK(ch) ('\n' == (ch))
#define SRE_LOC_IS_ALNUM(ch) ((ch) >= 256 ? 0 : isalnum((ch)))
#define SRE_LOC_IS_WORD(ch) ((ch) == '_' || SRE_LOC_IS_ALNUM((ch)))
139
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000140/* unicode-specific character predicates */
141
#if defined(HAVE_UNICODE)
/* unicode lower-casing: delegates to Py_UNICODE_TOLOWER */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}

#if !defined(Py_UNICODE_ISALNUM)
/* FIXME: workaround. should be fixed in unicodectype.c */
#define Py_UNICODE_ISALNUM(ch)\
    (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISUPPER(ch) ||\
     Py_UNICODE_ISTITLE(ch) || Py_UNICODE_ISDIGIT(ch))
#endif

#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
/* a word character is an alphanumeric or an underscore.  this must be
   built on the unicode alnum test: the 7-bit SRE_IS_ALNUM table lookup
   rejects every code >= 128, which would make all non-ASCII letters and
   digits non-word characters in unicode mode */
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
#endif
161
Guido van Rossumb700df92000-03-31 14:59:30 +0000162LOCAL(int)
163sre_category(SRE_CODE category, unsigned int ch)
164{
165 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000166
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000167 case SRE_CATEGORY_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000168 return SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000169 case SRE_CATEGORY_NOT_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000170 return !SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000171 case SRE_CATEGORY_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000172 return SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000173 case SRE_CATEGORY_NOT_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000174 return !SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000175 case SRE_CATEGORY_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000176 return SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000177 case SRE_CATEGORY_NOT_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000178 return !SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000179 case SRE_CATEGORY_LINEBREAK:
180 return SRE_IS_LINEBREAK(ch);
181 case SRE_CATEGORY_NOT_LINEBREAK:
182 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000183
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000184 case SRE_CATEGORY_LOC_WORD:
185 return SRE_LOC_IS_WORD(ch);
186 case SRE_CATEGORY_LOC_NOT_WORD:
187 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000188
189#if defined(HAVE_UNICODE)
190 case SRE_CATEGORY_UNI_DIGIT:
191 return SRE_UNI_IS_DIGIT(ch);
192 case SRE_CATEGORY_UNI_NOT_DIGIT:
193 return !SRE_UNI_IS_DIGIT(ch);
194 case SRE_CATEGORY_UNI_SPACE:
195 return SRE_UNI_IS_SPACE(ch);
196 case SRE_CATEGORY_UNI_NOT_SPACE:
197 return !SRE_UNI_IS_SPACE(ch);
198 case SRE_CATEGORY_UNI_WORD:
199 return SRE_UNI_IS_WORD(ch);
200 case SRE_CATEGORY_UNI_NOT_WORD:
201 return !SRE_UNI_IS_WORD(ch);
202 case SRE_CATEGORY_UNI_LINEBREAK:
203 return SRE_UNI_IS_LINEBREAK(ch);
204 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
205 return !SRE_UNI_IS_LINEBREAK(ch);
206#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000207 }
208 return 0;
209}
210
211/* helpers */
212
213LOCAL(int)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000214stack_free(SRE_STATE* state)
Guido van Rossumb700df92000-03-31 14:59:30 +0000215{
216 if (state->stack) {
217 TRACE(("release stack\n"));
218 free(state->stack);
219 state->stack = NULL;
220 }
221 state->stacksize = 0;
222 return 0;
223}
224
225static int /* shouldn't be LOCAL */
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000226stack_extend(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000227{
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000228 SRE_STACK* stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000229 int stacksize;
230
231 /* grow the stack to a suitable size; we need at least lo entries,
232 at most hi entries. if for some reason hi is lower than lo, lo
233 wins */
234
235 stacksize = state->stacksize;
236
237 if (stacksize == 0) {
238 /* create new stack */
239 stacksize = 512;
240 if (stacksize < lo)
241 stacksize = lo;
242 else if (stacksize > hi)
243 stacksize = hi;
244 TRACE(("allocate stack %d\n", stacksize));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000245 stack = malloc(sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000246 } else {
247 /* grow the stack (typically by a factor of two) */
248 while (stacksize < lo)
249 stacksize = 2 * stacksize;
250 /* FIXME: <fl> could trim size if it's larger than lo, and
251 much larger than hi */
252 TRACE(("grow stack to %d\n", stacksize));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000253 stack = realloc(state->stack, sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000254 }
255
256 if (!stack) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000257 stack_free(state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000258 return SRE_ERROR_MEMORY;
259 }
260
261 state->stack = stack;
262 state->stacksize = stacksize;
263
264 return 0;
265}
266
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000267/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000268
269#define SRE_CHAR unsigned char
270#define SRE_AT sre_at
271#define SRE_MEMBER sre_member
272#define SRE_MATCH sre_match
273#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000274
275#if defined(HAVE_UNICODE)
276
Guido van Rossumb700df92000-03-31 14:59:30 +0000277#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000278#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000279#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000280
Guido van Rossumb700df92000-03-31 14:59:30 +0000281#undef SRE_SEARCH
282#undef SRE_MATCH
283#undef SRE_MEMBER
284#undef SRE_AT
285#undef SRE_CHAR
286
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000287/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000288
289#define SRE_CHAR Py_UNICODE
290#define SRE_AT sre_uat
291#define SRE_MEMBER sre_umember
292#define SRE_MATCH sre_umatch
293#define SRE_SEARCH sre_usearch
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000294#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000295
296#endif /* SRE_RECURSIVE */
297
298/* -------------------------------------------------------------------- */
299/* String matching engine */
300
301/* the following section is compiled twice, with different character
302 settings */
303
304LOCAL(int)
305SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
306{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000307 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000308
309 int this, that;
310
311 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000312
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000313 case SRE_AT_BEGINNING:
Guido van Rossum29530882000-04-10 17:06:55 +0000314 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000315
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000316 case SRE_AT_BEGINNING_LINE:
317 return ((void*) ptr == state->beginning ||
318 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000319
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000320 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000321 return (((void*) (ptr+1) == state->end &&
322 SRE_IS_LINEBREAK((int) ptr[0])) ||
323 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000324
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000325 case SRE_AT_END_LINE:
326 return ((void*) ptr == state->end ||
327 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000328
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000329 case SRE_AT_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000330 if (state->beginning == state->end)
331 return 0;
332 that = ((void*) ptr > state->beginning) ?
333 SRE_IS_WORD((int) ptr[-1]) : 0;
334 this = ((void*) ptr < state->end) ?
335 SRE_IS_WORD((int) ptr[0]) : 0;
336 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000337
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000338 case SRE_AT_NON_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000339 if (state->beginning == state->end)
340 return 0;
341 that = ((void*) ptr > state->beginning) ?
342 SRE_IS_WORD((int) ptr[-1]) : 0;
343 this = ((void*) ptr < state->end) ?
344 SRE_IS_WORD((int) ptr[0]) : 0;
345 return this == that;
346 }
347
348 return 0;
349}
350
351LOCAL(int)
Fredrik Lundh0640e112000-06-30 13:55:15 +0000352SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000353{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000354 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000355
356 int ok = 1;
357
358 for (;;) {
359 switch (*set++) {
360
361 case SRE_OP_NEGATE:
362 ok = !ok;
363 break;
364
365 case SRE_OP_FAILURE:
366 return !ok;
367
368 case SRE_OP_LITERAL:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000369 /* args: <literal> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000370 if (ch == set[0])
Guido van Rossumb700df92000-03-31 14:59:30 +0000371 return ok;
372 set++;
373 break;
374
375 case SRE_OP_RANGE:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000376 /* args: <lower> <upper> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000377 if (set[0] <= ch && ch <= set[1])
Guido van Rossumb700df92000-03-31 14:59:30 +0000378 return ok;
379 set += 2;
380 break;
381
Fredrik Lundh3562f112000-07-02 12:00:07 +0000382 case SRE_OP_CHARSET:
383 /* args: <bitmap> (16 bits per code word) */
384 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
385 return ok;
386 set += 16;
387 break;
388
Guido van Rossumb700df92000-03-31 14:59:30 +0000389 case SRE_OP_CATEGORY:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000390 /* args: <category> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000391 if (sre_category(set[0], (int) ch))
392 return ok;
393 set += 1;
394 break;
395
396 default:
Fredrik Lundh80946112000-06-29 18:03:25 +0000397 /* internal error -- there's not much we can do about it
398 here, so let's just pretend it didn't match... */
Guido van Rossumb700df92000-03-31 14:59:30 +0000399 return 0;
400 }
401 }
402}
403
404LOCAL(int)
405SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
406{
407 /* check if string matches the given pattern. returns -1 for
408 error, 0 for failure, and 1 for success */
409
410 SRE_CHAR* end = state->end;
411 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000412 int stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000413 int stackbase;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000414 int lastmark;
Guido van Rossumb700df92000-03-31 14:59:30 +0000415 int i, count;
416
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000417 /* FIXME: this is a hack! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000418 void* mark_copy[SRE_MARK_SIZE];
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000419 void* mark = NULL;
420
421 TRACE(("%8d: enter\n", PTR(ptr)));
422
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000423 if (pattern[0] == SRE_OP_INFO) {
424 /* optimization info block */
425 /* args: <1=skip> <2=flags> <3=min> ... */
426 if (pattern[3] && (end - ptr) < pattern[3]) {
427 TRACE(("reject (got %d chars, need %d)\n",
428 (end - ptr), pattern[3]));
429 return 0;
430 }
431 pattern += pattern[1] + 1;
432 }
433
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000434 stackbase = stack = state->stackbase;
435 lastmark = state->lastmark;
436
437 retry:
Guido van Rossumb700df92000-03-31 14:59:30 +0000438
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000439 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000440
441 switch (*pattern++) {
442
443 case SRE_OP_FAILURE:
444 /* immediate failure */
445 TRACE(("%8d: failure\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000446 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000447
448 case SRE_OP_SUCCESS:
449 /* end of pattern */
450 TRACE(("%8d: success\n", PTR(ptr)));
451 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000452 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000453
454 case SRE_OP_AT:
455 /* match at given position */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000456 /* args: <at> */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000457 TRACE(("%8d: position %d\n", PTR(ptr), *pattern));
Guido van Rossumb700df92000-03-31 14:59:30 +0000458 if (!SRE_AT(state, ptr, *pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000459 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000460 pattern++;
461 break;
462
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000463 case SRE_OP_CATEGORY:
464 /* match at given category */
465 /* args: <category> */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000466 TRACE(("%8d: category %d [category %d]\n", PTR(ptr),
467 *ptr, *pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000468 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
469 goto failure;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000470 TRACE(("%8d: category ok\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000471 pattern++;
472 ptr++;
473 break;
474
Guido van Rossumb700df92000-03-31 14:59:30 +0000475 case SRE_OP_LITERAL:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000476 /* match literal string */
Guido van Rossumb700df92000-03-31 14:59:30 +0000477 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000478 TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));
479 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000480 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000481 pattern++;
482 ptr++;
483 break;
484
485 case SRE_OP_NOT_LITERAL:
486 /* match anything that is not literal character */
487 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000488 TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));
489 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000490 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000491 pattern++;
492 ptr++;
493 break;
494
495 case SRE_OP_ANY:
496 /* match anything */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000497 TRACE(("%8d: anything\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000498 if (ptr >= end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000499 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000500 ptr++;
501 break;
502
503 case SRE_OP_IN:
504 /* match set member (or non_member) */
505 /* args: <skip> <set> */
506 TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
507 if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000508 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000509 pattern += pattern[0];
510 ptr++;
511 break;
512
513 case SRE_OP_GROUP:
514 /* match backreference */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000515 TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000516 i = pattern[0];
517 {
Guido van Rossumb700df92000-03-31 14:59:30 +0000518 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
519 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
520 if (!p || !e || e < p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000521 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000522 while (p < e) {
523 if (ptr >= end || *ptr != *p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000524 goto failure;
525 p++; ptr++;
526 }
527 }
528 pattern++;
529 break;
530
531 case SRE_OP_GROUP_IGNORE:
532 /* match backreference */
533 TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
534 i = pattern[0];
535 {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000536 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
537 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000538 if (!p || !e || e < p)
539 goto failure;
540 while (p < e) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000541 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000542 state->lower(*ptr) != state->lower(*p))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000543 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000544 p++; ptr++;
545 }
546 }
547 pattern++;
548 break;
549
550 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000551 TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000552 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000553 state->lower(*ptr) != state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000554 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000555 pattern++;
556 ptr++;
557 break;
558
559 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000560 TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000561 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000562 state->lower(*ptr) == state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000563 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000564 pattern++;
565 ptr++;
566 break;
567
568 case SRE_OP_IN_IGNORE:
569 TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
570 if (ptr >= end
Fredrik Lundh0640e112000-06-30 13:55:15 +0000571 || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr)))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000572 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000573 pattern += pattern[0];
574 ptr++;
575 break;
576
577 case SRE_OP_MARK:
578 /* set mark */
579 /* args: <mark> */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000580 TRACE(("%8d: set mark %d\n", PTR(ptr), pattern[0]));
581 if (state->lastmark < pattern[0])
582 state->lastmark = pattern[0];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000583 if (!mark) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000584 mark = mark_copy;
585 memcpy(mark, state->mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000586 }
587 state->mark[pattern[0]] = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000588 pattern++;
589 break;
590
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000591 case SRE_OP_INDEX:
592 /* set index */
593 /* args: <index> */
594 TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
595 state->index = pattern[0];
596 pattern++;
597 break;
598
Guido van Rossumb700df92000-03-31 14:59:30 +0000599 case SRE_OP_JUMP:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000600 case SRE_OP_INFO:
Guido van Rossumb700df92000-03-31 14:59:30 +0000601 /* jump forward */
602 /* args: <skip> */
603 TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
604 pattern += pattern[0];
605 break;
606
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000607 case SRE_OP_ASSERT:
608 /* assert subpattern */
Guido van Rossumb700df92000-03-31 14:59:30 +0000609 /* args: <skip> <pattern> */
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000610 TRACE(("%8d: assert subpattern\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000611 state->ptr = ptr;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000612 i = SRE_MATCH(state, pattern + 1);
613 if (i < 0)
614 return i;
615 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000616 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000617 pattern += pattern[0];
Guido van Rossumb700df92000-03-31 14:59:30 +0000618 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000619
620 case SRE_OP_ASSERT_NOT:
621 /* assert not subpattern */
622 /* args: <skip> <pattern> */
623 TRACE(("%8d: assert not subpattern\n", PTR(ptr)));
624 state->ptr = ptr;
625 i = SRE_MATCH(state, pattern + 1);
626 if (i < 0)
627 return i;
628 if (i)
629 goto failure;
630 pattern += pattern[0];
631 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000632
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000633#if 0
Guido van Rossumb700df92000-03-31 14:59:30 +0000634 case SRE_OP_MAX_REPEAT_ONE:
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000635 /* match repeated sequence (maximizing regexp) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000636
637 /* this operator only works if the repeated item is
638 exactly one character wide, and we're not already
639 collecting backtracking points. for other cases,
640 use the MAX_REPEAT operator instead */
641
Guido van Rossumb700df92000-03-31 14:59:30 +0000642 /* args: <skip> <min> <max> <step> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000643 TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
644 pattern[1], pattern[2]));
645
646 count = 0;
647
648 if (pattern[3] == SRE_OP_ANY) {
649 /* repeated wildcard. skip to the end of the target
650 string, and backtrack from there */
651 /* FIXME: must look for line endings */
652 if (ptr + pattern[1] > end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000653 goto failure; /* cannot match */
Guido van Rossumb700df92000-03-31 14:59:30 +0000654 count = pattern[2];
655 if (count > end - ptr)
656 count = end - ptr;
657 ptr += count;
658
659 } else if (pattern[3] == SRE_OP_LITERAL) {
660 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000661 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000662 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000663 if (ptr >= end || (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000664 break;
665 ptr++;
666 count++;
667 }
668
669 } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
670 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000671 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000672 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000673 if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000674 break;
675 ptr++;
676 count++;
677 }
678
679 } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
680 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000681 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000682 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000683 if (ptr >= end || (SRE_CODE) ptr[0] == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000684 break;
685 ptr++;
686 count++;
687 }
688
689 } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
690 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000691 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000692 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000693 if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000694 break;
695 ptr++;
696 count++;
697 }
698
699 } else if (pattern[3] == SRE_OP_IN) {
700 /* repeated set */
701 while (count < (int) pattern[2]) {
702 if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
703 break;
704 ptr++;
705 count++;
706 }
707
708 } else {
709 /* repeated single character pattern */
710 state->ptr = ptr;
711 while (count < (int) pattern[2]) {
712 i = SRE_MATCH(state, pattern + 3);
713 if (i < 0)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000714 return i;
715 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000716 break;
717 count++;
718 }
719 state->ptr = ptr;
720 ptr += count;
721 }
722
723 /* when we arrive here, count contains the number of
724 matches, and ptr points to the tail of the target
725 string. check if the rest of the pattern matches, and
726 backtrack if not. */
727
Guido van Rossumb700df92000-03-31 14:59:30 +0000728 TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
729
730 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000731 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000732
733 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
734 /* tail is empty. we're finished */
735 TRACE(("%8d: tail is empty\n", PTR(ptr)));
736 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000737 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000738
739 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000740 /* tail starts with a literal. skip positions where
741 the rest of the pattern cannot possibly match */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000742 SRE_CODE chr = pattern[pattern[0]+1];
Guido van Rossumb700df92000-03-31 14:59:30 +0000743 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
744 for (;;) {
745 TRACE(("%8d: scan for tail match\n", PTR(ptr)));
746 while (count >= (int) pattern[1] &&
747 (ptr >= end || *ptr != chr)) {
748 ptr--;
749 count--;
750 }
751 TRACE(("%8d: check tail\n", PTR(ptr)));
752 if (count < (int) pattern[1])
753 break;
754 state->ptr = ptr;
755 i = SRE_MATCH(state, pattern + pattern[0]);
756 if (i > 0) {
757 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000758 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000759 }
760 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
761 ptr--;
762 count--;
763 }
764
765 } else {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000766 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +0000767 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
768 while (count >= (int) pattern[1]) {
769 state->ptr = ptr;
770 i = SRE_MATCH(state, pattern + pattern[0]);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000771 if (i < 0)
772 return i;
773 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000774 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000775 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000776 }
777 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
778 ptr--;
779 count--;
780 }
781 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000782 goto failure;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000783#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000784
785 case SRE_OP_MAX_REPEAT:
786 /* match repeated sequence (maximizing regexp). repeated
787 group should end with a MAX_UNTIL code */
788
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000789 /* args: <skip> <min> <max> <item> */
790
791 TRACE(("%8d: max repeat (%d %d)\n", PTR(ptr),
Guido van Rossumb700df92000-03-31 14:59:30 +0000792 pattern[1], pattern[2]));
793
794 count = 0;
795 state->ptr = ptr;
796
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000797 /* match minimum number of items */
798 while (count < (int) pattern[1]) {
799 i = SRE_MATCH(state, pattern + 3);
800 if (i < 0)
801 return i;
802 if (!i)
803 goto failure;
804 if (state->ptr == ptr) {
805 /* if the match was successful but empty, set the
806 count to max and terminate the scanning loop */
807 count = (int) pattern[2];
808 break;
809 }
810 count++;
811 ptr = state->ptr;
812 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000813
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000814 TRACE(("%8d: found %d leading items\n", PTR(ptr), count));
Guido van Rossumb700df92000-03-31 14:59:30 +0000815
816 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000817 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000818
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000819 /* match maximum number of items, pushing alternate end
820 points to the stack */
Guido van Rossumb700df92000-03-31 14:59:30 +0000821
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000822 while (pattern[2] == 65535 || count < (int) pattern[2]) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000823 state->stackbase = stack;
824 i = SRE_MATCH(state, pattern + 3);
825 state->stackbase = stackbase; /* rewind */
826 if (i < 0)
827 return i;
828 if (!i)
829 break;
830 if (state->ptr == ptr) {
831 count = (int) pattern[2];
832 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000833 }
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000834 /* this position was valid; add it to the retry
835 stack */
836 if (stack >= state->stacksize) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000837 i = stack_extend(state, stack + 1,
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000838 stackbase + pattern[2]);
839 if (i < 0)
840 return i; /* out of memory */
841 }
842 TRACE(("%8d: stack[%d] = %d\n", PTR(ptr), stack, PTR(ptr)));
843 state->stack[stack].ptr = ptr;
844 state->stack[stack].pattern = pattern + pattern[0];
845 stack++;
846 /* move forward */
847 ptr = state->ptr;
848 count++;
Guido van Rossumb700df92000-03-31 14:59:30 +0000849 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000850
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000851 /* when we get here, count is the number of successful
852 matches, and ptr points to the tail. */
Guido van Rossumb700df92000-03-31 14:59:30 +0000853
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000854 TRACE(("%8d: skip +%d\n", PTR(ptr), pattern[0]));
855
856 pattern += pattern[0];
857 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000858
859 case SRE_OP_MIN_REPEAT:
860 /* match repeated sequence (minimizing regexp) */
861 TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
862 pattern[1], pattern[2]));
863 count = 0;
864 state->ptr = ptr;
865 /* match minimum number of items */
866 while (count < (int) pattern[1]) {
867 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000868 if (i < 0)
869 return i;
870 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000871 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000872 count++;
873 }
874 /* move forward until the tail matches. */
875 while (count <= (int) pattern[2]) {
876 ptr = state->ptr;
877 i = SRE_MATCH(state, pattern + pattern[0]);
878 if (i > 0) {
879 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000880 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000881 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000882 state->ptr = ptr; /* backtrack */
883 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000884 if (i < 0)
885 return i;
886 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000887 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000888 count++;
889 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000890 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000891
Guido van Rossumb700df92000-03-31 14:59:30 +0000892 case SRE_OP_BRANCH:
893 /* match one of several subpatterns */
894 /* format: <branch> <size> <head> ... <null> <tail> */
895 TRACE(("%8d: branch\n", PTR(ptr)));
896 while (*pattern) {
897 if (pattern[1] != SRE_OP_LITERAL ||
Fredrik Lundh0640e112000-06-30 13:55:15 +0000898 (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000899 TRACE(("%8d: branch check\n", PTR(ptr)));
900 state->ptr = ptr;
901 i = SRE_MATCH(state, pattern + 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000902 if (i < 0)
903 return i;
904 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000905 TRACE(("%8d: branch succeeded\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000906 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000907 }
908 }
909 pattern += *pattern;
910 }
911 TRACE(("%8d: branch failed\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000912 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000913
914 case SRE_OP_REPEAT:
915 /* TEMPLATE: match repeated sequence (no backtracking) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000916 /* args: <skip> <min> <max> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000917 TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
918 count = 0;
919 state->ptr = ptr;
920 while (count < (int) pattern[2]) {
921 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000922 if (i < 0)
923 return i;
924 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000925 break;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000926 if (state->ptr == ptr) {
927 count = (int) pattern[2];
928 break;
929 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000930 count++;
931 }
932 if (count <= (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000933 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000934 TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
935 pattern += pattern[0];
936 ptr = state->ptr;
937 break;
938
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000939 default:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000940 TRACE(("%8d: unknown opcode %d\n", PTR(ptr), pattern[-1]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000941 return SRE_ERROR_ILLEGAL;
942 }
943 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000944
945 failure:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000946 if (stack-- > stackbase) {
947 ptr = state->stack[stack].ptr;
948 pattern = state->stack[stack].pattern;
949 TRACE(("%8d: retry (%d)\n", PTR(ptr), stack));
950 goto retry;
951 }
952 TRACE(("%8d: leave (failure)\n", PTR(ptr)));
953 state->stackbase = stackbase;
954 state->lastmark = lastmark;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000955 if (mark)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000956 memcpy(state->mark, mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000957 return 0;
958
959 success:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000960 TRACE(("%8d: leave (success)\n", PTR(ptr)));
961 state->stackbase = stackbase;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000962 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000963}
964
965LOCAL(int)
966SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
967{
968 SRE_CHAR* ptr = state->start;
969 SRE_CHAR* end = state->end;
970 int status = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +0000971 int prefix_len;
972 SRE_CODE* prefix = NULL;
973 SRE_CODE* charset = NULL;
974 SRE_CODE* overlap = NULL;
975 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000976
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000977 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000978 /* optimization info block */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000979 /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
980
981 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000982
983 if (pattern[3] > 0) {
984 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +0000985 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000986 end -= pattern[3]-1;
987 if (end <= ptr)
988 end = ptr+1;
989 }
990
Fredrik Lundh3562f112000-07-02 12:00:07 +0000991 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000992 /* pattern starts with a known prefix */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000993 prefix_len = pattern[5];
994 prefix = pattern + 6;
995 overlap = prefix + prefix_len - 1;
996 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000997 /* pattern starts with a character from a known set */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000998 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000999
1000 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001001 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001002
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001003#if defined(USE_FAST_SEARCH)
Fredrik Lundh3562f112000-07-02 12:00:07 +00001004 if (prefix && overlap && prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001005 /* pattern starts with a known prefix. use the overlap
1006 table to skip forward as fast as we possibly can */
1007 int i = 0;
1008 end = state->end;
1009 while (ptr < end) {
1010 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001011 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001012 if (!i)
1013 break;
1014 else
1015 i = overlap[i];
1016 } else {
1017 if (++i == prefix_len) {
1018 /* found a potential match */
1019 TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
1020 state->start = ptr - prefix_len + 1;
1021 state->ptr = ptr + 1;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001022 if (flags & SRE_INFO_LITERAL)
1023 return 1; /* we got all of it */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001024 status = SRE_MATCH(state, pattern + 2*prefix_len);
1025 if (status != 0)
1026 return status;
1027 /* close but no cigar -- try again */
1028 i = overlap[i];
1029 }
1030 break;
1031 }
1032
1033 }
1034 ptr++;
1035 }
1036 return 0;
1037 }
1038#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001039
Fredrik Lundh3562f112000-07-02 12:00:07 +00001040 if (pattern[0] == SRE_OP_LITERAL) {
1041 /* pattern starts with a literal character. this is used
1042 for short prefixes, and if fast search is disabled */
Fredrik Lundh0640e112000-06-30 13:55:15 +00001043 SRE_CODE chr = pattern[1];
Guido van Rossumb700df92000-03-31 14:59:30 +00001044 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001045 while (ptr < end && (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +00001046 ptr++;
1047 if (ptr == end)
1048 return 0;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001049 TRACE(("%8d: === SEARCH === literal\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001050 state->start = ptr;
1051 state->ptr = ++ptr;
1052 status = SRE_MATCH(state, pattern + 2);
1053 if (status != 0)
1054 break;
1055 }
Fredrik Lundh3562f112000-07-02 12:00:07 +00001056 } else if (charset) {
1057 /* pattern starts with a character from a known set */
1058 for (;;) {
1059 while (ptr < end && !SRE_MEMBER(charset, ptr[0]))
1060 ptr++;
1061 if (ptr == end)
1062 return 0;
1063 TRACE(("%8d: === SEARCH === charset\n", PTR(ptr)));
1064 state->start = ptr;
1065 state->ptr = ptr;
1066 status = SRE_MATCH(state, pattern);
1067 if (status != 0)
1068 break;
1069 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001070 } else
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001071 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +00001072 while (ptr <= end) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001073 TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001074 state->start = state->ptr = ptr++;
1075 status = SRE_MATCH(state, pattern);
1076 if (status != 0)
1077 break;
1078 }
1079
1080 return status;
1081}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001082
Guido van Rossumb700df92000-03-31 14:59:30 +00001083
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001084#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001085
1086/* -------------------------------------------------------------------- */
1087/* factories and destructors */
1088
1089/* see sre.h for object declarations */
1090
1091staticforward PyTypeObject Pattern_Type;
1092staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001093staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001094
1095static PyObject *
1096_compile(PyObject* self_, PyObject* args)
1097{
1098 /* "compile" pattern descriptor to pattern object */
1099
1100 PatternObject* self;
1101
1102 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001103 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001104 PyObject* code;
1105 int groups = 0;
1106 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001107 PyObject* indexgroup = NULL;
1108 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001109 &PyString_Type, &code,
Fredrik Lundhc2301732000-07-02 22:25:39 +00001110 &groups, &groupindex, &indexgroup))
Guido van Rossumb700df92000-03-31 14:59:30 +00001111 return NULL;
1112
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001113 self = PyObject_NEW(PatternObject, &Pattern_Type);
Guido van Rossumb700df92000-03-31 14:59:30 +00001114 if (self == NULL)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001115
Guido van Rossumb700df92000-03-31 14:59:30 +00001116 return NULL;
1117
1118 Py_INCREF(pattern);
1119 self->pattern = pattern;
1120
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001121 self->flags = flags;
1122
Guido van Rossumb700df92000-03-31 14:59:30 +00001123 Py_INCREF(code);
1124 self->code = code;
1125
1126 self->groups = groups;
1127
1128 Py_XINCREF(groupindex);
1129 self->groupindex = groupindex;
1130
Fredrik Lundhc2301732000-07-02 22:25:39 +00001131 Py_XINCREF(indexgroup);
1132 self->indexgroup = indexgroup;
1133
Guido van Rossumb700df92000-03-31 14:59:30 +00001134 return (PyObject*) self;
1135}
1136
1137static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001138sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001139{
1140 return Py_BuildValue("i", sizeof(SRE_CODE));
1141}
1142
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001143static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001144sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001145{
1146 int character, flags;
1147 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1148 return NULL;
1149 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001150 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001151#if defined(HAVE_UNICODE)
1152 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001153 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001154#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001155 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001156}
1157
Guido van Rossumb700df92000-03-31 14:59:30 +00001158LOCAL(PyObject*)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001159state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001160{
1161 /* prepare state object */
1162
1163 PyBufferProcs *buffer;
1164 int i, count;
1165 void* ptr;
1166
1167 PyObject* string;
1168 int start = 0;
1169 int end = INT_MAX;
1170 if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
1171 return NULL;
1172
1173 /* get pointer to string buffer */
1174 buffer = string->ob_type->tp_as_buffer;
1175 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1176 buffer->bf_getsegcount(string, NULL) != 1) {
1177 PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
1178 return NULL;
1179 }
1180
1181 /* determine buffer size */
1182 count = buffer->bf_getreadbuffer(string, 0, &ptr);
1183 if (count < 0) {
1184 /* sanity check */
1185 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1186 return NULL;
1187 }
1188
1189 /* determine character size */
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001190#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001191 state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001192#else
1193 state->charsize = 1;
1194#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001195
1196 count /= state->charsize;
1197
1198 /* adjust boundaries */
1199 if (start < 0)
1200 start = 0;
1201 else if (start > count)
1202 start = count;
1203
1204 if (end < 0)
1205 end = 0;
1206 else if (end > count)
1207 end = count;
1208
1209 state->beginning = ptr;
1210
1211 state->start = (void*) ((char*) ptr + start * state->charsize);
1212 state->end = (void*) ((char*) ptr + end * state->charsize);
1213
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001214 state->lastmark = 0;
1215
Guido van Rossumb700df92000-03-31 14:59:30 +00001216 /* FIXME: dynamic! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001217 for (i = 0; i < SRE_MARK_SIZE; i++)
Guido van Rossumb700df92000-03-31 14:59:30 +00001218 state->mark[i] = NULL;
1219
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001220 state->index = -1;
1221
Guido van Rossumb700df92000-03-31 14:59:30 +00001222 state->stack = NULL;
1223 state->stackbase = 0;
1224 state->stacksize = 0;
1225
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001226 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001227 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001228#if defined(HAVE_UNICODE)
1229 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001230 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001231#endif
1232 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001233 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001234
Guido van Rossumb700df92000-03-31 14:59:30 +00001235 return string;
1236}
1237
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001238LOCAL(void)
1239state_fini(SRE_STATE* state)
1240{
1241 stack_free(state);
1242}
1243
1244LOCAL(PyObject*)
1245state_getslice(SRE_STATE* state, int index, PyObject* string)
1246{
1247 index = (index - 1) * 2;
1248
1249 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
1250 Py_INCREF(Py_None);
1251 return Py_None;
1252 }
1253
1254 return PySequence_GetSlice(
1255 string,
1256 ((char*)state->mark[index] - (char*)state->beginning) /
1257 state->charsize,
1258 ((char*)state->mark[index+1] - (char*)state->beginning) /
1259 state->charsize
1260 );
1261}
1262
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001263static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001264pattern_new_match(PatternObject* pattern, SRE_STATE* state,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001265 PyObject* string, int status)
1266{
1267 /* create match object (from state object) */
1268
1269 MatchObject* match;
1270 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001271 char* base;
1272 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001273
1274 if (status > 0) {
1275
1276 /* create match object (with room for extra group marks) */
1277 match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
1278 if (match == NULL)
1279 return NULL;
1280
1281 Py_INCREF(pattern);
1282 match->pattern = pattern;
1283
1284 Py_INCREF(string);
1285 match->string = string;
1286
1287 match->groups = pattern->groups+1;
1288
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001289 base = (char*) state->beginning;
1290 n = state->charsize;
1291
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001292 /* group zero */
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001293 match->mark[0] = ((char*) state->start - base) / n;
1294 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001295
1296 /* fill in the rest of the groups */
1297 for (i = j = 0; i < pattern->groups; i++, j+=2)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001298 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1299 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1300 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001301 } else
1302 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1303
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001304 match->index = state->index;
1305
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001306 return (PyObject*) match;
1307
1308 } else if (status < 0) {
1309
1310 /* internal error */
1311 PyErr_SetString(
1312 PyExc_RuntimeError, "internal error in regular expression engine"
1313 );
1314 return NULL;
1315
1316 }
1317
1318 Py_INCREF(Py_None);
1319 return Py_None;
1320}
1321
1322static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001323pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001324{
1325 /* create search state object */
1326
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001327 ScannerObject* self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001328 PyObject* string;
1329
1330 /* create match object (with room for extra group marks) */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001331 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001332 if (self == NULL)
1333 return NULL;
1334
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001335 string = state_init(&self->state, pattern, args);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001336 if (!string) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001337 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001338 return NULL;
1339 }
1340
1341 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001342 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001343
1344 Py_INCREF(string);
1345 self->string = string;
1346
1347 return (PyObject*) self;
1348}
1349
Guido van Rossumb700df92000-03-31 14:59:30 +00001350static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001351pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001352{
1353 Py_XDECREF(self->code);
1354 Py_XDECREF(self->pattern);
1355 Py_XDECREF(self->groupindex);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001356 PyMem_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001357}
1358
1359static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001360pattern_match(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001361{
1362 SRE_STATE state;
1363 PyObject* string;
1364 int status;
1365
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001366 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001367 if (!string)
1368 return NULL;
1369
1370 state.ptr = state.start;
1371
1372 if (state.charsize == 1) {
1373 status = sre_match(&state, PatternObject_GetCode(self));
1374 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001375#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001376 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001377#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001378 }
1379
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001380 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001381
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001382 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001383}
1384
1385static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001386pattern_search(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001387{
1388 SRE_STATE state;
1389 PyObject* string;
1390 int status;
1391
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001392 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001393 if (!string)
1394 return NULL;
1395
1396 if (state.charsize == 1) {
1397 status = sre_search(&state, PatternObject_GetCode(self));
1398 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001399#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001400 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001401#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001402 }
1403
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001404 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001405
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001406 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001407}
1408
1409static PyObject*
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001410call(char* function, PyObject* args)
1411{
1412 PyObject* name;
1413 PyObject* module;
1414 PyObject* func;
1415 PyObject* result;
1416
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001417 name = PyString_FromString(MODULE);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001418 if (!name)
1419 return NULL;
1420 module = PyImport_Import(name);
1421 Py_DECREF(name);
1422 if (!module)
1423 return NULL;
1424 func = PyObject_GetAttrString(module, function);
1425 Py_DECREF(module);
1426 if (!func)
1427 return NULL;
1428 result = PyObject_CallObject(func, args);
1429 Py_DECREF(func);
1430 Py_DECREF(args);
1431 return result;
1432}
1433
1434static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001435pattern_sub(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001436{
1437 PyObject* template;
1438 PyObject* string;
1439 PyObject* count;
1440 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1441 return NULL;
1442
1443 /* delegate to Python code */
1444 return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
1445}
1446
1447static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001448pattern_subn(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001449{
1450 PyObject* template;
1451 PyObject* string;
1452 PyObject* count;
1453 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1454 return NULL;
1455
1456 /* delegate to Python code */
1457 return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
1458}
1459
1460static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001461pattern_split(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001462{
1463 PyObject* string;
1464 PyObject* maxsplit;
1465 if (!PyArg_ParseTuple(args, "OO", &string, &maxsplit))
1466 return NULL;
1467
1468 /* delegate to Python code */
1469 return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
1470}
1471
1472static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001473pattern_findall(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001474{
Guido van Rossumb700df92000-03-31 14:59:30 +00001475 SRE_STATE state;
1476 PyObject* string;
1477 PyObject* list;
1478 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001479 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001480
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001481 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001482 if (!string)
1483 return NULL;
1484
1485 list = PyList_New(0);
1486
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001487 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001488
1489 PyObject* item;
1490
1491 state.ptr = state.start;
1492
1493 if (state.charsize == 1) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001494 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +00001495 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001496#if defined(HAVE_UNICODE)
1497 status = sre_usearch(&state, PatternObject_GetCode(self));
1498#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001499 }
1500
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001501 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001502
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001503 /* don't bother to build a match object */
1504 switch (self->groups) {
1505 case 0:
1506 item = PySequence_GetSlice(
1507 string,
1508 ((char*) state.start - (char*) state.beginning) /
1509 state.charsize,
1510 ((char*) state.ptr - (char*) state.beginning) /
1511 state.charsize);
1512 if (!item)
1513 goto error;
1514 break;
1515 case 1:
1516 item = state_getslice(&state, 1, string);
1517 if (!item)
1518 goto error;
1519 break;
1520 default:
1521 item = PyTuple_New(self->groups);
1522 if (!item)
1523 goto error;
1524 for (i = 0; i < self->groups; i++) {
1525 PyObject* o = state_getslice(&state, i+1, string);
1526 if (!o) {
1527 Py_DECREF(item);
1528 goto error;
1529 }
1530 PyTuple_SET_ITEM(item, i, o);
1531 }
1532 break;
1533 }
1534
1535 if (PyList_Append(list, item) < 0) {
1536 Py_DECREF(item);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001537 goto error;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001538 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001539
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001540 if (state.ptr == state.start)
1541 state.start = (void*) ((char*) state.ptr + state.charsize);
1542 else
1543 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001544
1545 } else {
1546
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001547 if (status == 0)
1548 break;
1549
Guido van Rossumb700df92000-03-31 14:59:30 +00001550 /* internal error */
1551 PyErr_SetString(
1552 PyExc_RuntimeError,
1553 "internal error in regular expression engine"
1554 );
1555 goto error;
1556
1557 }
1558 }
1559
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001560 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001561 return list;
1562
1563error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001564 Py_DECREF(list);
1565 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001566 return NULL;
1567
1568}
1569
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001570static PyMethodDef pattern_methods[] = {
1571 {"match", (PyCFunction) pattern_match, 1},
1572 {"search", (PyCFunction) pattern_search, 1},
1573 {"sub", (PyCFunction) pattern_sub, 1},
1574 {"subn", (PyCFunction) pattern_subn, 1},
1575 {"split", (PyCFunction) pattern_split, 1},
1576 {"findall", (PyCFunction) pattern_findall, 1},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001577 /* experimental */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001578 {"scanner", (PyCFunction) pattern_scanner, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001579 {NULL, NULL}
1580};
1581
1582static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001583pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001584{
1585 PyObject* res;
1586
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001587 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001588
1589 if (res)
1590 return res;
1591
1592 PyErr_Clear();
1593
1594 /* attributes */
1595 if (!strcmp(name, "pattern")) {
1596 Py_INCREF(self->pattern);
1597 return self->pattern;
1598 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001599
1600 if (!strcmp(name, "flags"))
1601 return Py_BuildValue("i", self->flags);
1602
Fredrik Lundh01016fe2000-06-30 00:27:46 +00001603 if (!strcmp(name, "groups"))
1604 return Py_BuildValue("i", self->groups);
1605
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001606 if (!strcmp(name, "groupindex") && self->groupindex) {
1607 Py_INCREF(self->groupindex);
1608 return self->groupindex;
1609 }
1610
Guido van Rossumb700df92000-03-31 14:59:30 +00001611 PyErr_SetString(PyExc_AttributeError, name);
1612 return NULL;
1613}
1614
1615statichere PyTypeObject Pattern_Type = {
1616 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001617 0, "SRE_Pattern", sizeof(PatternObject), 0,
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001618 (destructor)pattern_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001619 0, /*tp_print*/
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001620 (getattrfunc)pattern_getattr, /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001621};
1622
1623/* -------------------------------------------------------------------- */
1624/* match methods */
1625
1626static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001627match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001628{
1629 Py_XDECREF(self->string);
1630 Py_DECREF(self->pattern);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001631 PyMem_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001632}
1633
1634static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001635match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001636{
1637 if (index < 0 || index >= self->groups) {
1638 /* raise IndexError if we were given a bad group number */
1639 PyErr_SetString(
1640 PyExc_IndexError,
1641 "no such group"
1642 );
1643 return NULL;
1644 }
1645
1646 if (self->string == Py_None || self->mark[index+index] < 0) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001647 /* return default value if the string or group is undefined */
1648 Py_INCREF(def);
1649 return def;
Guido van Rossumb700df92000-03-31 14:59:30 +00001650 }
1651
1652 return PySequence_GetSlice(
1653 self->string, self->mark[index+index], self->mark[index+index+1]
1654 );
1655}
1656
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001657static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001658match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001659{
1660 if (!PyInt_Check(index) && self->pattern->groupindex != NULL) {
1661 /* FIXME: resource leak? */
1662 index = PyObject_GetItem(self->pattern->groupindex, index);
1663 if (!index)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001664 return -1;
Guido van Rossumb700df92000-03-31 14:59:30 +00001665 }
1666
1667 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001668 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001669
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001670 return -1;
1671}
1672
1673static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001674match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001675{
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001676 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001677}
1678
1679static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001680match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001681{
1682 PyObject* result;
1683 int i, size;
1684
1685 size = PyTuple_GET_SIZE(args);
1686
1687 switch (size) {
1688 case 0:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001689 result = match_getslice(self, Py_False, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001690 break;
1691 case 1:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001692 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001693 break;
1694 default:
1695 /* fetch multiple items */
1696 result = PyTuple_New(size);
1697 if (!result)
1698 return NULL;
1699 for (i = 0; i < size; i++) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001700 PyObject* item = match_getslice(
1701 self, PyTuple_GET_ITEM(args, i), Py_None
1702 );
Guido van Rossumb700df92000-03-31 14:59:30 +00001703 if (!item) {
1704 Py_DECREF(result);
1705 return NULL;
1706 }
1707 PyTuple_SET_ITEM(result, i, item);
1708 }
1709 break;
1710 }
1711 return result;
1712}
1713
1714static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001715match_groups(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001716{
1717 PyObject* result;
1718 int index;
1719
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001720 PyObject* def = Py_None;
1721 if (!PyArg_ParseTuple(args, "|O", &def))
1722 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001723
Guido van Rossumb700df92000-03-31 14:59:30 +00001724 result = PyTuple_New(self->groups-1);
1725 if (!result)
1726 return NULL;
1727
1728 for (index = 1; index < self->groups; index++) {
1729 PyObject* item;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001730 item = match_getslice_by_index(self, index, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001731 if (!item) {
1732 Py_DECREF(result);
1733 return NULL;
1734 }
1735 PyTuple_SET_ITEM(result, index-1, item);
1736 }
1737
1738 return result;
1739}
1740
1741static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001742match_groupdict(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001743{
1744 PyObject* result;
1745 PyObject* keys;
1746 int index;
1747
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001748 PyObject* def = Py_None;
1749 if (!PyArg_ParseTuple(args, "|O", &def))
1750 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001751
Guido van Rossumb700df92000-03-31 14:59:30 +00001752 result = PyDict_New();
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001753 if (!result || !self->pattern->groupindex)
Guido van Rossumb700df92000-03-31 14:59:30 +00001754 return result;
1755
1756 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001757 if (!keys) {
1758 Py_DECREF(result);
Guido van Rossumb700df92000-03-31 14:59:30 +00001759 return NULL;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001760 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001761
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001762 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001763 PyObject* key;
1764 PyObject* item;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001765 key = PyList_GET_ITEM(keys, index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001766 if (!key) {
1767 Py_DECREF(keys);
1768 Py_DECREF(result);
1769 return NULL;
1770 }
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001771 item = match_getslice(self, key, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001772 if (!item) {
1773 Py_DECREF(key);
1774 Py_DECREF(keys);
1775 Py_DECREF(result);
1776 return NULL;
1777 }
1778 /* FIXME: <fl> this can fail, right? */
1779 PyDict_SetItem(result, key, item);
1780 }
1781
1782 Py_DECREF(keys);
1783
1784 return result;
1785}
1786
1787static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001788match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001789{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001790 int index;
1791
1792 PyObject* index_ = Py_False;
1793 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001794 return NULL;
1795
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001796 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001797
Guido van Rossumb700df92000-03-31 14:59:30 +00001798 if (index < 0 || index >= self->groups) {
1799 PyErr_SetString(
1800 PyExc_IndexError,
1801 "no such group"
1802 );
1803 return NULL;
1804 }
1805
1806 if (self->mark[index*2] < 0) {
1807 Py_INCREF(Py_None);
1808 return Py_None;
1809 }
1810
1811 return Py_BuildValue("i", self->mark[index*2]);
1812}
1813
1814static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001815match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001816{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001817 int index;
1818
1819 PyObject* index_ = Py_False;
1820 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001821 return NULL;
1822
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001823 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001824
Guido van Rossumb700df92000-03-31 14:59:30 +00001825 if (index < 0 || index >= self->groups) {
1826 PyErr_SetString(
1827 PyExc_IndexError,
1828 "no such group"
1829 );
1830 return NULL;
1831 }
1832
1833 if (self->mark[index*2] < 0) {
1834 Py_INCREF(Py_None);
1835 return Py_None;
1836 }
1837
1838 return Py_BuildValue("i", self->mark[index*2+1]);
1839}
1840
1841static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001842match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001843{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001844 int index;
1845
1846 PyObject* index_ = Py_False;
1847 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001848 return NULL;
1849
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001850 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001851
Guido van Rossumb700df92000-03-31 14:59:30 +00001852 if (index < 0 || index >= self->groups) {
1853 PyErr_SetString(
1854 PyExc_IndexError,
1855 "no such group"
1856 );
1857 return NULL;
1858 }
1859
1860 if (self->mark[index*2] < 0) {
1861 Py_INCREF(Py_None);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001862 Py_INCREF(Py_None);
1863 return Py_BuildValue("OO", Py_None, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001864 }
1865
1866 return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
1867}
1868
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001869static PyMethodDef match_methods[] = {
1870 {"group", (PyCFunction) match_group, 1},
1871 {"start", (PyCFunction) match_start, 1},
1872 {"end", (PyCFunction) match_end, 1},
1873 {"span", (PyCFunction) match_span, 1},
1874 {"groups", (PyCFunction) match_groups, 1},
1875 {"groupdict", (PyCFunction) match_groupdict, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001876 {NULL, NULL}
1877};
1878
1879static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001880match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001881{
1882 PyObject* res;
1883
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001884 res = Py_FindMethod(match_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001885 if (res)
1886 return res;
1887
1888 PyErr_Clear();
1889
Fredrik Lundhc2301732000-07-02 22:25:39 +00001890 if (!strcmp(name, "lastindex")) {
1891 /* experimental */
1892 if (self->index >= 0)
1893 return Py_BuildValue("i", self->index);
1894 Py_INCREF(Py_None);
1895 return Py_None;
1896 }
1897
1898 if (!strcmp(name, "lastgroup")) {
1899 /* experimental */
1900 if (self->pattern->indexgroup) {
1901 PyObject* result = PySequence_GetItem(
1902 self->pattern->indexgroup, self->index
1903 );
1904 if (result)
1905 return result;
1906 PyErr_Clear();
1907 }
1908 Py_INCREF(Py_None);
1909 return Py_None;
1910 }
1911
Guido van Rossumb700df92000-03-31 14:59:30 +00001912 if (!strcmp(name, "string")) {
1913 Py_INCREF(self->string);
1914 return self->string;
1915 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001916
Guido van Rossumb700df92000-03-31 14:59:30 +00001917 if (!strcmp(name, "re")) {
1918 Py_INCREF(self->pattern);
1919 return (PyObject*) self->pattern;
1920 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001921
Guido van Rossumb700df92000-03-31 14:59:30 +00001922 if (!strcmp(name, "pos"))
1923 return Py_BuildValue("i", 0); /* FIXME */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001924
Guido van Rossumb700df92000-03-31 14:59:30 +00001925 if (!strcmp(name, "endpos"))
1926 return Py_BuildValue("i", 0); /* FIXME */
1927
1928 PyErr_SetString(PyExc_AttributeError, name);
1929 return NULL;
1930}
1931
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
1934
1935statichere PyTypeObject Match_Type = {
1936 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001937 0, "SRE_Match",
Guido van Rossumb700df92000-03-31 14:59:30 +00001938 sizeof(MatchObject), /* size of basic object */
1939 sizeof(int), /* space for group item */
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001940 (destructor)match_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001941 0, /*tp_print*/
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001942 (getattrfunc)match_getattr, /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001943};
1944
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001945/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001946/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001947
1948static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001949scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001950{
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001951 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001952 Py_DECREF(self->string);
1953 Py_DECREF(self->pattern);
1954 PyMem_DEL(self);
1955}
1956
1957static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001958scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001959{
1960 SRE_STATE* state = &self->state;
1961 PyObject* match;
1962 int status;
1963
1964 state->ptr = state->start;
1965
1966 if (state->charsize == 1) {
1967 status = sre_match(state, PatternObject_GetCode(self->pattern));
1968 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001969#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001970 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001971#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001972 }
1973
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001974 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001975 state, self->string, status);
1976
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001977 if (status == 0 || state->ptr == state->start)
1978 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001979 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001980 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001981
1982 return match;
1983}
1984
1985
1986static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001987scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001988{
1989 SRE_STATE* state = &self->state;
1990 PyObject* match;
1991 int status;
1992
1993 state->ptr = state->start;
1994
1995 if (state->charsize == 1) {
1996 status = sre_search(state, PatternObject_GetCode(self->pattern));
1997 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001998#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001999 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002000#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002001 }
2002
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002003 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002004 state, self->string, status);
2005
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002006 if (status == 0 || state->ptr == state->start)
2007 state->start = (void*) ((char*) state->ptr + state->charsize);
2008 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002009 state->start = state->ptr;
2010
2011 return match;
2012}
2013
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002014static PyMethodDef scanner_methods[] = {
2015 {"match", (PyCFunction) scanner_match, 0},
2016 {"search", (PyCFunction) scanner_search, 0},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002017 {NULL, NULL}
2018};
2019
2020static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002021scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002022{
2023 PyObject* res;
2024
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002025 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002026 if (res)
2027 return res;
2028
2029 PyErr_Clear();
2030
2031 /* attributes */
2032 if (!strcmp(name, "pattern")) {
2033 Py_INCREF(self->pattern);
2034 return self->pattern;
2035 }
2036
2037 PyErr_SetString(PyExc_AttributeError, name);
2038 return NULL;
2039}
2040
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002041statichere PyTypeObject Scanner_Type = {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002042 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002043 0, "SRE_Scanner",
2044 sizeof(ScannerObject), /* size of basic object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002045 0,
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002046 (destructor)scanner_dealloc, /*tp_dealloc*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002047 0, /*tp_print*/
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002048 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002049};
2050
Guido van Rossumb700df92000-03-31 14:59:30 +00002051static PyMethodDef _functions[] = {
2052 {"compile", _compile, 1},
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002053 {"getcodesize", sre_codesize, 1},
Fredrik Lundhb389df32000-06-29 12:48:37 +00002054 {"getlower", sre_getlower, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00002055 {NULL, NULL}
2056};
2057
2058void
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002059#if defined(WIN32)
Guido van Rossumb700df92000-03-31 14:59:30 +00002060__declspec(dllexport)
2061#endif
2062init_sre()
2063{
2064 /* Patch object types */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002065 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002066 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002067
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002068 Py_InitModule("_" MODULE, _functions);
Guido van Rossumb700df92000-03-31 14:59:30 +00002069}
2070
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002071#endif /* !defined(SRE_RECURSIVE) */