blob: 3bc023789a285acd1d7a8dda3724d95c2f4d5c9a [file] [log] [blame]
/* -*- Mode: C; tab-width: 4 -*-
 *
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 99-10-24 fl  created (based on existing template matcher code)
 * 99-11-13 fl  added categories, branching, and more (0.2)
 * 99-11-16 fl  some tweaks to compile on non-Windows platforms
 * 99-12-18 fl  non-literals, generic maximizing repeat (0.3)
 * 00-02-28 fl  tons of changes (not all to the better ;-) (0.4)
 * 00-03-06 fl  first alpha, sort of (0.5)
 * 00-03-14 fl  removed most compatibility stuff (0.6)
 * 00-05-10 fl  towards third alpha (0.8.2)
 * 00-05-13 fl  added experimental scanner stuff (0.8.3)
 * 00-05-27 fl  final bug hunt (0.8.4)
 * 00-06-21 fl  fewer bugs, more taste (0.8.5)
 * 00-06-25 fl  major changes to better deal with nested repeats (0.9)
 * 00-06-28 fl  fixed findall (0.9.1)
 * 00-06-29 fl  fixed split, added more scanner features (0.9.2)
 * 00-06-30 fl  added fast search optimization (0.9.3)
 * 00-06-30 fl  added assert (lookahead) primitives, etc (0.9.4)
 *
 * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 2.0 integration and
 * other compatibility work.
 */
31
32#ifndef SRE_RECURSIVE
33
Fredrik Lundh43b3b492000-06-30 10:41:31 +000034char copyright[] = " SRE 0.9.4 Copyright (c) 1997-2000 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000035
36#include "Python.h"
37
38#include "sre.h"
39
Guido van Rossumb700df92000-03-31 14:59:30 +000040#if defined(HAVE_LIMITS_H)
41#include <limits.h>
42#else
43#define INT_MAX 2147483647
44#endif
45
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000046#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000047
Fredrik Lundh436c3d582000-06-29 08:58:44 +000048/* name of this module, minus the leading underscore */
49#define MODULE "sre"
50
Guido van Rossumb700df92000-03-31 14:59:30 +000051/* defining this one enables tracing */
52#undef DEBUG
53
Fredrik Lundh436c3d582000-06-29 08:58:44 +000054#if PY_VERSION_HEX >= 0x01060000
Fredrik Lundh22d25462000-07-01 17:50:59 +000055/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000056#define HAVE_UNICODE
57#endif
58
Fredrik Lundh29c08be2000-06-29 23:33:12 +000059/* optional features */
60#define USE_FAST_SEARCH
61
Fredrik Lundh80946112000-06-29 18:03:25 +000062#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000063#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
64/* fastest possible local call under MSVC */
65#define LOCAL(type) static __inline type __fastcall
66#else
Fredrik Lundh29c08be2000-06-29 23:33:12 +000067#define LOCAL(type) static inline type
Guido van Rossumb700df92000-03-31 14:59:30 +000068#endif
69
70/* error codes */
71#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
72#define SRE_ERROR_MEMORY -9 /* out of memory */
73
Fredrik Lundh436c3d582000-06-29 08:58:44 +000074#if defined(DEBUG)
Guido van Rossumb700df92000-03-31 14:59:30 +000075#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000076#else
77#define TRACE(v)
78#endif
79
Fredrik Lundh436c3d582000-06-29 08:58:44 +000080#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
Guido van Rossumb700df92000-03-31 14:59:30 +000081
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000082/* -------------------------------------------------------------------- */
83/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000084
Fredrik Lundh436c3d582000-06-29 08:58:44 +000085/* default character predicates (run sre_chars.py to regenerate tables) */
86
/* bit flags stored for each character in sre_char_info */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* classification flags for the 128 ASCII code points, sixteen entries
   per row; each entry is an OR of the SRE_*_MASK bits above
   (digits = 25, letters = 24, '_' = 16, '\n' = 6, other blanks = 2) */
static char sre_char_info[128] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /* 0x60 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};
100
/* ASCII lower-case mapping, sixteen entries per row: 'A'..'Z' (65..90)
   map to 'a'..'z' (97..122), every other code point maps to itself */
static char sre_char_lower[128] = {
    /* 0x00 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /* 0x10 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /* 0x20 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /* 0x30 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /* 0x40 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
               93, 94, 95,
    /* 0x60 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
               124, 125, 126, 127
};

/* locale-independent lower-casing: ASCII characters go through the
   table above, anything at or beyond 128 is returned unchanged */
static unsigned int sre_lower(unsigned int ch)
{
    if (ch >= 128)
        return ch;
    return sre_char_lower[ch];
}
115
/* look up one flag bit for ch in sre_char_info; code points at or
   beyond 128 have no flags.  yields the masked bit (non-zero) or 0.
   note that ch is evaluated twice */
#define SRE_ASCII_FLAG(ch, mask)\
    ((ch) < 128 ? (sre_char_info[(ch)] & (mask)) : 0)

#define SRE_IS_DIGIT(ch) SRE_ASCII_FLAG(ch, SRE_DIGIT_MASK)
#define SRE_IS_SPACE(ch) SRE_ASCII_FLAG(ch, SRE_SPACE_MASK)
#define SRE_IS_LINEBREAK(ch) SRE_ASCII_FLAG(ch, SRE_LINEBREAK_MASK)
#define SRE_IS_ALNUM(ch) SRE_ASCII_FLAG(ch, SRE_ALNUM_MASK)
#define SRE_IS_WORD(ch) SRE_ASCII_FLAG(ch, SRE_WORD_MASK)
Guido van Rossumb700df92000-03-31 14:59:30 +0000126
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000127/* locale-specific character predicates */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000128
/* lower-case ch using the C library's current locale.  only values
   below 256 are handed to tolower(); larger values come back as is */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int) tolower((int) ch);
}
/* <ctype.h> based predicates; values at or beyond 256 are never passed
   to the library and simply classify as "no" */
#define SRE_LOC_IS_DIGIT(ch) ((ch) >= 256 ? 0 : isdigit((ch)))
#define SRE_LOC_IS_SPACE(ch) ((ch) >= 256 ? 0 : isspace((ch)))
#define SRE_LOC_IS_LINEBREAK(ch) ('\n' == (ch))
#define SRE_LOC_IS_ALNUM(ch) ((ch) >= 256 ? 0 : isalnum((ch)))
/* a word character is an underscore or anything alphanumeric */
#define SRE_LOC_IS_WORD(ch) ((ch) == '_' || SRE_LOC_IS_ALNUM((ch)))
138
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000139/* unicode-specific character predicates */
140
141#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000142static unsigned int sre_lower_unicode(unsigned int ch)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000143{
144 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
145}
Fredrik Lundh22d25462000-07-01 17:50:59 +0000146
#if !defined(Py_UNICODE_ISALNUM)
/* FIXME: workaround. should be fixed in unicodectype.c */
/* fallback used only when Python.h does not supply the predicate:
   alphanumeric = lower, upper or title case letter, or digit */
#define Py_UNICODE_ISALNUM(ch)\
    (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISUPPER(ch) ||\
     Py_UNICODE_ISTITLE(ch) || Py_UNICODE_ISDIGIT(ch))
#endif

/* unicode predicates; the argument is narrowed to Py_UNICODE before
   being handed to the Py_UNICODE_* classification macros */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
/* a word character is anything alphanumeric *by the unicode tables*,
   or an underscore.  this must build on SRE_UNI_IS_ALNUM: the plain
   SRE_IS_ALNUM only knows ASCII, which would make every non-ASCII
   letter or digit a non-word character */
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
159#endif
160
Guido van Rossumb700df92000-03-31 14:59:30 +0000161LOCAL(int)
162sre_category(SRE_CODE category, unsigned int ch)
163{
164 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000165
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000166 case SRE_CATEGORY_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000167 return SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000168 case SRE_CATEGORY_NOT_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000169 return !SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000170 case SRE_CATEGORY_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000171 return SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000172 case SRE_CATEGORY_NOT_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000173 return !SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000174 case SRE_CATEGORY_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000175 return SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000176 case SRE_CATEGORY_NOT_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000177 return !SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000178 case SRE_CATEGORY_LINEBREAK:
179 return SRE_IS_LINEBREAK(ch);
180 case SRE_CATEGORY_NOT_LINEBREAK:
181 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000182
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000183 case SRE_CATEGORY_LOC_WORD:
184 return SRE_LOC_IS_WORD(ch);
185 case SRE_CATEGORY_LOC_NOT_WORD:
186 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000187
188#if defined(HAVE_UNICODE)
189 case SRE_CATEGORY_UNI_DIGIT:
190 return SRE_UNI_IS_DIGIT(ch);
191 case SRE_CATEGORY_UNI_NOT_DIGIT:
192 return !SRE_UNI_IS_DIGIT(ch);
193 case SRE_CATEGORY_UNI_SPACE:
194 return SRE_UNI_IS_SPACE(ch);
195 case SRE_CATEGORY_UNI_NOT_SPACE:
196 return !SRE_UNI_IS_SPACE(ch);
197 case SRE_CATEGORY_UNI_WORD:
198 return SRE_UNI_IS_WORD(ch);
199 case SRE_CATEGORY_UNI_NOT_WORD:
200 return !SRE_UNI_IS_WORD(ch);
201 case SRE_CATEGORY_UNI_LINEBREAK:
202 return SRE_UNI_IS_LINEBREAK(ch);
203 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
204 return !SRE_UNI_IS_LINEBREAK(ch);
205#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000206 }
207 return 0;
208}
209
210/* helpers */
211
212LOCAL(int)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000213stack_free(SRE_STATE* state)
Guido van Rossumb700df92000-03-31 14:59:30 +0000214{
215 if (state->stack) {
216 TRACE(("release stack\n"));
217 free(state->stack);
218 state->stack = NULL;
219 }
220 state->stacksize = 0;
221 return 0;
222}
223
224static int /* shouldn't be LOCAL */
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000225stack_extend(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000226{
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000227 SRE_STACK* stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000228 int stacksize;
229
230 /* grow the stack to a suitable size; we need at least lo entries,
231 at most hi entries. if for some reason hi is lower than lo, lo
232 wins */
233
234 stacksize = state->stacksize;
235
236 if (stacksize == 0) {
237 /* create new stack */
238 stacksize = 512;
239 if (stacksize < lo)
240 stacksize = lo;
241 else if (stacksize > hi)
242 stacksize = hi;
243 TRACE(("allocate stack %d\n", stacksize));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000244 stack = malloc(sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000245 } else {
246 /* grow the stack (typically by a factor of two) */
247 while (stacksize < lo)
248 stacksize = 2 * stacksize;
249 /* FIXME: <fl> could trim size if it's larger than lo, and
250 much larger than hi */
251 TRACE(("grow stack to %d\n", stacksize));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000252 stack = realloc(state->stack, sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000253 }
254
255 if (!stack) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000256 stack_free(state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000257 return SRE_ERROR_MEMORY;
258 }
259
260 state->stack = stack;
261 state->stacksize = stacksize;
262
263 return 0;
264}
265
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000266/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000267
268#define SRE_CHAR unsigned char
269#define SRE_AT sre_at
270#define SRE_MEMBER sre_member
271#define SRE_MATCH sre_match
272#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000273
274#if defined(HAVE_UNICODE)
275
Guido van Rossumb700df92000-03-31 14:59:30 +0000276#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000277#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000278#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000279
Guido van Rossumb700df92000-03-31 14:59:30 +0000280#undef SRE_SEARCH
281#undef SRE_MATCH
282#undef SRE_MEMBER
283#undef SRE_AT
284#undef SRE_CHAR
285
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000286/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000287
288#define SRE_CHAR Py_UNICODE
289#define SRE_AT sre_uat
290#define SRE_MEMBER sre_umember
291#define SRE_MATCH sre_umatch
292#define SRE_SEARCH sre_usearch
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000293#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000294
295#endif /* SRE_RECURSIVE */
296
297/* -------------------------------------------------------------------- */
298/* String matching engine */
299
300/* the following section is compiled twice, with different character
301 settings */
302
303LOCAL(int)
304SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
305{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000306 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000307
308 int this, that;
309
310 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000311
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000312 case SRE_AT_BEGINNING:
Guido van Rossum29530882000-04-10 17:06:55 +0000313 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000314
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000315 case SRE_AT_BEGINNING_LINE:
316 return ((void*) ptr == state->beginning ||
317 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000318
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000319 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000320 return (((void*) (ptr+1) == state->end &&
321 SRE_IS_LINEBREAK((int) ptr[0])) ||
322 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000323
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000324 case SRE_AT_END_LINE:
325 return ((void*) ptr == state->end ||
326 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000327
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000328 case SRE_AT_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000329 if (state->beginning == state->end)
330 return 0;
331 that = ((void*) ptr > state->beginning) ?
332 SRE_IS_WORD((int) ptr[-1]) : 0;
333 this = ((void*) ptr < state->end) ?
334 SRE_IS_WORD((int) ptr[0]) : 0;
335 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000336
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000337 case SRE_AT_NON_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000338 if (state->beginning == state->end)
339 return 0;
340 that = ((void*) ptr > state->beginning) ?
341 SRE_IS_WORD((int) ptr[-1]) : 0;
342 this = ((void*) ptr < state->end) ?
343 SRE_IS_WORD((int) ptr[0]) : 0;
344 return this == that;
345 }
346
347 return 0;
348}
349
350LOCAL(int)
Fredrik Lundh0640e112000-06-30 13:55:15 +0000351SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000352{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000353 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000354
355 int ok = 1;
356
357 for (;;) {
358 switch (*set++) {
359
360 case SRE_OP_NEGATE:
361 ok = !ok;
362 break;
363
364 case SRE_OP_FAILURE:
365 return !ok;
366
367 case SRE_OP_LITERAL:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000368 /* args: <literal> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000369 if (ch == set[0])
Guido van Rossumb700df92000-03-31 14:59:30 +0000370 return ok;
371 set++;
372 break;
373
374 case SRE_OP_RANGE:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000375 /* args: <lower> <upper> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000376 if (set[0] <= ch && ch <= set[1])
Guido van Rossumb700df92000-03-31 14:59:30 +0000377 return ok;
378 set += 2;
379 break;
380
Fredrik Lundh3562f112000-07-02 12:00:07 +0000381 case SRE_OP_CHARSET:
382 /* args: <bitmap> (16 bits per code word) */
383 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
384 return ok;
385 set += 16;
386 break;
387
Guido van Rossumb700df92000-03-31 14:59:30 +0000388 case SRE_OP_CATEGORY:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000389 /* args: <category> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000390 if (sre_category(set[0], (int) ch))
391 return ok;
392 set += 1;
393 break;
394
395 default:
Fredrik Lundh80946112000-06-29 18:03:25 +0000396 /* internal error -- there's not much we can do about it
397 here, so let's just pretend it didn't match... */
Guido van Rossumb700df92000-03-31 14:59:30 +0000398 return 0;
399 }
400 }
401}
402
403LOCAL(int)
404SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
405{
406 /* check if string matches the given pattern. returns -1 for
407 error, 0 for failure, and 1 for success */
408
409 SRE_CHAR* end = state->end;
410 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000411 int stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000412 int stackbase;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000413 int lastmark;
Guido van Rossumb700df92000-03-31 14:59:30 +0000414 int i, count;
415
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000416 /* FIXME: this is a hack! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000417 void* mark_copy[SRE_MARK_SIZE];
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000418 void* mark = NULL;
419
420 TRACE(("%8d: enter\n", PTR(ptr)));
421
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000422 if (pattern[0] == SRE_OP_INFO) {
423 /* optimization info block */
424 /* args: <1=skip> <2=flags> <3=min> ... */
425 if (pattern[3] && (end - ptr) < pattern[3]) {
426 TRACE(("reject (got %d chars, need %d)\n",
427 (end - ptr), pattern[3]));
428 return 0;
429 }
430 pattern += pattern[1] + 1;
431 }
432
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000433 stackbase = stack = state->stackbase;
434 lastmark = state->lastmark;
435
436 retry:
Guido van Rossumb700df92000-03-31 14:59:30 +0000437
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000438 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000439
440 switch (*pattern++) {
441
442 case SRE_OP_FAILURE:
443 /* immediate failure */
444 TRACE(("%8d: failure\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000445 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000446
447 case SRE_OP_SUCCESS:
448 /* end of pattern */
449 TRACE(("%8d: success\n", PTR(ptr)));
450 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000451 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000452
453 case SRE_OP_AT:
454 /* match at given position */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000455 /* args: <at> */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000456 TRACE(("%8d: position %d\n", PTR(ptr), *pattern));
Guido van Rossumb700df92000-03-31 14:59:30 +0000457 if (!SRE_AT(state, ptr, *pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000458 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000459 pattern++;
460 break;
461
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000462 case SRE_OP_CATEGORY:
463 /* match at given category */
464 /* args: <category> */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000465 TRACE(("%8d: category %d [category %d]\n", PTR(ptr),
466 *ptr, *pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000467 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
468 goto failure;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000469 TRACE(("%8d: category ok\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000470 pattern++;
471 ptr++;
472 break;
473
Guido van Rossumb700df92000-03-31 14:59:30 +0000474 case SRE_OP_LITERAL:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000475 /* match literal string */
Guido van Rossumb700df92000-03-31 14:59:30 +0000476 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000477 TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));
478 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000479 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000480 pattern++;
481 ptr++;
482 break;
483
484 case SRE_OP_NOT_LITERAL:
485 /* match anything that is not literal character */
486 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000487 TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));
488 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000489 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000490 pattern++;
491 ptr++;
492 break;
493
494 case SRE_OP_ANY:
495 /* match anything */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000496 TRACE(("%8d: anything\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000497 if (ptr >= end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000498 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000499 ptr++;
500 break;
501
502 case SRE_OP_IN:
503 /* match set member (or non_member) */
504 /* args: <skip> <set> */
505 TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
506 if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000507 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000508 pattern += pattern[0];
509 ptr++;
510 break;
511
512 case SRE_OP_GROUP:
513 /* match backreference */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000514 TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000515 i = pattern[0];
516 {
Guido van Rossumb700df92000-03-31 14:59:30 +0000517 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
518 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
519 if (!p || !e || e < p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000520 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000521 while (p < e) {
522 if (ptr >= end || *ptr != *p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000523 goto failure;
524 p++; ptr++;
525 }
526 }
527 pattern++;
528 break;
529
530 case SRE_OP_GROUP_IGNORE:
531 /* match backreference */
532 TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
533 i = pattern[0];
534 {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000535 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
536 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000537 if (!p || !e || e < p)
538 goto failure;
539 while (p < e) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000540 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000541 state->lower(*ptr) != state->lower(*p))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000542 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000543 p++; ptr++;
544 }
545 }
546 pattern++;
547 break;
548
549 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000550 TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000551 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000552 state->lower(*ptr) != state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000553 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000554 pattern++;
555 ptr++;
556 break;
557
558 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000559 TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000560 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000561 state->lower(*ptr) == state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000562 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000563 pattern++;
564 ptr++;
565 break;
566
567 case SRE_OP_IN_IGNORE:
568 TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
569 if (ptr >= end
Fredrik Lundh0640e112000-06-30 13:55:15 +0000570 || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr)))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000571 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000572 pattern += pattern[0];
573 ptr++;
574 break;
575
576 case SRE_OP_MARK:
577 /* set mark */
578 /* args: <mark> */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000579 TRACE(("%8d: set mark %d\n", PTR(ptr), pattern[0]));
580 if (state->lastmark < pattern[0])
581 state->lastmark = pattern[0];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000582 if (!mark) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000583 mark = mark_copy;
584 memcpy(mark, state->mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000585 }
586 state->mark[pattern[0]] = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000587 pattern++;
588 break;
589
590 case SRE_OP_JUMP:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000591 case SRE_OP_INFO:
Guido van Rossumb700df92000-03-31 14:59:30 +0000592 /* jump forward */
593 /* args: <skip> */
594 TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
595 pattern += pattern[0];
596 break;
597
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000598 case SRE_OP_ASSERT:
599 /* assert subpattern */
Guido van Rossumb700df92000-03-31 14:59:30 +0000600 /* args: <skip> <pattern> */
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000601 TRACE(("%8d: assert subpattern\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000602 state->ptr = ptr;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000603 i = SRE_MATCH(state, pattern + 1);
604 if (i < 0)
605 return i;
606 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000607 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000608 pattern += pattern[0];
Guido van Rossumb700df92000-03-31 14:59:30 +0000609 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000610
611 case SRE_OP_ASSERT_NOT:
612 /* assert not subpattern */
613 /* args: <skip> <pattern> */
614 TRACE(("%8d: assert not subpattern\n", PTR(ptr)));
615 state->ptr = ptr;
616 i = SRE_MATCH(state, pattern + 1);
617 if (i < 0)
618 return i;
619 if (i)
620 goto failure;
621 pattern += pattern[0];
622 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000623
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000624#if 0
Guido van Rossumb700df92000-03-31 14:59:30 +0000625 case SRE_OP_MAX_REPEAT_ONE:
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000626 /* match repeated sequence (maximizing regexp) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000627
628 /* this operator only works if the repeated item is
629 exactly one character wide, and we're not already
630 collecting backtracking points. for other cases,
631 use the MAX_REPEAT operator instead */
632
Guido van Rossumb700df92000-03-31 14:59:30 +0000633 /* args: <skip> <min> <max> <step> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000634 TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
635 pattern[1], pattern[2]));
636
637 count = 0;
638
639 if (pattern[3] == SRE_OP_ANY) {
640 /* repeated wildcard. skip to the end of the target
641 string, and backtrack from there */
642 /* FIXME: must look for line endings */
643 if (ptr + pattern[1] > end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000644 goto failure; /* cannot match */
Guido van Rossumb700df92000-03-31 14:59:30 +0000645 count = pattern[2];
646 if (count > end - ptr)
647 count = end - ptr;
648 ptr += count;
649
650 } else if (pattern[3] == SRE_OP_LITERAL) {
651 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000652 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000653 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000654 if (ptr >= end || (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000655 break;
656 ptr++;
657 count++;
658 }
659
660 } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
661 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000662 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000663 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000664 if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000665 break;
666 ptr++;
667 count++;
668 }
669
670 } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
671 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000672 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000673 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000674 if (ptr >= end || (SRE_CODE) ptr[0] == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000675 break;
676 ptr++;
677 count++;
678 }
679
680 } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
681 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000682 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000683 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000684 if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000685 break;
686 ptr++;
687 count++;
688 }
689
690 } else if (pattern[3] == SRE_OP_IN) {
691 /* repeated set */
692 while (count < (int) pattern[2]) {
693 if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
694 break;
695 ptr++;
696 count++;
697 }
698
699 } else {
700 /* repeated single character pattern */
701 state->ptr = ptr;
702 while (count < (int) pattern[2]) {
703 i = SRE_MATCH(state, pattern + 3);
704 if (i < 0)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000705 return i;
706 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000707 break;
708 count++;
709 }
710 state->ptr = ptr;
711 ptr += count;
712 }
713
714 /* when we arrive here, count contains the number of
715 matches, and ptr points to the tail of the target
716 string. check if the rest of the pattern matches, and
717 backtrack if not. */
718
Guido van Rossumb700df92000-03-31 14:59:30 +0000719 TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
720
721 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000722 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000723
724 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
725 /* tail is empty. we're finished */
726 TRACE(("%8d: tail is empty\n", PTR(ptr)));
727 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000728 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000729
730 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000731 /* tail starts with a literal. skip positions where
732 the rest of the pattern cannot possibly match */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000733 SRE_CODE chr = pattern[pattern[0]+1];
Guido van Rossumb700df92000-03-31 14:59:30 +0000734 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
735 for (;;) {
736 TRACE(("%8d: scan for tail match\n", PTR(ptr)));
737 while (count >= (int) pattern[1] &&
738 (ptr >= end || *ptr != chr)) {
739 ptr--;
740 count--;
741 }
742 TRACE(("%8d: check tail\n", PTR(ptr)));
743 if (count < (int) pattern[1])
744 break;
745 state->ptr = ptr;
746 i = SRE_MATCH(state, pattern + pattern[0]);
747 if (i > 0) {
748 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000749 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000750 }
751 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
752 ptr--;
753 count--;
754 }
755
756 } else {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000757 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +0000758 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
759 while (count >= (int) pattern[1]) {
760 state->ptr = ptr;
761 i = SRE_MATCH(state, pattern + pattern[0]);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000762 if (i < 0)
763 return i;
764 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000765 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000766 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000767 }
768 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
769 ptr--;
770 count--;
771 }
772 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000773 goto failure;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000774#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000775
776 case SRE_OP_MAX_REPEAT:
777 /* match repeated sequence (maximizing regexp). repeated
778 group should end with a MAX_UNTIL code */
779
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000780 /* args: <skip> <min> <max> <item> */
781
782 TRACE(("%8d: max repeat (%d %d)\n", PTR(ptr),
Guido van Rossumb700df92000-03-31 14:59:30 +0000783 pattern[1], pattern[2]));
784
785 count = 0;
786 state->ptr = ptr;
787
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000788 /* match minimum number of items */
789 while (count < (int) pattern[1]) {
790 i = SRE_MATCH(state, pattern + 3);
791 if (i < 0)
792 return i;
793 if (!i)
794 goto failure;
795 if (state->ptr == ptr) {
796 /* if the match was successful but empty, set the
797 count to max and terminate the scanning loop */
798 count = (int) pattern[2];
799 break;
800 }
801 count++;
802 ptr = state->ptr;
803 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000804
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000805 TRACE(("%8d: found %d leading items\n", PTR(ptr), count));
Guido van Rossumb700df92000-03-31 14:59:30 +0000806
807 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000808 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000809
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000810 /* match maximum number of items, pushing alternate end
811 points to the stack */
Guido van Rossumb700df92000-03-31 14:59:30 +0000812
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000813 while (pattern[2] == 32767 || count < (int) pattern[2]) {
814 state->stackbase = stack;
815 i = SRE_MATCH(state, pattern + 3);
816 state->stackbase = stackbase; /* rewind */
817 if (i < 0)
818 return i;
819 if (!i)
820 break;
821 if (state->ptr == ptr) {
822 count = (int) pattern[2];
823 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000824 }
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000825 /* this position was valid; add it to the retry
826 stack */
827 if (stack >= state->stacksize) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000828 i = stack_extend(state, stack + 1,
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000829 stackbase + pattern[2]);
830 if (i < 0)
831 return i; /* out of memory */
832 }
833 TRACE(("%8d: stack[%d] = %d\n", PTR(ptr), stack, PTR(ptr)));
834 state->stack[stack].ptr = ptr;
835 state->stack[stack].pattern = pattern + pattern[0];
836 stack++;
837 /* move forward */
838 ptr = state->ptr;
839 count++;
Guido van Rossumb700df92000-03-31 14:59:30 +0000840 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000841
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000842 /* when we get here, count is the number of successful
843 matches, and ptr points to the tail. */
Guido van Rossumb700df92000-03-31 14:59:30 +0000844
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000845 TRACE(("%8d: skip +%d\n", PTR(ptr), pattern[0]));
846
847 pattern += pattern[0];
848 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000849
850 case SRE_OP_MIN_REPEAT:
851 /* match repeated sequence (minimizing regexp) */
852 TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
853 pattern[1], pattern[2]));
854 count = 0;
855 state->ptr = ptr;
856 /* match minimum number of items */
857 while (count < (int) pattern[1]) {
858 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000859 if (i < 0)
860 return i;
861 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000862 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000863 count++;
864 }
865 /* move forward until the tail matches. */
866 while (count <= (int) pattern[2]) {
867 ptr = state->ptr;
868 i = SRE_MATCH(state, pattern + pattern[0]);
869 if (i > 0) {
870 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000871 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000872 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000873 state->ptr = ptr; /* backtrack */
874 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000875 if (i < 0)
876 return i;
877 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000878 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000879 count++;
880 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000881 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000882
Guido van Rossumb700df92000-03-31 14:59:30 +0000883 case SRE_OP_BRANCH:
884 /* match one of several subpatterns */
885 /* format: <branch> <size> <head> ... <null> <tail> */
886 TRACE(("%8d: branch\n", PTR(ptr)));
887 while (*pattern) {
888 if (pattern[1] != SRE_OP_LITERAL ||
Fredrik Lundh0640e112000-06-30 13:55:15 +0000889 (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000890 TRACE(("%8d: branch check\n", PTR(ptr)));
891 state->ptr = ptr;
892 i = SRE_MATCH(state, pattern + 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000893 if (i < 0)
894 return i;
895 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000896 TRACE(("%8d: branch succeeded\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000897 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000898 }
899 }
900 pattern += *pattern;
901 }
902 TRACE(("%8d: branch failed\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000903 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000904
905 case SRE_OP_REPEAT:
906 /* TEMPLATE: match repeated sequence (no backtracking) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000907 /* args: <skip> <min> <max> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000908 TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
909 count = 0;
910 state->ptr = ptr;
911 while (count < (int) pattern[2]) {
912 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000913 if (i < 0)
914 return i;
915 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000916 break;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000917 if (state->ptr == ptr) {
918 count = (int) pattern[2];
919 break;
920 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000921 count++;
922 }
923 if (count <= (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000924 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000925 TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
926 pattern += pattern[0];
927 ptr = state->ptr;
928 break;
929
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000930 default:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000931 TRACE(("%8d: unknown opcode %d\n", PTR(ptr), pattern[-1]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000932 return SRE_ERROR_ILLEGAL;
933 }
934 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000935
936 failure:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000937 if (stack-- > stackbase) {
938 ptr = state->stack[stack].ptr;
939 pattern = state->stack[stack].pattern;
940 TRACE(("%8d: retry (%d)\n", PTR(ptr), stack));
941 goto retry;
942 }
943 TRACE(("%8d: leave (failure)\n", PTR(ptr)));
944 state->stackbase = stackbase;
945 state->lastmark = lastmark;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000946 if (mark)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000947 memcpy(state->mark, mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000948 return 0;
949
950 success:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000951 TRACE(("%8d: leave (success)\n", PTR(ptr)));
952 state->stackbase = stackbase;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000953 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000954}
955
956LOCAL(int)
957SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
958{
959 SRE_CHAR* ptr = state->start;
960 SRE_CHAR* end = state->end;
961 int status = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +0000962 int prefix_len;
963 SRE_CODE* prefix = NULL;
964 SRE_CODE* charset = NULL;
965 SRE_CODE* overlap = NULL;
966 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000967
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000968 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000969 /* optimization info block */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000970 /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
971
972 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000973
974 if (pattern[3] > 0) {
975 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +0000976 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000977 end -= pattern[3]-1;
978 if (end <= ptr)
979 end = ptr+1;
980 }
981
Fredrik Lundh3562f112000-07-02 12:00:07 +0000982 if (flags & SRE_INFO_PREFIX) {
983 prefix_len = pattern[5];
984 prefix = pattern + 6;
985 overlap = prefix + prefix_len - 1;
986 } else if (flags & SRE_INFO_CHARSET)
987 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000988
989 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000990 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000991
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000992#if defined(USE_FAST_SEARCH)
Fredrik Lundh3562f112000-07-02 12:00:07 +0000993 if (prefix && overlap && prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000994 /* pattern starts with a known prefix. use the overlap
995 table to skip forward as fast as we possibly can */
996 int i = 0;
997 end = state->end;
998 while (ptr < end) {
999 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001000 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001001 if (!i)
1002 break;
1003 else
1004 i = overlap[i];
1005 } else {
1006 if (++i == prefix_len) {
1007 /* found a potential match */
1008 TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
1009 state->start = ptr - prefix_len + 1;
1010 state->ptr = ptr + 1;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001011 if (flags & SRE_INFO_LITERAL)
1012 return 1; /* we got all of it */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001013 status = SRE_MATCH(state, pattern + 2*prefix_len);
1014 if (status != 0)
1015 return status;
1016 /* close but no cigar -- try again */
1017 i = overlap[i];
1018 }
1019 break;
1020 }
1021
1022 }
1023 ptr++;
1024 }
1025 return 0;
1026 }
1027#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001028
Fredrik Lundh3562f112000-07-02 12:00:07 +00001029 if (pattern[0] == SRE_OP_LITERAL) {
1030 /* pattern starts with a literal character. this is used
1031 for short prefixes, and if fast search is disabled */
Fredrik Lundh0640e112000-06-30 13:55:15 +00001032 SRE_CODE chr = pattern[1];
Guido van Rossumb700df92000-03-31 14:59:30 +00001033 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001034 while (ptr < end && (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +00001035 ptr++;
1036 if (ptr == end)
1037 return 0;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001038 TRACE(("%8d: === SEARCH === literal\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001039 state->start = ptr;
1040 state->ptr = ++ptr;
1041 status = SRE_MATCH(state, pattern + 2);
1042 if (status != 0)
1043 break;
1044 }
Fredrik Lundh3562f112000-07-02 12:00:07 +00001045#if 0
1046 } else if (charset) {
1047 /* pattern starts with a character from a known set */
1048 for (;;) {
1049 while (ptr < end && !SRE_MEMBER(charset, ptr[0]))
1050 ptr++;
1051 if (ptr == end)
1052 return 0;
1053 TRACE(("%8d: === SEARCH === charset\n", PTR(ptr)));
1054 state->start = ptr;
1055 state->ptr = ptr;
1056 status = SRE_MATCH(state, pattern);
1057 if (status != 0)
1058 break;
1059 }
1060#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001061 } else
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001062 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +00001063 while (ptr <= end) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001064 TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001065 state->start = state->ptr = ptr++;
1066 status = SRE_MATCH(state, pattern);
1067 if (status != 0)
1068 break;
1069 }
1070
1071 return status;
1072}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001073
Guido van Rossumb700df92000-03-31 14:59:30 +00001074
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001075#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001076
1077/* -------------------------------------------------------------------- */
1078/* factories and destructors */
1079
1080/* see sre.h for object declarations */
1081
1082staticforward PyTypeObject Pattern_Type;
1083staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001084staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001085
1086static PyObject *
1087_compile(PyObject* self_, PyObject* args)
1088{
1089 /* "compile" pattern descriptor to pattern object */
1090
1091 PatternObject* self;
1092
1093 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001094 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001095 PyObject* code;
1096 int groups = 0;
1097 PyObject* groupindex = NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001098 if (!PyArg_ParseTuple(args, "OiO!|iO", &pattern, &flags,
1099 &PyString_Type, &code,
1100 &groups, &groupindex))
Guido van Rossumb700df92000-03-31 14:59:30 +00001101 return NULL;
1102
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001103 self = PyObject_NEW(PatternObject, &Pattern_Type);
Guido van Rossumb700df92000-03-31 14:59:30 +00001104 if (self == NULL)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001105
Guido van Rossumb700df92000-03-31 14:59:30 +00001106 return NULL;
1107
1108 Py_INCREF(pattern);
1109 self->pattern = pattern;
1110
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001111 self->flags = flags;
1112
Guido van Rossumb700df92000-03-31 14:59:30 +00001113 Py_INCREF(code);
1114 self->code = code;
1115
1116 self->groups = groups;
1117
1118 Py_XINCREF(groupindex);
1119 self->groupindex = groupindex;
1120
1121 return (PyObject*) self;
1122}
1123
1124static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001125sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001126{
1127 return Py_BuildValue("i", sizeof(SRE_CODE));
1128}
1129
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001130static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001131sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001132{
1133 int character, flags;
1134 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1135 return NULL;
1136 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001137 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001138#if defined(HAVE_UNICODE)
1139 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001140 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001141#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001142 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001143}
1144
Guido van Rossumb700df92000-03-31 14:59:30 +00001145LOCAL(PyObject*)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001146state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001147{
1148 /* prepare state object */
1149
1150 PyBufferProcs *buffer;
1151 int i, count;
1152 void* ptr;
1153
1154 PyObject* string;
1155 int start = 0;
1156 int end = INT_MAX;
1157 if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
1158 return NULL;
1159
1160 /* get pointer to string buffer */
1161 buffer = string->ob_type->tp_as_buffer;
1162 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1163 buffer->bf_getsegcount(string, NULL) != 1) {
1164 PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
1165 return NULL;
1166 }
1167
1168 /* determine buffer size */
1169 count = buffer->bf_getreadbuffer(string, 0, &ptr);
1170 if (count < 0) {
1171 /* sanity check */
1172 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1173 return NULL;
1174 }
1175
1176 /* determine character size */
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001177#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001178 state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001179#else
1180 state->charsize = 1;
1181#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001182
1183 count /= state->charsize;
1184
1185 /* adjust boundaries */
1186 if (start < 0)
1187 start = 0;
1188 else if (start > count)
1189 start = count;
1190
1191 if (end < 0)
1192 end = 0;
1193 else if (end > count)
1194 end = count;
1195
1196 state->beginning = ptr;
1197
1198 state->start = (void*) ((char*) ptr + start * state->charsize);
1199 state->end = (void*) ((char*) ptr + end * state->charsize);
1200
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001201 state->lastmark = 0;
1202
Guido van Rossumb700df92000-03-31 14:59:30 +00001203 /* FIXME: dynamic! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001204 for (i = 0; i < SRE_MARK_SIZE; i++)
Guido van Rossumb700df92000-03-31 14:59:30 +00001205 state->mark[i] = NULL;
1206
1207 state->stack = NULL;
1208 state->stackbase = 0;
1209 state->stacksize = 0;
1210
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001211 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001212 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001213#if defined(HAVE_UNICODE)
1214 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001215 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001216#endif
1217 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001218 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001219
Guido van Rossumb700df92000-03-31 14:59:30 +00001220 return string;
1221}
1222
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001223LOCAL(void)
1224state_fini(SRE_STATE* state)
1225{
1226 stack_free(state);
1227}
1228
1229LOCAL(PyObject*)
1230state_getslice(SRE_STATE* state, int index, PyObject* string)
1231{
1232 index = (index - 1) * 2;
1233
1234 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
1235 Py_INCREF(Py_None);
1236 return Py_None;
1237 }
1238
1239 return PySequence_GetSlice(
1240 string,
1241 ((char*)state->mark[index] - (char*)state->beginning) /
1242 state->charsize,
1243 ((char*)state->mark[index+1] - (char*)state->beginning) /
1244 state->charsize
1245 );
1246}
1247
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001248static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001249pattern_new_match(PatternObject* pattern, SRE_STATE* state,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001250 PyObject* string, int status)
1251{
1252 /* create match object (from state object) */
1253
1254 MatchObject* match;
1255 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001256 char* base;
1257 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001258
1259 if (status > 0) {
1260
1261 /* create match object (with room for extra group marks) */
1262 match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
1263 if (match == NULL)
1264 return NULL;
1265
1266 Py_INCREF(pattern);
1267 match->pattern = pattern;
1268
1269 Py_INCREF(string);
1270 match->string = string;
1271
1272 match->groups = pattern->groups+1;
1273
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001274 base = (char*) state->beginning;
1275 n = state->charsize;
1276
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001277 /* group zero */
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001278 match->mark[0] = ((char*) state->start - base) / n;
1279 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001280
1281 /* fill in the rest of the groups */
1282 for (i = j = 0; i < pattern->groups; i++, j+=2)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001283 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1284 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1285 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001286 } else
1287 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1288
1289 return (PyObject*) match;
1290
1291 } else if (status < 0) {
1292
1293 /* internal error */
1294 PyErr_SetString(
1295 PyExc_RuntimeError, "internal error in regular expression engine"
1296 );
1297 return NULL;
1298
1299 }
1300
1301 Py_INCREF(Py_None);
1302 return Py_None;
1303}
1304
1305static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001306pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001307{
1308 /* create search state object */
1309
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001310 ScannerObject* self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001311 PyObject* string;
1312
1313 /* create match object (with room for extra group marks) */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001314 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001315 if (self == NULL)
1316 return NULL;
1317
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001318 string = state_init(&self->state, pattern, args);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001319 if (!string) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001320 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001321 return NULL;
1322 }
1323
1324 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001325 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001326
1327 Py_INCREF(string);
1328 self->string = string;
1329
1330 return (PyObject*) self;
1331}
1332
Guido van Rossumb700df92000-03-31 14:59:30 +00001333static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001334pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001335{
1336 Py_XDECREF(self->code);
1337 Py_XDECREF(self->pattern);
1338 Py_XDECREF(self->groupindex);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001339 PyMem_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001340}
1341
1342static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001343pattern_match(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001344{
1345 SRE_STATE state;
1346 PyObject* string;
1347 int status;
1348
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001349 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001350 if (!string)
1351 return NULL;
1352
1353 state.ptr = state.start;
1354
1355 if (state.charsize == 1) {
1356 status = sre_match(&state, PatternObject_GetCode(self));
1357 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001358#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001359 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001360#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001361 }
1362
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001363 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001364
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001365 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001366}
1367
1368static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001369pattern_search(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001370{
1371 SRE_STATE state;
1372 PyObject* string;
1373 int status;
1374
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001375 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001376 if (!string)
1377 return NULL;
1378
1379 if (state.charsize == 1) {
1380 status = sre_search(&state, PatternObject_GetCode(self));
1381 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001382#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001383 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001384#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001385 }
1386
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001387 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001388
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001389 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001390}
1391
1392static PyObject*
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001393call(char* function, PyObject* args)
1394{
1395 PyObject* name;
1396 PyObject* module;
1397 PyObject* func;
1398 PyObject* result;
1399
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001400 name = PyString_FromString(MODULE);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001401 if (!name)
1402 return NULL;
1403 module = PyImport_Import(name);
1404 Py_DECREF(name);
1405 if (!module)
1406 return NULL;
1407 func = PyObject_GetAttrString(module, function);
1408 Py_DECREF(module);
1409 if (!func)
1410 return NULL;
1411 result = PyObject_CallObject(func, args);
1412 Py_DECREF(func);
1413 Py_DECREF(args);
1414 return result;
1415}
1416
1417static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001418pattern_sub(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001419{
1420 PyObject* template;
1421 PyObject* string;
1422 PyObject* count;
1423 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1424 return NULL;
1425
1426 /* delegate to Python code */
1427 return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
1428}
1429
1430static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001431pattern_subn(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001432{
1433 PyObject* template;
1434 PyObject* string;
1435 PyObject* count;
1436 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1437 return NULL;
1438
1439 /* delegate to Python code */
1440 return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
1441}
1442
1443static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001444pattern_split(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001445{
1446 PyObject* string;
1447 PyObject* maxsplit;
1448 if (!PyArg_ParseTuple(args, "OO", &string, &maxsplit))
1449 return NULL;
1450
1451 /* delegate to Python code */
1452 return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
1453}
1454
1455static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001456pattern_findall(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001457{
Guido van Rossumb700df92000-03-31 14:59:30 +00001458 SRE_STATE state;
1459 PyObject* string;
1460 PyObject* list;
1461 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001462 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001463
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001464 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001465 if (!string)
1466 return NULL;
1467
1468 list = PyList_New(0);
1469
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001470 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001471
1472 PyObject* item;
1473
1474 state.ptr = state.start;
1475
1476 if (state.charsize == 1) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001477 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +00001478 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001479#if defined(HAVE_UNICODE)
1480 status = sre_usearch(&state, PatternObject_GetCode(self));
1481#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001482 }
1483
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001484 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001485
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001486 /* don't bother to build a match object */
1487 switch (self->groups) {
1488 case 0:
1489 item = PySequence_GetSlice(
1490 string,
1491 ((char*) state.start - (char*) state.beginning) /
1492 state.charsize,
1493 ((char*) state.ptr - (char*) state.beginning) /
1494 state.charsize);
1495 if (!item)
1496 goto error;
1497 break;
1498 case 1:
1499 item = state_getslice(&state, 1, string);
1500 if (!item)
1501 goto error;
1502 break;
1503 default:
1504 item = PyTuple_New(self->groups);
1505 if (!item)
1506 goto error;
1507 for (i = 0; i < self->groups; i++) {
1508 PyObject* o = state_getslice(&state, i+1, string);
1509 if (!o) {
1510 Py_DECREF(item);
1511 goto error;
1512 }
1513 PyTuple_SET_ITEM(item, i, o);
1514 }
1515 break;
1516 }
1517
1518 if (PyList_Append(list, item) < 0) {
1519 Py_DECREF(item);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001520 goto error;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001521 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001522
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001523 if (state.ptr == state.start)
1524 state.start = (void*) ((char*) state.ptr + state.charsize);
1525 else
1526 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001527
1528 } else {
1529
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001530 if (status == 0)
1531 break;
1532
Guido van Rossumb700df92000-03-31 14:59:30 +00001533 /* internal error */
1534 PyErr_SetString(
1535 PyExc_RuntimeError,
1536 "internal error in regular expression engine"
1537 );
1538 goto error;
1539
1540 }
1541 }
1542
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001543 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001544 return list;
1545
1546error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001547 Py_DECREF(list);
1548 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001549 return NULL;
1550
1551}
1552
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001553static PyMethodDef pattern_methods[] = {
1554 {"match", (PyCFunction) pattern_match, 1},
1555 {"search", (PyCFunction) pattern_search, 1},
1556 {"sub", (PyCFunction) pattern_sub, 1},
1557 {"subn", (PyCFunction) pattern_subn, 1},
1558 {"split", (PyCFunction) pattern_split, 1},
1559 {"findall", (PyCFunction) pattern_findall, 1},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001560 /* experimental */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001561 {"scanner", (PyCFunction) pattern_scanner, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001562 {NULL, NULL}
1563};
1564
1565static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001566pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001567{
1568 PyObject* res;
1569
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001570 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001571
1572 if (res)
1573 return res;
1574
1575 PyErr_Clear();
1576
1577 /* attributes */
1578 if (!strcmp(name, "pattern")) {
1579 Py_INCREF(self->pattern);
1580 return self->pattern;
1581 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001582
1583 if (!strcmp(name, "flags"))
1584 return Py_BuildValue("i", self->flags);
1585
Fredrik Lundh01016fe2000-06-30 00:27:46 +00001586 if (!strcmp(name, "groups"))
1587 return Py_BuildValue("i", self->groups);
1588
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001589 if (!strcmp(name, "groupindex") && self->groupindex) {
1590 Py_INCREF(self->groupindex);
1591 return self->groupindex;
1592 }
1593
Guido van Rossumb700df92000-03-31 14:59:30 +00001594 PyErr_SetString(PyExc_AttributeError, name);
1595 return NULL;
1596}
1597
1598statichere PyTypeObject Pattern_Type = {
1599 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001600 0, "SRE_Pattern", sizeof(PatternObject), 0,
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001601 (destructor)pattern_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001602 0, /*tp_print*/
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001603 (getattrfunc)pattern_getattr, /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001604};
1605
1606/* -------------------------------------------------------------------- */
1607/* match methods */
1608
1609static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001610match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001611{
1612 Py_XDECREF(self->string);
1613 Py_DECREF(self->pattern);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001614 PyMem_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001615}
1616
1617static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001618match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001619{
1620 if (index < 0 || index >= self->groups) {
1621 /* raise IndexError if we were given a bad group number */
1622 PyErr_SetString(
1623 PyExc_IndexError,
1624 "no such group"
1625 );
1626 return NULL;
1627 }
1628
1629 if (self->string == Py_None || self->mark[index+index] < 0) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001630 /* return default value if the string or group is undefined */
1631 Py_INCREF(def);
1632 return def;
Guido van Rossumb700df92000-03-31 14:59:30 +00001633 }
1634
1635 return PySequence_GetSlice(
1636 self->string, self->mark[index+index], self->mark[index+index+1]
1637 );
1638}
1639
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001640static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001641match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001642{
1643 if (!PyInt_Check(index) && self->pattern->groupindex != NULL) {
1644 /* FIXME: resource leak? */
1645 index = PyObject_GetItem(self->pattern->groupindex, index);
1646 if (!index)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001647 return -1;
Guido van Rossumb700df92000-03-31 14:59:30 +00001648 }
1649
1650 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001651 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001652
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001653 return -1;
1654}
1655
1656static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001657match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001658{
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001659 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001660}
1661
1662static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001663match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001664{
1665 PyObject* result;
1666 int i, size;
1667
1668 size = PyTuple_GET_SIZE(args);
1669
1670 switch (size) {
1671 case 0:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001672 result = match_getslice(self, Py_False, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001673 break;
1674 case 1:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001675 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001676 break;
1677 default:
1678 /* fetch multiple items */
1679 result = PyTuple_New(size);
1680 if (!result)
1681 return NULL;
1682 for (i = 0; i < size; i++) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001683 PyObject* item = match_getslice(
1684 self, PyTuple_GET_ITEM(args, i), Py_None
1685 );
Guido van Rossumb700df92000-03-31 14:59:30 +00001686 if (!item) {
1687 Py_DECREF(result);
1688 return NULL;
1689 }
1690 PyTuple_SET_ITEM(result, i, item);
1691 }
1692 break;
1693 }
1694 return result;
1695}
1696
1697static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001698match_groups(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001699{
1700 PyObject* result;
1701 int index;
1702
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001703 PyObject* def = Py_None;
1704 if (!PyArg_ParseTuple(args, "|O", &def))
1705 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001706
Guido van Rossumb700df92000-03-31 14:59:30 +00001707 result = PyTuple_New(self->groups-1);
1708 if (!result)
1709 return NULL;
1710
1711 for (index = 1; index < self->groups; index++) {
1712 PyObject* item;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001713 item = match_getslice_by_index(self, index, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001714 if (!item) {
1715 Py_DECREF(result);
1716 return NULL;
1717 }
1718 PyTuple_SET_ITEM(result, index-1, item);
1719 }
1720
1721 return result;
1722}
1723
1724static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001725match_groupdict(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001726{
1727 PyObject* result;
1728 PyObject* keys;
1729 int index;
1730
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001731 PyObject* def = Py_None;
1732 if (!PyArg_ParseTuple(args, "|O", &def))
1733 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001734
Guido van Rossumb700df92000-03-31 14:59:30 +00001735 result = PyDict_New();
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001736 if (!result || !self->pattern->groupindex)
Guido van Rossumb700df92000-03-31 14:59:30 +00001737 return result;
1738
1739 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001740 if (!keys) {
1741 Py_DECREF(result);
Guido van Rossumb700df92000-03-31 14:59:30 +00001742 return NULL;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001743 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001744
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001745 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001746 PyObject* key;
1747 PyObject* item;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001748 key = PyList_GET_ITEM(keys, index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001749 if (!key) {
1750 Py_DECREF(keys);
1751 Py_DECREF(result);
1752 return NULL;
1753 }
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001754 item = match_getslice(self, key, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001755 if (!item) {
1756 Py_DECREF(key);
1757 Py_DECREF(keys);
1758 Py_DECREF(result);
1759 return NULL;
1760 }
1761 /* FIXME: <fl> this can fail, right? */
1762 PyDict_SetItem(result, key, item);
1763 }
1764
1765 Py_DECREF(keys);
1766
1767 return result;
1768}
1769
1770static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001771match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001772{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001773 int index;
1774
1775 PyObject* index_ = Py_False;
1776 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001777 return NULL;
1778
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001779 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001780
Guido van Rossumb700df92000-03-31 14:59:30 +00001781 if (index < 0 || index >= self->groups) {
1782 PyErr_SetString(
1783 PyExc_IndexError,
1784 "no such group"
1785 );
1786 return NULL;
1787 }
1788
1789 if (self->mark[index*2] < 0) {
1790 Py_INCREF(Py_None);
1791 return Py_None;
1792 }
1793
1794 return Py_BuildValue("i", self->mark[index*2]);
1795}
1796
1797static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001798match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001799{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001800 int index;
1801
1802 PyObject* index_ = Py_False;
1803 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001804 return NULL;
1805
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001806 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001807
Guido van Rossumb700df92000-03-31 14:59:30 +00001808 if (index < 0 || index >= self->groups) {
1809 PyErr_SetString(
1810 PyExc_IndexError,
1811 "no such group"
1812 );
1813 return NULL;
1814 }
1815
1816 if (self->mark[index*2] < 0) {
1817 Py_INCREF(Py_None);
1818 return Py_None;
1819 }
1820
1821 return Py_BuildValue("i", self->mark[index*2+1]);
1822}
1823
1824static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001825match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001826{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001827 int index;
1828
1829 PyObject* index_ = Py_False;
1830 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001831 return NULL;
1832
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001833 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001834
Guido van Rossumb700df92000-03-31 14:59:30 +00001835 if (index < 0 || index >= self->groups) {
1836 PyErr_SetString(
1837 PyExc_IndexError,
1838 "no such group"
1839 );
1840 return NULL;
1841 }
1842
1843 if (self->mark[index*2] < 0) {
1844 Py_INCREF(Py_None);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001845 Py_INCREF(Py_None);
1846 return Py_BuildValue("OO", Py_None, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001847 }
1848
1849 return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
1850}
1851
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001852static PyMethodDef match_methods[] = {
1853 {"group", (PyCFunction) match_group, 1},
1854 {"start", (PyCFunction) match_start, 1},
1855 {"end", (PyCFunction) match_end, 1},
1856 {"span", (PyCFunction) match_span, 1},
1857 {"groups", (PyCFunction) match_groups, 1},
1858 {"groupdict", (PyCFunction) match_groupdict, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001859 {NULL, NULL}
1860};
1861
1862static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001863match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001864{
1865 PyObject* res;
1866
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001867 res = Py_FindMethod(match_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001868 if (res)
1869 return res;
1870
1871 PyErr_Clear();
1872
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001873 /* attributes */
Guido van Rossumb700df92000-03-31 14:59:30 +00001874 if (!strcmp(name, "string")) {
1875 Py_INCREF(self->string);
1876 return self->string;
1877 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001878
Guido van Rossumb700df92000-03-31 14:59:30 +00001879 if (!strcmp(name, "re")) {
1880 Py_INCREF(self->pattern);
1881 return (PyObject*) self->pattern;
1882 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001883
Guido van Rossumb700df92000-03-31 14:59:30 +00001884 if (!strcmp(name, "pos"))
1885 return Py_BuildValue("i", 0); /* FIXME */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001886
Guido van Rossumb700df92000-03-31 14:59:30 +00001887 if (!strcmp(name, "endpos"))
1888 return Py_BuildValue("i", 0); /* FIXME */
1889
1890 PyErr_SetString(PyExc_AttributeError, name);
1891 return NULL;
1892}
1893
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
1896
1897statichere PyTypeObject Match_Type = {
1898 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001899 0, "SRE_Match",
Guido van Rossumb700df92000-03-31 14:59:30 +00001900 sizeof(MatchObject), /* size of basic object */
1901 sizeof(int), /* space for group item */
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001902 (destructor)match_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001903 0, /*tp_print*/
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001904 (getattrfunc)match_getattr, /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001905};
1906
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001907/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001908/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001909
1910static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001911scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001912{
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001913 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001914 Py_DECREF(self->string);
1915 Py_DECREF(self->pattern);
1916 PyMem_DEL(self);
1917}
1918
1919static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001920scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001921{
1922 SRE_STATE* state = &self->state;
1923 PyObject* match;
1924 int status;
1925
1926 state->ptr = state->start;
1927
1928 if (state->charsize == 1) {
1929 status = sre_match(state, PatternObject_GetCode(self->pattern));
1930 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001931#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001932 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001933#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001934 }
1935
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001936 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001937 state, self->string, status);
1938
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001939 if (status == 0 || state->ptr == state->start)
1940 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001941 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001942 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001943
1944 return match;
1945}
1946
1947
1948static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001949scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001950{
1951 SRE_STATE* state = &self->state;
1952 PyObject* match;
1953 int status;
1954
1955 state->ptr = state->start;
1956
1957 if (state->charsize == 1) {
1958 status = sre_search(state, PatternObject_GetCode(self->pattern));
1959 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001960#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001961 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001962#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001963 }
1964
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001965 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001966 state, self->string, status);
1967
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001968 if (status == 0 || state->ptr == state->start)
1969 state->start = (void*) ((char*) state->ptr + state->charsize);
1970 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001971 state->start = state->ptr;
1972
1973 return match;
1974}
1975
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001976static PyMethodDef scanner_methods[] = {
1977 {"match", (PyCFunction) scanner_match, 0},
1978 {"search", (PyCFunction) scanner_search, 0},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001979 {NULL, NULL}
1980};
1981
1982static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001983scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001984{
1985 PyObject* res;
1986
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001987 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001988 if (res)
1989 return res;
1990
1991 PyErr_Clear();
1992
1993 /* attributes */
1994 if (!strcmp(name, "pattern")) {
1995 Py_INCREF(self->pattern);
1996 return self->pattern;
1997 }
1998
1999 PyErr_SetString(PyExc_AttributeError, name);
2000 return NULL;
2001}
2002
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002003statichere PyTypeObject Scanner_Type = {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002004 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002005 0, "SRE_Scanner",
2006 sizeof(ScannerObject), /* size of basic object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002007 0,
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002008 (destructor)scanner_dealloc, /*tp_dealloc*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002009 0, /*tp_print*/
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002010 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002011};
2012
Guido van Rossumb700df92000-03-31 14:59:30 +00002013static PyMethodDef _functions[] = {
2014 {"compile", _compile, 1},
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002015 {"getcodesize", sre_codesize, 1},
Fredrik Lundhb389df32000-06-29 12:48:37 +00002016 {"getlower", sre_getlower, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00002017 {NULL, NULL}
2018};
2019
2020void
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002021#if defined(WIN32)
Guido van Rossumb700df92000-03-31 14:59:30 +00002022__declspec(dllexport)
2023#endif
2024init_sre()
2025{
2026 /* Patch object types */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002027 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002028 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002029
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002030 Py_InitModule("_" MODULE, _functions);
Guido van Rossumb700df92000-03-31 14:59:30 +00002031}
2032
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002033#endif /* !defined(SRE_RECURSIVE) */