blob: e11a8923dca3e00a34e24fd59bfea0afe7ed3dd3 [file] [log] [blame]
/* -*- Mode: C; tab-width: 4 -*-
 *
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 99-10-24 fl created (based on existing template matcher code)
 * 99-11-13 fl added categories, branching, and more (0.2)
 * 99-11-16 fl some tweaks to compile on non-Windows platforms
 * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
 * 00-02-28 fl tons of changes (not all to the better ;-) (0.4)
 * 00-03-06 fl first alpha, sort of (0.5)
 * 00-03-14 fl removed most compatibility stuff (0.6)
 * 00-05-10 fl towards third alpha (0.8.2)
 * 00-05-13 fl added experimental scanner stuff (0.8.3)
 * 00-05-27 fl final bug hunt (0.8.4)
 * 00-06-21 fl less bugs, more taste (0.8.5)
 * 00-06-25 fl major changes to better deal with nested repeats (0.9)
 * 00-06-28 fl fixed findall (0.9.1)
 * 00-06-29 fl fixed split, added more scanner features (0.9.2)
 * 00-06-30 fl added fast search optimization (0.9.3)
 * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
 * 00-07-02 fl added charset optimizations, etc (0.9.5)
 *
 * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI. Hewlett-Packard provided funding for 2.0 integration and
 * other compatibility work.
 */
32
33#ifndef SRE_RECURSIVE
34
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +000035char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000036
37#include "Python.h"
38
39#include "sre.h"
40
Guido van Rossumb700df92000-03-31 14:59:30 +000041#if defined(HAVE_LIMITS_H)
42#include <limits.h>
43#else
44#define INT_MAX 2147483647
45#endif
46
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000047#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000048
Fredrik Lundh436c3d582000-06-29 08:58:44 +000049/* name of this module, minus the leading underscore */
50#define MODULE "sre"
51
Guido van Rossumb700df92000-03-31 14:59:30 +000052/* defining this one enables tracing */
53#undef DEBUG
54
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055#if PY_VERSION_HEX >= 0x01060000
Fredrik Lundh22d25462000-07-01 17:50:59 +000056/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000057#define HAVE_UNICODE
58#endif
59
Fredrik Lundh29c08be2000-06-29 23:33:12 +000060/* optional features */
61#define USE_FAST_SEARCH
62
Fredrik Lundh80946112000-06-29 18:03:25 +000063#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000064#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
65/* fastest possible local call under MSVC */
66#define LOCAL(type) static __inline type __fastcall
67#else
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define LOCAL(type) static inline type
Guido van Rossumb700df92000-03-31 14:59:30 +000069#endif
70
71/* error codes */
72#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
73#define SRE_ERROR_MEMORY -9 /* out of memory */
74
Fredrik Lundh436c3d582000-06-29 08:58:44 +000075#if defined(DEBUG)
Guido van Rossumb700df92000-03-31 14:59:30 +000076#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000077#else
78#define TRACE(v)
79#endif
80
Fredrik Lundh436c3d582000-06-29 08:58:44 +000081#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
Guido van Rossumb700df92000-03-31 14:59:30 +000082
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000083/* -------------------------------------------------------------------- */
84/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000085
Fredrik Lundh436c3d582000-06-29 08:58:44 +000086/* default character predicates (run sre_chars.py to regenerate tables) */
87
/* bit flags stored in sre_char_info; one entry may carry several */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* per-character property flags for character codes 0..127, indexed by
   character code.  each entry is an OR of the SRE_*_MASK bits above:
   25 = digit|alnum|word, 24 = alnum|word, 16 = word only ('_'),
   6 = space|linebreak ('\n'), 2 = space only */
static char sre_char_info[128] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x20 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /* 0x60 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};
101
/* lower-case mapping for character codes 0..127, indexed by character
   code.  'A'..'Z' (65..90) map to 'a'..'z' (97..122); every other
   entry maps to itself */
static char sre_char_lower[128] = {
    /* 0x00 */   0,   1,   2,   3,   4,   5,   6,   7,
                 8,   9,  10,  11,  12,  13,  14,  15,
    /* 0x10 */  16,  17,  18,  19,  20,  21,  22,  23,
                24,  25,  26,  27,  28,  29,  30,  31,
    /* 0x20 */  32,  33,  34,  35,  36,  37,  38,  39,
                40,  41,  42,  43,  44,  45,  46,  47,
    /* 0x30 */  48,  49,  50,  51,  52,  53,  54,  55,
                56,  57,  58,  59,  60,  61,  62,  63,
    /* 0x40 */  64,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122,  91,  92,  93,  94,  95,
    /* 0x60 */  96,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122, 123, 124, 125, 126, 127
};

/* convert a character code to lower case using the table above.
   codes of 128 and above are returned unchanged */
static unsigned int sre_lower(unsigned int ch)
{
    if (ch >= 128)
        return ch;
    return sre_char_lower[ch];
}
116
/* look up ch in sre_char_info and keep only the bits selected by mask.
   evaluates to the (non-zero) mask bits when the flag is set, and to 0
   when it is clear or when ch is 128 or above.  note that ch is
   evaluated more than once */
#define SRE_CHAR_INFO_TEST(ch, mask)\
    ((ch) < 128 ? (sre_char_info[(ch)] & (mask)) : 0)

#define SRE_IS_DIGIT(ch) SRE_CHAR_INFO_TEST((ch), SRE_DIGIT_MASK)
#define SRE_IS_SPACE(ch) SRE_CHAR_INFO_TEST((ch), SRE_SPACE_MASK)
#define SRE_IS_LINEBREAK(ch) SRE_CHAR_INFO_TEST((ch), SRE_LINEBREAK_MASK)
#define SRE_IS_ALNUM(ch) SRE_CHAR_INFO_TEST((ch), SRE_ALNUM_MASK)
#define SRE_IS_WORD(ch) SRE_CHAR_INFO_TEST((ch), SRE_WORD_MASK)
Guido van Rossumb700df92000-03-31 14:59:30 +0000127
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000128/* locale-specific character predicates */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000129
/* convert a character code to lower case with tolower() from <ctype.h>.
   codes of 256 and above are returned unchanged */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return tolower(ch);
}

/* <ctype.h> based predicates.  codes of 256 and above are never passed
   to the <ctype.h> functions and never match (except through the
   explicit '\n' and '_' comparisons).  ch is evaluated more than once */
#define SRE_LOC_IS_DIGIT(ch) ((ch) >= 256 ? 0 : isdigit((ch)))
#define SRE_LOC_IS_SPACE(ch) ((ch) >= 256 ? 0 : isspace((ch)))
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) >= 256 ? 0 : isalnum((ch)))
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
139
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000140/* unicode-specific character predicates */
141
#if defined(HAVE_UNICODE)
/* convert a character code to lower case using Py_UNICODE_TOLOWER.
   the code is narrowed to Py_UNICODE before the lookup */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}

#if !defined(Py_UNICODE_ISALNUM)
/* FIXME: workaround. should be fixed in unicodectype.c */
#define Py_UNICODE_ISALNUM(ch)\
    (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISUPPER(ch) ||\
     Py_UNICODE_ISTITLE(ch) || Py_UNICODE_ISDIGIT(ch))
#endif

/* unicode predicates; each one narrows ch to Py_UNICODE and defers to
   the corresponding Py_UNICODE_IS* test */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
/* a word character is a unicode alphanumeric or an underscore.  this
   must be built on SRE_UNI_IS_ALNUM: the table-driven SRE_IS_ALNUM only
   knows codes below 128, which would make every non-ASCII letter or
   digit a non-word character in unicode mode */
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
#endif
161
Guido van Rossumb700df92000-03-31 14:59:30 +0000162LOCAL(int)
163sre_category(SRE_CODE category, unsigned int ch)
164{
165 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000166
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000167 case SRE_CATEGORY_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000168 return SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000169 case SRE_CATEGORY_NOT_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000170 return !SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000171 case SRE_CATEGORY_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000172 return SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000173 case SRE_CATEGORY_NOT_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000174 return !SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000175 case SRE_CATEGORY_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000176 return SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000177 case SRE_CATEGORY_NOT_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000178 return !SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000179 case SRE_CATEGORY_LINEBREAK:
180 return SRE_IS_LINEBREAK(ch);
181 case SRE_CATEGORY_NOT_LINEBREAK:
182 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000183
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000184 case SRE_CATEGORY_LOC_WORD:
185 return SRE_LOC_IS_WORD(ch);
186 case SRE_CATEGORY_LOC_NOT_WORD:
187 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000188
189#if defined(HAVE_UNICODE)
190 case SRE_CATEGORY_UNI_DIGIT:
191 return SRE_UNI_IS_DIGIT(ch);
192 case SRE_CATEGORY_UNI_NOT_DIGIT:
193 return !SRE_UNI_IS_DIGIT(ch);
194 case SRE_CATEGORY_UNI_SPACE:
195 return SRE_UNI_IS_SPACE(ch);
196 case SRE_CATEGORY_UNI_NOT_SPACE:
197 return !SRE_UNI_IS_SPACE(ch);
198 case SRE_CATEGORY_UNI_WORD:
199 return SRE_UNI_IS_WORD(ch);
200 case SRE_CATEGORY_UNI_NOT_WORD:
201 return !SRE_UNI_IS_WORD(ch);
202 case SRE_CATEGORY_UNI_LINEBREAK:
203 return SRE_UNI_IS_LINEBREAK(ch);
204 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
205 return !SRE_UNI_IS_LINEBREAK(ch);
206#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000207 }
208 return 0;
209}
210
211/* helpers */
212
213LOCAL(int)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000214stack_free(SRE_STATE* state)
Guido van Rossumb700df92000-03-31 14:59:30 +0000215{
216 if (state->stack) {
217 TRACE(("release stack\n"));
218 free(state->stack);
219 state->stack = NULL;
220 }
221 state->stacksize = 0;
222 return 0;
223}
224
225static int /* shouldn't be LOCAL */
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000226stack_extend(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000227{
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000228 SRE_STACK* stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000229 int stacksize;
230
231 /* grow the stack to a suitable size; we need at least lo entries,
232 at most hi entries. if for some reason hi is lower than lo, lo
233 wins */
234
235 stacksize = state->stacksize;
236
237 if (stacksize == 0) {
238 /* create new stack */
239 stacksize = 512;
240 if (stacksize < lo)
241 stacksize = lo;
242 else if (stacksize > hi)
243 stacksize = hi;
244 TRACE(("allocate stack %d\n", stacksize));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000245 stack = malloc(sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000246 } else {
247 /* grow the stack (typically by a factor of two) */
248 while (stacksize < lo)
249 stacksize = 2 * stacksize;
250 /* FIXME: <fl> could trim size if it's larger than lo, and
251 much larger than hi */
252 TRACE(("grow stack to %d\n", stacksize));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000253 stack = realloc(state->stack, sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000254 }
255
256 if (!stack) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000257 stack_free(state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000258 return SRE_ERROR_MEMORY;
259 }
260
261 state->stack = stack;
262 state->stacksize = stacksize;
263
264 return 0;
265}
266
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000267/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000268
269#define SRE_CHAR unsigned char
270#define SRE_AT sre_at
271#define SRE_MEMBER sre_member
272#define SRE_MATCH sre_match
273#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000274
275#if defined(HAVE_UNICODE)
276
Guido van Rossumb700df92000-03-31 14:59:30 +0000277#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000278#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000279#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000280
Guido van Rossumb700df92000-03-31 14:59:30 +0000281#undef SRE_SEARCH
282#undef SRE_MATCH
283#undef SRE_MEMBER
284#undef SRE_AT
285#undef SRE_CHAR
286
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000287/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000288
289#define SRE_CHAR Py_UNICODE
290#define SRE_AT sre_uat
291#define SRE_MEMBER sre_umember
292#define SRE_MATCH sre_umatch
293#define SRE_SEARCH sre_usearch
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000294#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000295
296#endif /* SRE_RECURSIVE */
297
298/* -------------------------------------------------------------------- */
299/* String matching engine */
300
301/* the following section is compiled twice, with different character
302 settings */
303
304LOCAL(int)
305SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
306{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000307 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000308
309 int this, that;
310
311 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000312
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000313 case SRE_AT_BEGINNING:
Guido van Rossum29530882000-04-10 17:06:55 +0000314 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000315
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000316 case SRE_AT_BEGINNING_LINE:
317 return ((void*) ptr == state->beginning ||
318 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000319
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000320 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000321 return (((void*) (ptr+1) == state->end &&
322 SRE_IS_LINEBREAK((int) ptr[0])) ||
323 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000324
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000325 case SRE_AT_END_LINE:
326 return ((void*) ptr == state->end ||
327 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000328
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000329 case SRE_AT_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000330 if (state->beginning == state->end)
331 return 0;
332 that = ((void*) ptr > state->beginning) ?
333 SRE_IS_WORD((int) ptr[-1]) : 0;
334 this = ((void*) ptr < state->end) ?
335 SRE_IS_WORD((int) ptr[0]) : 0;
336 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000337
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000338 case SRE_AT_NON_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000339 if (state->beginning == state->end)
340 return 0;
341 that = ((void*) ptr > state->beginning) ?
342 SRE_IS_WORD((int) ptr[-1]) : 0;
343 this = ((void*) ptr < state->end) ?
344 SRE_IS_WORD((int) ptr[0]) : 0;
345 return this == that;
346 }
347
348 return 0;
349}
350
351LOCAL(int)
Fredrik Lundh0640e112000-06-30 13:55:15 +0000352SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000353{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000354 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000355
356 int ok = 1;
357
358 for (;;) {
359 switch (*set++) {
360
361 case SRE_OP_NEGATE:
362 ok = !ok;
363 break;
364
365 case SRE_OP_FAILURE:
366 return !ok;
367
368 case SRE_OP_LITERAL:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000369 /* args: <literal> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000370 if (ch == set[0])
Guido van Rossumb700df92000-03-31 14:59:30 +0000371 return ok;
372 set++;
373 break;
374
375 case SRE_OP_RANGE:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000376 /* args: <lower> <upper> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000377 if (set[0] <= ch && ch <= set[1])
Guido van Rossumb700df92000-03-31 14:59:30 +0000378 return ok;
379 set += 2;
380 break;
381
Fredrik Lundh3562f112000-07-02 12:00:07 +0000382 case SRE_OP_CHARSET:
383 /* args: <bitmap> (16 bits per code word) */
384 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
385 return ok;
386 set += 16;
387 break;
388
Guido van Rossumb700df92000-03-31 14:59:30 +0000389 case SRE_OP_CATEGORY:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000390 /* args: <category> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000391 if (sre_category(set[0], (int) ch))
392 return ok;
393 set += 1;
394 break;
395
396 default:
Fredrik Lundh80946112000-06-29 18:03:25 +0000397 /* internal error -- there's not much we can do about it
398 here, so let's just pretend it didn't match... */
Guido van Rossumb700df92000-03-31 14:59:30 +0000399 return 0;
400 }
401 }
402}
403
404LOCAL(int)
405SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
406{
407 /* check if string matches the given pattern. returns -1 for
408 error, 0 for failure, and 1 for success */
409
410 SRE_CHAR* end = state->end;
411 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000412 int stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000413 int stackbase;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000414 int lastmark;
Guido van Rossumb700df92000-03-31 14:59:30 +0000415 int i, count;
416
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000417 /* FIXME: this is a hack! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000418 void* mark_copy[SRE_MARK_SIZE];
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000419 void* mark = NULL;
420
421 TRACE(("%8d: enter\n", PTR(ptr)));
422
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000423 if (pattern[0] == SRE_OP_INFO) {
424 /* optimization info block */
425 /* args: <1=skip> <2=flags> <3=min> ... */
426 if (pattern[3] && (end - ptr) < pattern[3]) {
427 TRACE(("reject (got %d chars, need %d)\n",
428 (end - ptr), pattern[3]));
429 return 0;
430 }
431 pattern += pattern[1] + 1;
432 }
433
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000434 stackbase = stack = state->stackbase;
435 lastmark = state->lastmark;
436
437 retry:
Guido van Rossumb700df92000-03-31 14:59:30 +0000438
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000439 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000440
441 switch (*pattern++) {
442
443 case SRE_OP_FAILURE:
444 /* immediate failure */
445 TRACE(("%8d: failure\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000446 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000447
448 case SRE_OP_SUCCESS:
449 /* end of pattern */
450 TRACE(("%8d: success\n", PTR(ptr)));
451 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000452 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000453
454 case SRE_OP_AT:
455 /* match at given position */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000456 /* args: <at> */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000457 TRACE(("%8d: position %d\n", PTR(ptr), *pattern));
Guido van Rossumb700df92000-03-31 14:59:30 +0000458 if (!SRE_AT(state, ptr, *pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000459 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000460 pattern++;
461 break;
462
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000463 case SRE_OP_CATEGORY:
464 /* match at given category */
465 /* args: <category> */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000466 TRACE(("%8d: category %d [category %d]\n", PTR(ptr),
467 *ptr, *pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000468 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
469 goto failure;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000470 TRACE(("%8d: category ok\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000471 pattern++;
472 ptr++;
473 break;
474
Guido van Rossumb700df92000-03-31 14:59:30 +0000475 case SRE_OP_LITERAL:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000476 /* match literal string */
Guido van Rossumb700df92000-03-31 14:59:30 +0000477 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000478 TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));
479 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000480 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000481 pattern++;
482 ptr++;
483 break;
484
485 case SRE_OP_NOT_LITERAL:
486 /* match anything that is not literal character */
487 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000488 TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));
489 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000490 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000491 pattern++;
492 ptr++;
493 break;
494
495 case SRE_OP_ANY:
496 /* match anything */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000497 TRACE(("%8d: anything\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000498 if (ptr >= end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000499 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000500 ptr++;
501 break;
502
503 case SRE_OP_IN:
504 /* match set member (or non_member) */
505 /* args: <skip> <set> */
506 TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
507 if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000508 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000509 pattern += pattern[0];
510 ptr++;
511 break;
512
513 case SRE_OP_GROUP:
514 /* match backreference */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000515 TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000516 i = pattern[0];
517 {
Guido van Rossumb700df92000-03-31 14:59:30 +0000518 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
519 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
520 if (!p || !e || e < p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000521 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000522 while (p < e) {
523 if (ptr >= end || *ptr != *p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000524 goto failure;
525 p++; ptr++;
526 }
527 }
528 pattern++;
529 break;
530
531 case SRE_OP_GROUP_IGNORE:
532 /* match backreference */
533 TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
534 i = pattern[0];
535 {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000536 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
537 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000538 if (!p || !e || e < p)
539 goto failure;
540 while (p < e) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000541 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000542 state->lower(*ptr) != state->lower(*p))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000543 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000544 p++; ptr++;
545 }
546 }
547 pattern++;
548 break;
549
550 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000551 TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000552 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000553 state->lower(*ptr) != state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000554 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000555 pattern++;
556 ptr++;
557 break;
558
559 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000560 TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000561 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000562 state->lower(*ptr) == state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000563 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000564 pattern++;
565 ptr++;
566 break;
567
568 case SRE_OP_IN_IGNORE:
569 TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
570 if (ptr >= end
Fredrik Lundh0640e112000-06-30 13:55:15 +0000571 || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr)))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000572 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000573 pattern += pattern[0];
574 ptr++;
575 break;
576
577 case SRE_OP_MARK:
578 /* set mark */
579 /* args: <mark> */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000580 TRACE(("%8d: set mark %d\n", PTR(ptr), pattern[0]));
581 if (state->lastmark < pattern[0])
582 state->lastmark = pattern[0];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000583 if (!mark) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000584 mark = mark_copy;
585 memcpy(mark, state->mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000586 }
587 state->mark[pattern[0]] = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000588 pattern++;
589 break;
590
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000591 case SRE_OP_INDEX:
592 /* set index */
593 /* args: <index> */
594 TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
595 state->index = pattern[0];
596 pattern++;
597 break;
598
Guido van Rossumb700df92000-03-31 14:59:30 +0000599 case SRE_OP_JUMP:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000600 case SRE_OP_INFO:
Guido van Rossumb700df92000-03-31 14:59:30 +0000601 /* jump forward */
602 /* args: <skip> */
603 TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
604 pattern += pattern[0];
605 break;
606
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000607 case SRE_OP_ASSERT:
608 /* assert subpattern */
Guido van Rossumb700df92000-03-31 14:59:30 +0000609 /* args: <skip> <pattern> */
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000610 TRACE(("%8d: assert subpattern\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000611 state->ptr = ptr;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000612 i = SRE_MATCH(state, pattern + 1);
613 if (i < 0)
614 return i;
615 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000616 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000617 pattern += pattern[0];
Guido van Rossumb700df92000-03-31 14:59:30 +0000618 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000619
620 case SRE_OP_ASSERT_NOT:
621 /* assert not subpattern */
622 /* args: <skip> <pattern> */
623 TRACE(("%8d: assert not subpattern\n", PTR(ptr)));
624 state->ptr = ptr;
625 i = SRE_MATCH(state, pattern + 1);
626 if (i < 0)
627 return i;
628 if (i)
629 goto failure;
630 pattern += pattern[0];
631 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000632
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000633#if 0
Guido van Rossumb700df92000-03-31 14:59:30 +0000634 case SRE_OP_MAX_REPEAT_ONE:
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000635 /* match repeated sequence (maximizing regexp) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000636
637 /* this operator only works if the repeated item is
638 exactly one character wide, and we're not already
639 collecting backtracking points. for other cases,
640 use the MAX_REPEAT operator instead */
641
Guido van Rossumb700df92000-03-31 14:59:30 +0000642 /* args: <skip> <min> <max> <step> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000643 TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
644 pattern[1], pattern[2]));
645
646 count = 0;
647
648 if (pattern[3] == SRE_OP_ANY) {
649 /* repeated wildcard. skip to the end of the target
650 string, and backtrack from there */
651 /* FIXME: must look for line endings */
652 if (ptr + pattern[1] > end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000653 goto failure; /* cannot match */
Guido van Rossumb700df92000-03-31 14:59:30 +0000654 count = pattern[2];
655 if (count > end - ptr)
656 count = end - ptr;
657 ptr += count;
658
659 } else if (pattern[3] == SRE_OP_LITERAL) {
660 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000661 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000662 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000663 if (ptr >= end || (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000664 break;
665 ptr++;
666 count++;
667 }
668
669 } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
670 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000671 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000672 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000673 if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000674 break;
675 ptr++;
676 count++;
677 }
678
679 } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
680 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000681 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000682 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000683 if (ptr >= end || (SRE_CODE) ptr[0] == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000684 break;
685 ptr++;
686 count++;
687 }
688
689 } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
690 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000691 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000692 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000693 if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000694 break;
695 ptr++;
696 count++;
697 }
698
699 } else if (pattern[3] == SRE_OP_IN) {
700 /* repeated set */
701 while (count < (int) pattern[2]) {
702 if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
703 break;
704 ptr++;
705 count++;
706 }
707
708 } else {
709 /* repeated single character pattern */
710 state->ptr = ptr;
711 while (count < (int) pattern[2]) {
712 i = SRE_MATCH(state, pattern + 3);
713 if (i < 0)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000714 return i;
715 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000716 break;
717 count++;
718 }
719 state->ptr = ptr;
720 ptr += count;
721 }
722
723 /* when we arrive here, count contains the number of
724 matches, and ptr points to the tail of the target
725 string. check if the rest of the pattern matches, and
726 backtrack if not. */
727
Guido van Rossumb700df92000-03-31 14:59:30 +0000728 TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
729
730 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000731 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000732
733 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
734 /* tail is empty. we're finished */
735 TRACE(("%8d: tail is empty\n", PTR(ptr)));
736 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000737 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000738
739 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000740 /* tail starts with a literal. skip positions where
741 the rest of the pattern cannot possibly match */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000742 SRE_CODE chr = pattern[pattern[0]+1];
Guido van Rossumb700df92000-03-31 14:59:30 +0000743 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
744 for (;;) {
745 TRACE(("%8d: scan for tail match\n", PTR(ptr)));
746 while (count >= (int) pattern[1] &&
747 (ptr >= end || *ptr != chr)) {
748 ptr--;
749 count--;
750 }
751 TRACE(("%8d: check tail\n", PTR(ptr)));
752 if (count < (int) pattern[1])
753 break;
754 state->ptr = ptr;
755 i = SRE_MATCH(state, pattern + pattern[0]);
756 if (i > 0) {
757 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000758 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000759 }
760 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
761 ptr--;
762 count--;
763 }
764
765 } else {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000766 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +0000767 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
768 while (count >= (int) pattern[1]) {
769 state->ptr = ptr;
770 i = SRE_MATCH(state, pattern + pattern[0]);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000771 if (i < 0)
772 return i;
773 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000774 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000775 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000776 }
777 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
778 ptr--;
779 count--;
780 }
781 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000782 goto failure;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000783#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000784
785 case SRE_OP_MAX_REPEAT:
786 /* match repeated sequence (maximizing regexp). repeated
787 group should end with a MAX_UNTIL code */
788
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000789 /* args: <skip> <min> <max> <item> */
790
791 TRACE(("%8d: max repeat (%d %d)\n", PTR(ptr),
Guido van Rossumb700df92000-03-31 14:59:30 +0000792 pattern[1], pattern[2]));
793
794 count = 0;
795 state->ptr = ptr;
796
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000797 /* match minimum number of items */
798 while (count < (int) pattern[1]) {
799 i = SRE_MATCH(state, pattern + 3);
800 if (i < 0)
801 return i;
802 if (!i)
803 goto failure;
804 if (state->ptr == ptr) {
805 /* if the match was successful but empty, set the
806 count to max and terminate the scanning loop */
807 count = (int) pattern[2];
808 break;
809 }
810 count++;
811 ptr = state->ptr;
812 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000813
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000814 TRACE(("%8d: found %d leading items\n", PTR(ptr), count));
Guido van Rossumb700df92000-03-31 14:59:30 +0000815
816 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000817 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000818
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000819 /* match maximum number of items, pushing alternate end
820 points to the stack */
Guido van Rossumb700df92000-03-31 14:59:30 +0000821
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000822 while (pattern[2] == 65535 || count < (int) pattern[2]) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000823 state->stackbase = stack;
824 i = SRE_MATCH(state, pattern + 3);
825 state->stackbase = stackbase; /* rewind */
826 if (i < 0)
827 return i;
828 if (!i)
829 break;
830 if (state->ptr == ptr) {
831 count = (int) pattern[2];
832 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000833 }
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000834 /* this position was valid; add it to the retry
835 stack */
836 if (stack >= state->stacksize) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000837 i = stack_extend(state, stack + 1,
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000838 stackbase + pattern[2]);
839 if (i < 0)
840 return i; /* out of memory */
841 }
842 TRACE(("%8d: stack[%d] = %d\n", PTR(ptr), stack, PTR(ptr)));
843 state->stack[stack].ptr = ptr;
844 state->stack[stack].pattern = pattern + pattern[0];
845 stack++;
846 /* move forward */
847 ptr = state->ptr;
848 count++;
Guido van Rossumb700df92000-03-31 14:59:30 +0000849 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000850
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000851 /* when we get here, count is the number of successful
852 matches, and ptr points to the tail. */
Guido van Rossumb700df92000-03-31 14:59:30 +0000853
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000854 TRACE(("%8d: skip +%d\n", PTR(ptr), pattern[0]));
855
856 pattern += pattern[0];
857 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000858
859 case SRE_OP_MIN_REPEAT:
860 /* match repeated sequence (minimizing regexp) */
861 TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
862 pattern[1], pattern[2]));
863 count = 0;
864 state->ptr = ptr;
865 /* match minimum number of items */
866 while (count < (int) pattern[1]) {
867 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000868 if (i < 0)
869 return i;
870 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000871 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000872 count++;
873 }
874 /* move forward until the tail matches. */
875 while (count <= (int) pattern[2]) {
876 ptr = state->ptr;
877 i = SRE_MATCH(state, pattern + pattern[0]);
878 if (i > 0) {
879 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000880 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000881 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000882 state->ptr = ptr; /* backtrack */
883 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000884 if (i < 0)
885 return i;
886 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000887 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000888 count++;
889 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000890 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000891
Guido van Rossumb700df92000-03-31 14:59:30 +0000892 case SRE_OP_BRANCH:
893 /* match one of several subpatterns */
894 /* format: <branch> <size> <head> ... <null> <tail> */
895 TRACE(("%8d: branch\n", PTR(ptr)));
896 while (*pattern) {
897 if (pattern[1] != SRE_OP_LITERAL ||
Fredrik Lundh0640e112000-06-30 13:55:15 +0000898 (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000899 TRACE(("%8d: branch check\n", PTR(ptr)));
900 state->ptr = ptr;
901 i = SRE_MATCH(state, pattern + 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000902 if (i < 0)
903 return i;
904 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000905 TRACE(("%8d: branch succeeded\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000906 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000907 }
908 }
909 pattern += *pattern;
910 }
911 TRACE(("%8d: branch failed\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000912 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000913
914 case SRE_OP_REPEAT:
915 /* TEMPLATE: match repeated sequence (no backtracking) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000916 /* args: <skip> <min> <max> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000917 TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
918 count = 0;
919 state->ptr = ptr;
920 while (count < (int) pattern[2]) {
921 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000922 if (i < 0)
923 return i;
924 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000925 break;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000926 if (state->ptr == ptr) {
927 count = (int) pattern[2];
928 break;
929 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000930 count++;
931 }
932 if (count <= (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000933 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000934 TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
935 pattern += pattern[0];
936 ptr = state->ptr;
937 break;
938
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000939 default:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000940 TRACE(("%8d: unknown opcode %d\n", PTR(ptr), pattern[-1]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000941 return SRE_ERROR_ILLEGAL;
942 }
943 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000944
945 failure:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000946 if (stack-- > stackbase) {
947 ptr = state->stack[stack].ptr;
948 pattern = state->stack[stack].pattern;
949 TRACE(("%8d: retry (%d)\n", PTR(ptr), stack));
950 goto retry;
951 }
952 TRACE(("%8d: leave (failure)\n", PTR(ptr)));
953 state->stackbase = stackbase;
954 state->lastmark = lastmark;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000955 if (mark)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000956 memcpy(state->mark, mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000957 return 0;
958
959 success:
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000960 TRACE(("%8d: leave (success)\n", PTR(ptr)));
961 state->stackbase = stackbase;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000962 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000963}
964
965LOCAL(int)
966SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
967{
968 SRE_CHAR* ptr = state->start;
969 SRE_CHAR* end = state->end;
970 int status = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +0000971 int prefix_len;
972 SRE_CODE* prefix = NULL;
973 SRE_CODE* charset = NULL;
974 SRE_CODE* overlap = NULL;
975 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000976
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000977 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000978 /* optimization info block */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000979 /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
980
981 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000982
983 if (pattern[3] > 0) {
984 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +0000985 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000986 end -= pattern[3]-1;
987 if (end <= ptr)
988 end = ptr+1;
989 }
990
Fredrik Lundh3562f112000-07-02 12:00:07 +0000991 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000992 /* pattern starts with a known prefix */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000993 prefix_len = pattern[5];
994 prefix = pattern + 6;
995 overlap = prefix + prefix_len - 1;
996 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000997 /* pattern starts with a character from a known set */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000998 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000999
1000 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001001 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001002
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001003#if defined(USE_FAST_SEARCH)
Fredrik Lundh3562f112000-07-02 12:00:07 +00001004 if (prefix && overlap && prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001005 /* pattern starts with a known prefix. use the overlap
1006 table to skip forward as fast as we possibly can */
1007 int i = 0;
1008 end = state->end;
1009 while (ptr < end) {
1010 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001011 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001012 if (!i)
1013 break;
1014 else
1015 i = overlap[i];
1016 } else {
1017 if (++i == prefix_len) {
1018 /* found a potential match */
1019 TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
1020 state->start = ptr - prefix_len + 1;
1021 state->ptr = ptr + 1;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001022 if (flags & SRE_INFO_LITERAL)
1023 return 1; /* we got all of it */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001024 status = SRE_MATCH(state, pattern + 2*prefix_len);
1025 if (status != 0)
1026 return status;
1027 /* close but no cigar -- try again */
1028 i = overlap[i];
1029 }
1030 break;
1031 }
1032
1033 }
1034 ptr++;
1035 }
1036 return 0;
1037 }
1038#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001039
Fredrik Lundh3562f112000-07-02 12:00:07 +00001040 if (pattern[0] == SRE_OP_LITERAL) {
1041 /* pattern starts with a literal character. this is used
1042 for short prefixes, and if fast search is disabled */
Fredrik Lundh0640e112000-06-30 13:55:15 +00001043 SRE_CODE chr = pattern[1];
Guido van Rossumb700df92000-03-31 14:59:30 +00001044 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001045 while (ptr < end && (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +00001046 ptr++;
1047 if (ptr == end)
1048 return 0;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001049 TRACE(("%8d: === SEARCH === literal\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001050 state->start = ptr;
1051 state->ptr = ++ptr;
1052 status = SRE_MATCH(state, pattern + 2);
1053 if (status != 0)
1054 break;
1055 }
Fredrik Lundh3562f112000-07-02 12:00:07 +00001056 } else if (charset) {
1057 /* pattern starts with a character from a known set */
1058 for (;;) {
1059 while (ptr < end && !SRE_MEMBER(charset, ptr[0]))
1060 ptr++;
1061 if (ptr == end)
1062 return 0;
1063 TRACE(("%8d: === SEARCH === charset\n", PTR(ptr)));
1064 state->start = ptr;
1065 state->ptr = ptr;
1066 status = SRE_MATCH(state, pattern);
1067 if (status != 0)
1068 break;
1069 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001070 } else
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001071 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +00001072 while (ptr <= end) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001073 TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001074 state->start = state->ptr = ptr++;
1075 status = SRE_MATCH(state, pattern);
1076 if (status != 0)
1077 break;
1078 }
1079
1080 return status;
1081}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001082
Guido van Rossumb700df92000-03-31 14:59:30 +00001083
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001084#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001085
1086/* -------------------------------------------------------------------- */
1087/* factories and destructors */
1088
1089/* see sre.h for object declarations */
1090
1091staticforward PyTypeObject Pattern_Type;
1092staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001093staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001094
1095static PyObject *
1096_compile(PyObject* self_, PyObject* args)
1097{
1098 /* "compile" pattern descriptor to pattern object */
1099
1100 PatternObject* self;
1101
1102 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001103 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001104 PyObject* code;
1105 int groups = 0;
1106 PyObject* groupindex = NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001107 if (!PyArg_ParseTuple(args, "OiO!|iO", &pattern, &flags,
1108 &PyString_Type, &code,
1109 &groups, &groupindex))
Guido van Rossumb700df92000-03-31 14:59:30 +00001110 return NULL;
1111
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001112 self = PyObject_NEW(PatternObject, &Pattern_Type);
Guido van Rossumb700df92000-03-31 14:59:30 +00001113 if (self == NULL)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001114
Guido van Rossumb700df92000-03-31 14:59:30 +00001115 return NULL;
1116
1117 Py_INCREF(pattern);
1118 self->pattern = pattern;
1119
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001120 self->flags = flags;
1121
Guido van Rossumb700df92000-03-31 14:59:30 +00001122 Py_INCREF(code);
1123 self->code = code;
1124
1125 self->groups = groups;
1126
1127 Py_XINCREF(groupindex);
1128 self->groupindex = groupindex;
1129
1130 return (PyObject*) self;
1131}
1132
1133static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001134sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001135{
1136 return Py_BuildValue("i", sizeof(SRE_CODE));
1137}
1138
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001139static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001140sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001141{
1142 int character, flags;
1143 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1144 return NULL;
1145 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001146 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001147#if defined(HAVE_UNICODE)
1148 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001149 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001150#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001151 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001152}
1153
Guido van Rossumb700df92000-03-31 14:59:30 +00001154LOCAL(PyObject*)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001155state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001156{
1157 /* prepare state object */
1158
1159 PyBufferProcs *buffer;
1160 int i, count;
1161 void* ptr;
1162
1163 PyObject* string;
1164 int start = 0;
1165 int end = INT_MAX;
1166 if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
1167 return NULL;
1168
1169 /* get pointer to string buffer */
1170 buffer = string->ob_type->tp_as_buffer;
1171 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1172 buffer->bf_getsegcount(string, NULL) != 1) {
1173 PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
1174 return NULL;
1175 }
1176
1177 /* determine buffer size */
1178 count = buffer->bf_getreadbuffer(string, 0, &ptr);
1179 if (count < 0) {
1180 /* sanity check */
1181 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1182 return NULL;
1183 }
1184
1185 /* determine character size */
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001186#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001187 state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001188#else
1189 state->charsize = 1;
1190#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001191
1192 count /= state->charsize;
1193
1194 /* adjust boundaries */
1195 if (start < 0)
1196 start = 0;
1197 else if (start > count)
1198 start = count;
1199
1200 if (end < 0)
1201 end = 0;
1202 else if (end > count)
1203 end = count;
1204
1205 state->beginning = ptr;
1206
1207 state->start = (void*) ((char*) ptr + start * state->charsize);
1208 state->end = (void*) ((char*) ptr + end * state->charsize);
1209
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001210 state->lastmark = 0;
1211
Guido van Rossumb700df92000-03-31 14:59:30 +00001212 /* FIXME: dynamic! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001213 for (i = 0; i < SRE_MARK_SIZE; i++)
Guido van Rossumb700df92000-03-31 14:59:30 +00001214 state->mark[i] = NULL;
1215
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001216 state->index = -1;
1217
Guido van Rossumb700df92000-03-31 14:59:30 +00001218 state->stack = NULL;
1219 state->stackbase = 0;
1220 state->stacksize = 0;
1221
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001222 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001223 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001224#if defined(HAVE_UNICODE)
1225 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001226 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001227#endif
1228 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001229 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001230
Guido van Rossumb700df92000-03-31 14:59:30 +00001231 return string;
1232}
1233
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001234LOCAL(void)
1235state_fini(SRE_STATE* state)
1236{
1237 stack_free(state);
1238}
1239
1240LOCAL(PyObject*)
1241state_getslice(SRE_STATE* state, int index, PyObject* string)
1242{
1243 index = (index - 1) * 2;
1244
1245 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
1246 Py_INCREF(Py_None);
1247 return Py_None;
1248 }
1249
1250 return PySequence_GetSlice(
1251 string,
1252 ((char*)state->mark[index] - (char*)state->beginning) /
1253 state->charsize,
1254 ((char*)state->mark[index+1] - (char*)state->beginning) /
1255 state->charsize
1256 );
1257}
1258
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001259static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001260pattern_new_match(PatternObject* pattern, SRE_STATE* state,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001261 PyObject* string, int status)
1262{
1263 /* create match object (from state object) */
1264
1265 MatchObject* match;
1266 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001267 char* base;
1268 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001269
1270 if (status > 0) {
1271
1272 /* create match object (with room for extra group marks) */
1273 match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
1274 if (match == NULL)
1275 return NULL;
1276
1277 Py_INCREF(pattern);
1278 match->pattern = pattern;
1279
1280 Py_INCREF(string);
1281 match->string = string;
1282
1283 match->groups = pattern->groups+1;
1284
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001285 base = (char*) state->beginning;
1286 n = state->charsize;
1287
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001288 /* group zero */
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001289 match->mark[0] = ((char*) state->start - base) / n;
1290 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001291
1292 /* fill in the rest of the groups */
1293 for (i = j = 0; i < pattern->groups; i++, j+=2)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001294 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1295 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1296 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001297 } else
1298 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1299
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001300 match->index = state->index;
1301
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001302 return (PyObject*) match;
1303
1304 } else if (status < 0) {
1305
1306 /* internal error */
1307 PyErr_SetString(
1308 PyExc_RuntimeError, "internal error in regular expression engine"
1309 );
1310 return NULL;
1311
1312 }
1313
1314 Py_INCREF(Py_None);
1315 return Py_None;
1316}
1317
1318static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001319pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001320{
1321 /* create search state object */
1322
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001323 ScannerObject* self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001324 PyObject* string;
1325
1326 /* create match object (with room for extra group marks) */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001327 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001328 if (self == NULL)
1329 return NULL;
1330
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001331 string = state_init(&self->state, pattern, args);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001332 if (!string) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001333 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001334 return NULL;
1335 }
1336
1337 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001338 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001339
1340 Py_INCREF(string);
1341 self->string = string;
1342
1343 return (PyObject*) self;
1344}
1345
Guido van Rossumb700df92000-03-31 14:59:30 +00001346static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001347pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001348{
1349 Py_XDECREF(self->code);
1350 Py_XDECREF(self->pattern);
1351 Py_XDECREF(self->groupindex);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001352 PyMem_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001353}
1354
1355static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001356pattern_match(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001357{
1358 SRE_STATE state;
1359 PyObject* string;
1360 int status;
1361
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001362 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001363 if (!string)
1364 return NULL;
1365
1366 state.ptr = state.start;
1367
1368 if (state.charsize == 1) {
1369 status = sre_match(&state, PatternObject_GetCode(self));
1370 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001371#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001372 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001373#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001374 }
1375
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001376 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001377
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001378 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001379}
1380
1381static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001382pattern_search(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001383{
1384 SRE_STATE state;
1385 PyObject* string;
1386 int status;
1387
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001388 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001389 if (!string)
1390 return NULL;
1391
1392 if (state.charsize == 1) {
1393 status = sre_search(&state, PatternObject_GetCode(self));
1394 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001395#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001396 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001397#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001398 }
1399
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001400 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001401
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001402 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001403}
1404
1405static PyObject*
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001406call(char* function, PyObject* args)
1407{
1408 PyObject* name;
1409 PyObject* module;
1410 PyObject* func;
1411 PyObject* result;
1412
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001413 name = PyString_FromString(MODULE);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001414 if (!name)
1415 return NULL;
1416 module = PyImport_Import(name);
1417 Py_DECREF(name);
1418 if (!module)
1419 return NULL;
1420 func = PyObject_GetAttrString(module, function);
1421 Py_DECREF(module);
1422 if (!func)
1423 return NULL;
1424 result = PyObject_CallObject(func, args);
1425 Py_DECREF(func);
1426 Py_DECREF(args);
1427 return result;
1428}
1429
1430static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001431pattern_sub(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001432{
1433 PyObject* template;
1434 PyObject* string;
1435 PyObject* count;
1436 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1437 return NULL;
1438
1439 /* delegate to Python code */
1440 return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
1441}
1442
1443static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001444pattern_subn(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001445{
1446 PyObject* template;
1447 PyObject* string;
1448 PyObject* count;
1449 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1450 return NULL;
1451
1452 /* delegate to Python code */
1453 return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
1454}
1455
1456static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001457pattern_split(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001458{
1459 PyObject* string;
1460 PyObject* maxsplit;
1461 if (!PyArg_ParseTuple(args, "OO", &string, &maxsplit))
1462 return NULL;
1463
1464 /* delegate to Python code */
1465 return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
1466}
1467
1468static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001469pattern_findall(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001470{
Guido van Rossumb700df92000-03-31 14:59:30 +00001471 SRE_STATE state;
1472 PyObject* string;
1473 PyObject* list;
1474 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001475 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001476
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001477 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001478 if (!string)
1479 return NULL;
1480
1481 list = PyList_New(0);
1482
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001483 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001484
1485 PyObject* item;
1486
1487 state.ptr = state.start;
1488
1489 if (state.charsize == 1) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001490 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +00001491 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001492#if defined(HAVE_UNICODE)
1493 status = sre_usearch(&state, PatternObject_GetCode(self));
1494#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001495 }
1496
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001497 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001498
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001499 /* don't bother to build a match object */
1500 switch (self->groups) {
1501 case 0:
1502 item = PySequence_GetSlice(
1503 string,
1504 ((char*) state.start - (char*) state.beginning) /
1505 state.charsize,
1506 ((char*) state.ptr - (char*) state.beginning) /
1507 state.charsize);
1508 if (!item)
1509 goto error;
1510 break;
1511 case 1:
1512 item = state_getslice(&state, 1, string);
1513 if (!item)
1514 goto error;
1515 break;
1516 default:
1517 item = PyTuple_New(self->groups);
1518 if (!item)
1519 goto error;
1520 for (i = 0; i < self->groups; i++) {
1521 PyObject* o = state_getslice(&state, i+1, string);
1522 if (!o) {
1523 Py_DECREF(item);
1524 goto error;
1525 }
1526 PyTuple_SET_ITEM(item, i, o);
1527 }
1528 break;
1529 }
1530
1531 if (PyList_Append(list, item) < 0) {
1532 Py_DECREF(item);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001533 goto error;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001534 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001535
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001536 if (state.ptr == state.start)
1537 state.start = (void*) ((char*) state.ptr + state.charsize);
1538 else
1539 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001540
1541 } else {
1542
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001543 if (status == 0)
1544 break;
1545
Guido van Rossumb700df92000-03-31 14:59:30 +00001546 /* internal error */
1547 PyErr_SetString(
1548 PyExc_RuntimeError,
1549 "internal error in regular expression engine"
1550 );
1551 goto error;
1552
1553 }
1554 }
1555
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001556 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001557 return list;
1558
1559error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001560 Py_DECREF(list);
1561 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001562 return NULL;
1563
1564}
1565
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001566static PyMethodDef pattern_methods[] = {
1567 {"match", (PyCFunction) pattern_match, 1},
1568 {"search", (PyCFunction) pattern_search, 1},
1569 {"sub", (PyCFunction) pattern_sub, 1},
1570 {"subn", (PyCFunction) pattern_subn, 1},
1571 {"split", (PyCFunction) pattern_split, 1},
1572 {"findall", (PyCFunction) pattern_findall, 1},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001573 /* experimental */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001574 {"scanner", (PyCFunction) pattern_scanner, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001575 {NULL, NULL}
1576};
1577
1578static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001579pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001580{
1581 PyObject* res;
1582
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001583 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001584
1585 if (res)
1586 return res;
1587
1588 PyErr_Clear();
1589
1590 /* attributes */
1591 if (!strcmp(name, "pattern")) {
1592 Py_INCREF(self->pattern);
1593 return self->pattern;
1594 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001595
1596 if (!strcmp(name, "flags"))
1597 return Py_BuildValue("i", self->flags);
1598
Fredrik Lundh01016fe2000-06-30 00:27:46 +00001599 if (!strcmp(name, "groups"))
1600 return Py_BuildValue("i", self->groups);
1601
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001602 if (!strcmp(name, "groupindex") && self->groupindex) {
1603 Py_INCREF(self->groupindex);
1604 return self->groupindex;
1605 }
1606
Guido van Rossumb700df92000-03-31 14:59:30 +00001607 PyErr_SetString(PyExc_AttributeError, name);
1608 return NULL;
1609}
1610
1611statichere PyTypeObject Pattern_Type = {
1612 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001613 0, "SRE_Pattern", sizeof(PatternObject), 0,
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001614 (destructor)pattern_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001615 0, /*tp_print*/
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001616 (getattrfunc)pattern_getattr, /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001617};
1618
1619/* -------------------------------------------------------------------- */
1620/* match methods */
1621
1622static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001623match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001624{
1625 Py_XDECREF(self->string);
1626 Py_DECREF(self->pattern);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001627 PyMem_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001628}
1629
1630static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001631match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001632{
1633 if (index < 0 || index >= self->groups) {
1634 /* raise IndexError if we were given a bad group number */
1635 PyErr_SetString(
1636 PyExc_IndexError,
1637 "no such group"
1638 );
1639 return NULL;
1640 }
1641
1642 if (self->string == Py_None || self->mark[index+index] < 0) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001643 /* return default value if the string or group is undefined */
1644 Py_INCREF(def);
1645 return def;
Guido van Rossumb700df92000-03-31 14:59:30 +00001646 }
1647
1648 return PySequence_GetSlice(
1649 self->string, self->mark[index+index], self->mark[index+index+1]
1650 );
1651}
1652
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001653static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001654match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001655{
1656 if (!PyInt_Check(index) && self->pattern->groupindex != NULL) {
1657 /* FIXME: resource leak? */
1658 index = PyObject_GetItem(self->pattern->groupindex, index);
1659 if (!index)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001660 return -1;
Guido van Rossumb700df92000-03-31 14:59:30 +00001661 }
1662
1663 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001664 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001665
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001666 return -1;
1667}
1668
1669static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001670match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001671{
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001672 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001673}
1674
1675static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001676match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001677{
1678 PyObject* result;
1679 int i, size;
1680
1681 size = PyTuple_GET_SIZE(args);
1682
1683 switch (size) {
1684 case 0:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001685 result = match_getslice(self, Py_False, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001686 break;
1687 case 1:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001688 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001689 break;
1690 default:
1691 /* fetch multiple items */
1692 result = PyTuple_New(size);
1693 if (!result)
1694 return NULL;
1695 for (i = 0; i < size; i++) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001696 PyObject* item = match_getslice(
1697 self, PyTuple_GET_ITEM(args, i), Py_None
1698 );
Guido van Rossumb700df92000-03-31 14:59:30 +00001699 if (!item) {
1700 Py_DECREF(result);
1701 return NULL;
1702 }
1703 PyTuple_SET_ITEM(result, i, item);
1704 }
1705 break;
1706 }
1707 return result;
1708}
1709
1710static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001711match_groups(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001712{
1713 PyObject* result;
1714 int index;
1715
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001716 PyObject* def = Py_None;
1717 if (!PyArg_ParseTuple(args, "|O", &def))
1718 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001719
Guido van Rossumb700df92000-03-31 14:59:30 +00001720 result = PyTuple_New(self->groups-1);
1721 if (!result)
1722 return NULL;
1723
1724 for (index = 1; index < self->groups; index++) {
1725 PyObject* item;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001726 item = match_getslice_by_index(self, index, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001727 if (!item) {
1728 Py_DECREF(result);
1729 return NULL;
1730 }
1731 PyTuple_SET_ITEM(result, index-1, item);
1732 }
1733
1734 return result;
1735}
1736
1737static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001738match_groupdict(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001739{
1740 PyObject* result;
1741 PyObject* keys;
1742 int index;
1743
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001744 PyObject* def = Py_None;
1745 if (!PyArg_ParseTuple(args, "|O", &def))
1746 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001747
Guido van Rossumb700df92000-03-31 14:59:30 +00001748 result = PyDict_New();
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001749 if (!result || !self->pattern->groupindex)
Guido van Rossumb700df92000-03-31 14:59:30 +00001750 return result;
1751
1752 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001753 if (!keys) {
1754 Py_DECREF(result);
Guido van Rossumb700df92000-03-31 14:59:30 +00001755 return NULL;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001756 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001757
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001758 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001759 PyObject* key;
1760 PyObject* item;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001761 key = PyList_GET_ITEM(keys, index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001762 if (!key) {
1763 Py_DECREF(keys);
1764 Py_DECREF(result);
1765 return NULL;
1766 }
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001767 item = match_getslice(self, key, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001768 if (!item) {
1769 Py_DECREF(key);
1770 Py_DECREF(keys);
1771 Py_DECREF(result);
1772 return NULL;
1773 }
1774 /* FIXME: <fl> this can fail, right? */
1775 PyDict_SetItem(result, key, item);
1776 }
1777
1778 Py_DECREF(keys);
1779
1780 return result;
1781}
1782
1783static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001784match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001785{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001786 int index;
1787
1788 PyObject* index_ = Py_False;
1789 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001790 return NULL;
1791
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001792 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001793
Guido van Rossumb700df92000-03-31 14:59:30 +00001794 if (index < 0 || index >= self->groups) {
1795 PyErr_SetString(
1796 PyExc_IndexError,
1797 "no such group"
1798 );
1799 return NULL;
1800 }
1801
1802 if (self->mark[index*2] < 0) {
1803 Py_INCREF(Py_None);
1804 return Py_None;
1805 }
1806
1807 return Py_BuildValue("i", self->mark[index*2]);
1808}
1809
1810static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001811match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001812{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001813 int index;
1814
1815 PyObject* index_ = Py_False;
1816 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001817 return NULL;
1818
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001819 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001820
Guido van Rossumb700df92000-03-31 14:59:30 +00001821 if (index < 0 || index >= self->groups) {
1822 PyErr_SetString(
1823 PyExc_IndexError,
1824 "no such group"
1825 );
1826 return NULL;
1827 }
1828
1829 if (self->mark[index*2] < 0) {
1830 Py_INCREF(Py_None);
1831 return Py_None;
1832 }
1833
1834 return Py_BuildValue("i", self->mark[index*2+1]);
1835}
1836
1837static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001838match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001839{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001840 int index;
1841
1842 PyObject* index_ = Py_False;
1843 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001844 return NULL;
1845
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001846 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001847
Guido van Rossumb700df92000-03-31 14:59:30 +00001848 if (index < 0 || index >= self->groups) {
1849 PyErr_SetString(
1850 PyExc_IndexError,
1851 "no such group"
1852 );
1853 return NULL;
1854 }
1855
1856 if (self->mark[index*2] < 0) {
1857 Py_INCREF(Py_None);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001858 Py_INCREF(Py_None);
1859 return Py_BuildValue("OO", Py_None, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001860 }
1861
1862 return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
1863}
1864
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001865static PyMethodDef match_methods[] = {
1866 {"group", (PyCFunction) match_group, 1},
1867 {"start", (PyCFunction) match_start, 1},
1868 {"end", (PyCFunction) match_end, 1},
1869 {"span", (PyCFunction) match_span, 1},
1870 {"groups", (PyCFunction) match_groups, 1},
1871 {"groupdict", (PyCFunction) match_groupdict, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001872 {NULL, NULL}
1873};
1874
1875static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001876match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001877{
1878 PyObject* res;
1879
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001880 res = Py_FindMethod(match_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001881 if (res)
1882 return res;
1883
1884 PyErr_Clear();
1885
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001886 /* attributes */
Guido van Rossumb700df92000-03-31 14:59:30 +00001887 if (!strcmp(name, "string")) {
1888 Py_INCREF(self->string);
1889 return self->string;
1890 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001891
Guido van Rossumb700df92000-03-31 14:59:30 +00001892 if (!strcmp(name, "re")) {
1893 Py_INCREF(self->pattern);
1894 return (PyObject*) self->pattern;
1895 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001896
Guido van Rossumb700df92000-03-31 14:59:30 +00001897 if (!strcmp(name, "pos"))
1898 return Py_BuildValue("i", 0); /* FIXME */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001899
Guido van Rossumb700df92000-03-31 14:59:30 +00001900 if (!strcmp(name, "endpos"))
1901 return Py_BuildValue("i", 0); /* FIXME */
1902
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001903 if (!strcmp(name, "index")) {
1904 /* experimental */
1905 if (self->index < 0) {
1906 Py_INCREF(Py_None);
1907 return Py_None;
1908 } else
1909 return Py_BuildValue("i", self->index);
1910 }
1911
Guido van Rossumb700df92000-03-31 14:59:30 +00001912 PyErr_SetString(PyExc_AttributeError, name);
1913 return NULL;
1914}
1915
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
1918
1919statichere PyTypeObject Match_Type = {
1920 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001921 0, "SRE_Match",
Guido van Rossumb700df92000-03-31 14:59:30 +00001922 sizeof(MatchObject), /* size of basic object */
1923 sizeof(int), /* space for group item */
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001924 (destructor)match_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001925 0, /*tp_print*/
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001926 (getattrfunc)match_getattr, /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001927};
1928
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001929/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001930/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001931
1932static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001933scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001934{
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001935 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001936 Py_DECREF(self->string);
1937 Py_DECREF(self->pattern);
1938 PyMem_DEL(self);
1939}
1940
1941static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001942scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001943{
1944 SRE_STATE* state = &self->state;
1945 PyObject* match;
1946 int status;
1947
1948 state->ptr = state->start;
1949
1950 if (state->charsize == 1) {
1951 status = sre_match(state, PatternObject_GetCode(self->pattern));
1952 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001953#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001954 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001955#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001956 }
1957
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001958 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001959 state, self->string, status);
1960
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001961 if (status == 0 || state->ptr == state->start)
1962 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001963 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001964 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001965
1966 return match;
1967}
1968
1969
1970static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001971scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001972{
1973 SRE_STATE* state = &self->state;
1974 PyObject* match;
1975 int status;
1976
1977 state->ptr = state->start;
1978
1979 if (state->charsize == 1) {
1980 status = sre_search(state, PatternObject_GetCode(self->pattern));
1981 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001982#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001983 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001984#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001985 }
1986
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001987 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001988 state, self->string, status);
1989
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001990 if (status == 0 || state->ptr == state->start)
1991 state->start = (void*) ((char*) state->ptr + state->charsize);
1992 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001993 state->start = state->ptr;
1994
1995 return match;
1996}
1997
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001998static PyMethodDef scanner_methods[] = {
1999 {"match", (PyCFunction) scanner_match, 0},
2000 {"search", (PyCFunction) scanner_search, 0},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002001 {NULL, NULL}
2002};
2003
2004static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002005scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002006{
2007 PyObject* res;
2008
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002009 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002010 if (res)
2011 return res;
2012
2013 PyErr_Clear();
2014
2015 /* attributes */
2016 if (!strcmp(name, "pattern")) {
2017 Py_INCREF(self->pattern);
2018 return self->pattern;
2019 }
2020
2021 PyErr_SetString(PyExc_AttributeError, name);
2022 return NULL;
2023}
2024
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002025statichere PyTypeObject Scanner_Type = {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002026 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002027 0, "SRE_Scanner",
2028 sizeof(ScannerObject), /* size of basic object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002029 0,
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002030 (destructor)scanner_dealloc, /*tp_dealloc*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002031 0, /*tp_print*/
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002032 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002033};
2034
Guido van Rossumb700df92000-03-31 14:59:30 +00002035static PyMethodDef _functions[] = {
2036 {"compile", _compile, 1},
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002037 {"getcodesize", sre_codesize, 1},
Fredrik Lundhb389df32000-06-29 12:48:37 +00002038 {"getlower", sre_getlower, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00002039 {NULL, NULL}
2040};
2041
2042void
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002043#if defined(WIN32)
Guido van Rossumb700df92000-03-31 14:59:30 +00002044__declspec(dllexport)
2045#endif
2046init_sre()
2047{
2048 /* Patch object types */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002049 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002050 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002051
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002052 Py_InitModule("_" MODULE, _functions);
Guido van Rossumb700df92000-03-31 14:59:30 +00002053}
2054
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002055#endif /* !defined(SRE_RECURSIVE) */