blob: 764e155f974e6e570cdd9b3cf188862bdb2fe22b [file] [log] [blame]
Guido van Rossumb700df92000-03-31 14:59:30 +00001/* -*- Mode: C; tab-width: 4 -*-
2 *
3 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00004 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00005 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00006 *
7 * partial history:
Fredrik Lundh436c3d52000-06-29 08:58:44 +00008 * 99-10-24 fl created (based on existing template matcher code)
Guido van Rossumb700df92000-03-31 14:59:30 +00009 * 99-11-13 fl added categories, branching, and more (0.2)
10 * 99-11-16 fl some tweaks to compile on non-Windows platforms
11 * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
Fredrik Lundh436c3d52000-06-29 08:58:44 +000012 * 00-02-28 fl tons of changes (not all to the better ;-) (0.4)
13 * 00-03-06 fl first alpha, sort of (0.5)
14 * 00-03-14 fl removed most compatibility stuff (0.6)
15 * 00-05-10 fl towards third alpha (0.8.2)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000016 * 00-05-13 fl added experimental scanner stuff (0.8.3)
Fredrik Lundh436c3d52000-06-29 08:58:44 +000017 * 00-05-27 fl final bug hunt (0.8.4)
18 * 00-06-21 fl less bugs, more taste (0.8.5)
19 * 00-06-25 fl major changes to better deal with nested repeats (0.9)
20 * 00-06-28 fl fixed findall (0.9.1)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000021 * 00-06-29 fl fixed split, added more scanner features (0.9.2)
Fredrik Lundhc13222c2000-07-01 23:49:14 +000022 * 00-06-30 fl added fast search optimization (0.9.3)
Fredrik Lundh0640e112000-06-30 13:55:15 +000023 * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +000024 * 00-07-02 fl added charset optimizations, etc (0.9.5)
Fredrik Lundh6f013982000-07-03 18:44:21 +000025 * 00-07-03 fl store code in pattern object, lookbehind, etc
Guido van Rossumb700df92000-03-31 14:59:30 +000026 *
27 * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
28 *
Guido van Rossumb700df92000-03-31 14:59:30 +000029 * Portions of this engine have been developed in cooperation with
Fredrik Lundh22d25462000-07-01 17:50:59 +000030 * CNRI. Hewlett-Packard provided funding for 2.0 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000031 * other compatibility work.
32 */
33
34#ifndef SRE_RECURSIVE
35
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +000036char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000037
38#include "Python.h"
39
40#include "sre.h"
41
Guido van Rossumb700df92000-03-31 14:59:30 +000042#if defined(HAVE_LIMITS_H)
43#include <limits.h>
44#else
45#define INT_MAX 2147483647
46#endif
47
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000048#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000049
Fredrik Lundh436c3d52000-06-29 08:58:44 +000050/* name of this module, minus the leading underscore */
51#define MODULE "sre"
52
Guido van Rossumb700df92000-03-31 14:59:30 +000053/* defining this one enables tracing */
54#undef DEBUG
55
Fredrik Lundh436c3d52000-06-29 08:58:44 +000056#if PY_VERSION_HEX >= 0x01060000
Fredrik Lundh22d25462000-07-01 17:50:59 +000057/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000058#define HAVE_UNICODE
59#endif
60
Fredrik Lundh29c08be2000-06-29 23:33:12 +000061/* optional features */
62#define USE_FAST_SEARCH
63
Fredrik Lundh80946112000-06-29 18:03:25 +000064#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000065#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
66/* fastest possible local call under MSVC */
67#define LOCAL(type) static __inline type __fastcall
68#else
Fredrik Lundh29c08be2000-06-29 23:33:12 +000069#define LOCAL(type) static inline type
Guido van Rossumb700df92000-03-31 14:59:30 +000070#endif
71
72/* error codes */
73#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
74#define SRE_ERROR_MEMORY -9 /* out of memory */
75
Fredrik Lundh436c3d52000-06-29 08:58:44 +000076#if defined(DEBUG)
Guido van Rossumb700df92000-03-31 14:59:30 +000077#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000078#else
79#define TRACE(v)
80#endif
81
Fredrik Lundh436c3d52000-06-29 08:58:44 +000082#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
Guido van Rossumb700df92000-03-31 14:59:30 +000083
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000084/* -------------------------------------------------------------------- */
85/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000086
Fredrik Lundh436c3d52000-06-29 08:58:44 +000087/* default character predicates (run sre_chars.py to regenerate tables) */
88
89#define SRE_DIGIT_MASK 1
90#define SRE_SPACE_MASK 2
91#define SRE_LINEBREAK_MASK 4
92#define SRE_ALNUM_MASK 8
93#define SRE_WORD_MASK 16
94
95static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
962, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
970, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
9825, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
9924, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1000, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
10124, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
102
Fredrik Lundhb389df32000-06-29 12:48:37 +0000103static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d52000-06-29 08:58:44 +000010410, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
10527, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
10644, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
10761, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
109122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
110106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
111120, 121, 122, 123, 124, 125, 126, 127 };
112
Fredrik Lundhb389df32000-06-29 12:48:37 +0000113static unsigned int sre_lower(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000114{
Fredrik Lundhb389df32000-06-29 12:48:37 +0000115 return ((ch) < 128 ? sre_char_lower[ch] : ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000116}
117
118#define SRE_IS_DIGIT(ch)\
119 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
120#define SRE_IS_SPACE(ch)\
121 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
122#define SRE_IS_LINEBREAK(ch)\
123 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
124#define SRE_IS_ALNUM(ch)\
125 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
126#define SRE_IS_WORD(ch)\
127 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000128
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000129/* locale-specific character predicates */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000130
Fredrik Lundhb389df32000-06-29 12:48:37 +0000131static unsigned int sre_lower_locale(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000132{
133 return ((ch) < 256 ? tolower((ch)) : ch);
134}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000135#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
136#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
137#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
138#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
139#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
140
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000141/* unicode-specific character predicates */
142
143#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000144static unsigned int sre_lower_unicode(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000145{
146 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
147}
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000148#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
149#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
150#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000151#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000152#define SRE_UNI_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
153#endif
154
Guido van Rossumb700df92000-03-31 14:59:30 +0000155LOCAL(int)
156sre_category(SRE_CODE category, unsigned int ch)
157{
158 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000159
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000160 case SRE_CATEGORY_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000161 return SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000162 case SRE_CATEGORY_NOT_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000163 return !SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000164 case SRE_CATEGORY_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000165 return SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000166 case SRE_CATEGORY_NOT_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000167 return !SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000168 case SRE_CATEGORY_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000169 return SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000170 case SRE_CATEGORY_NOT_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000171 return !SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000172 case SRE_CATEGORY_LINEBREAK:
173 return SRE_IS_LINEBREAK(ch);
174 case SRE_CATEGORY_NOT_LINEBREAK:
175 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000176
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000177 case SRE_CATEGORY_LOC_WORD:
178 return SRE_LOC_IS_WORD(ch);
179 case SRE_CATEGORY_LOC_NOT_WORD:
180 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000181
182#if defined(HAVE_UNICODE)
183 case SRE_CATEGORY_UNI_DIGIT:
184 return SRE_UNI_IS_DIGIT(ch);
185 case SRE_CATEGORY_UNI_NOT_DIGIT:
186 return !SRE_UNI_IS_DIGIT(ch);
187 case SRE_CATEGORY_UNI_SPACE:
188 return SRE_UNI_IS_SPACE(ch);
189 case SRE_CATEGORY_UNI_NOT_SPACE:
190 return !SRE_UNI_IS_SPACE(ch);
191 case SRE_CATEGORY_UNI_WORD:
192 return SRE_UNI_IS_WORD(ch);
193 case SRE_CATEGORY_UNI_NOT_WORD:
194 return !SRE_UNI_IS_WORD(ch);
195 case SRE_CATEGORY_UNI_LINEBREAK:
196 return SRE_UNI_IS_LINEBREAK(ch);
197 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
198 return !SRE_UNI_IS_LINEBREAK(ch);
199#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000200 }
201 return 0;
202}
203
204/* helpers */
205
206LOCAL(int)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000207stack_free(SRE_STATE* state)
Guido van Rossumb700df92000-03-31 14:59:30 +0000208{
209 if (state->stack) {
210 TRACE(("release stack\n"));
211 free(state->stack);
212 state->stack = NULL;
213 }
214 state->stacksize = 0;
215 return 0;
216}
217
218static int /* shouldn't be LOCAL */
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000219stack_extend(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000220{
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000221 SRE_STACK* stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000222 int stacksize;
223
224 /* grow the stack to a suitable size; we need at least lo entries,
225 at most hi entries. if for some reason hi is lower than lo, lo
226 wins */
227
228 stacksize = state->stacksize;
229
230 if (stacksize == 0) {
231 /* create new stack */
232 stacksize = 512;
233 if (stacksize < lo)
234 stacksize = lo;
235 else if (stacksize > hi)
236 stacksize = hi;
237 TRACE(("allocate stack %d\n", stacksize));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000238 stack = malloc(sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000239 } else {
240 /* grow the stack (typically by a factor of two) */
241 while (stacksize < lo)
242 stacksize = 2 * stacksize;
243 /* FIXME: <fl> could trim size if it's larger than lo, and
244 much larger than hi */
245 TRACE(("grow stack to %d\n", stacksize));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000246 stack = realloc(state->stack, sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000247 }
248
249 if (!stack) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000250 stack_free(state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000251 return SRE_ERROR_MEMORY;
252 }
253
254 state->stack = stack;
255 state->stacksize = stacksize;
256
257 return 0;
258}
259
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000260/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000261
262#define SRE_CHAR unsigned char
263#define SRE_AT sre_at
264#define SRE_MEMBER sre_member
265#define SRE_MATCH sre_match
266#define SRE_SEARCH sre_search
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000267
268#if defined(HAVE_UNICODE)
269
Guido van Rossumb700df92000-03-31 14:59:30 +0000270#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000271#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000272#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000273
Guido van Rossumb700df92000-03-31 14:59:30 +0000274#undef SRE_SEARCH
275#undef SRE_MATCH
276#undef SRE_MEMBER
277#undef SRE_AT
278#undef SRE_CHAR
279
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000280/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000281
282#define SRE_CHAR Py_UNICODE
283#define SRE_AT sre_uat
284#define SRE_MEMBER sre_umember
285#define SRE_MATCH sre_umatch
286#define SRE_SEARCH sre_usearch
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000287#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000288
289#endif /* SRE_RECURSIVE */
290
291/* -------------------------------------------------------------------- */
292/* String matching engine */
293
294/* the following section is compiled twice, with different character
295 settings */
296
297LOCAL(int)
298SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
299{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000300 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000301
302 int this, that;
303
304 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000305
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000306 case SRE_AT_BEGINNING:
Guido van Rossum29530882000-04-10 17:06:55 +0000307 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000308
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000309 case SRE_AT_BEGINNING_LINE:
310 return ((void*) ptr == state->beginning ||
311 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000312
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000313 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000314 return (((void*) (ptr+1) == state->end &&
315 SRE_IS_LINEBREAK((int) ptr[0])) ||
316 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000317
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000318 case SRE_AT_END_LINE:
319 return ((void*) ptr == state->end ||
320 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000321
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000322 case SRE_AT_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000323 if (state->beginning == state->end)
324 return 0;
325 that = ((void*) ptr > state->beginning) ?
326 SRE_IS_WORD((int) ptr[-1]) : 0;
327 this = ((void*) ptr < state->end) ?
328 SRE_IS_WORD((int) ptr[0]) : 0;
329 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000330
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000331 case SRE_AT_NON_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000332 if (state->beginning == state->end)
333 return 0;
334 that = ((void*) ptr > state->beginning) ?
335 SRE_IS_WORD((int) ptr[-1]) : 0;
336 this = ((void*) ptr < state->end) ?
337 SRE_IS_WORD((int) ptr[0]) : 0;
338 return this == that;
339 }
340
341 return 0;
342}
343
344LOCAL(int)
Fredrik Lundh0640e112000-06-30 13:55:15 +0000345SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000346{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000347 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000348
349 int ok = 1;
350
351 for (;;) {
352 switch (*set++) {
353
354 case SRE_OP_NEGATE:
355 ok = !ok;
356 break;
357
358 case SRE_OP_FAILURE:
359 return !ok;
360
361 case SRE_OP_LITERAL:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000362 /* args: <literal> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000363 if (ch == set[0])
Guido van Rossumb700df92000-03-31 14:59:30 +0000364 return ok;
365 set++;
366 break;
367
368 case SRE_OP_RANGE:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000369 /* args: <lower> <upper> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000370 if (set[0] <= ch && ch <= set[1])
Guido van Rossumb700df92000-03-31 14:59:30 +0000371 return ok;
372 set += 2;
373 break;
374
Fredrik Lundh3562f112000-07-02 12:00:07 +0000375 case SRE_OP_CHARSET:
376 /* args: <bitmap> (16 bits per code word) */
377 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
378 return ok;
379 set += 16;
380 break;
381
Guido van Rossumb700df92000-03-31 14:59:30 +0000382 case SRE_OP_CATEGORY:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000383 /* args: <category> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000384 if (sre_category(set[0], (int) ch))
385 return ok;
386 set += 1;
387 break;
388
389 default:
Fredrik Lundh80946112000-06-29 18:03:25 +0000390 /* internal error -- there's not much we can do about it
391 here, so let's just pretend it didn't match... */
Guido van Rossumb700df92000-03-31 14:59:30 +0000392 return 0;
393 }
394 }
395}
396
397LOCAL(int)
398SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
399{
400 /* check if string matches the given pattern. returns -1 for
401 error, 0 for failure, and 1 for success */
402
403 SRE_CHAR* end = state->end;
404 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000405 int stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000406 int stackbase;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000407 int lastmark;
Guido van Rossumb700df92000-03-31 14:59:30 +0000408 int i, count;
409
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000410 /* FIXME: this is a hack! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000411 void* mark_copy[SRE_MARK_SIZE];
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000412 void* mark = NULL;
413
414 TRACE(("%8d: enter\n", PTR(ptr)));
415
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000416 if (pattern[0] == SRE_OP_INFO) {
417 /* optimization info block */
418 /* args: <1=skip> <2=flags> <3=min> ... */
419 if (pattern[3] && (end - ptr) < pattern[3]) {
420 TRACE(("reject (got %d chars, need %d)\n",
421 (end - ptr), pattern[3]));
422 return 0;
423 }
424 pattern += pattern[1] + 1;
425 }
426
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000427 stackbase = stack = state->stackbase;
428 lastmark = state->lastmark;
429
430 retry:
Guido van Rossumb700df92000-03-31 14:59:30 +0000431
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000432 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000433
434 switch (*pattern++) {
435
436 case SRE_OP_FAILURE:
437 /* immediate failure */
438 TRACE(("%8d: failure\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000439 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000440
441 case SRE_OP_SUCCESS:
442 /* end of pattern */
443 TRACE(("%8d: success\n", PTR(ptr)));
444 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000445 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000446
447 case SRE_OP_AT:
448 /* match at given position */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000449 /* args: <at> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000450 TRACE(("%8d: position %d\n", PTR(ptr), *pattern));
Guido van Rossumb700df92000-03-31 14:59:30 +0000451 if (!SRE_AT(state, ptr, *pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000452 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000453 pattern++;
454 break;
455
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000456 case SRE_OP_CATEGORY:
457 /* match at given category */
458 /* args: <category> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000459 TRACE(("%8d: category %d [category %d]\n", PTR(ptr),
460 *ptr, *pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000461 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
462 goto failure;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000463 TRACE(("%8d: category ok\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000464 pattern++;
465 ptr++;
466 break;
467
Guido van Rossumb700df92000-03-31 14:59:30 +0000468 case SRE_OP_LITERAL:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000469 /* match literal string */
Guido van Rossumb700df92000-03-31 14:59:30 +0000470 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000471 TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));
472 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000473 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000474 pattern++;
475 ptr++;
476 break;
477
478 case SRE_OP_NOT_LITERAL:
479 /* match anything that is not literal character */
480 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000481 TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));
482 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000483 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000484 pattern++;
485 ptr++;
486 break;
487
488 case SRE_OP_ANY:
489 /* match anything */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000490 TRACE(("%8d: anything\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000491 if (ptr >= end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000492 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000493 ptr++;
494 break;
495
496 case SRE_OP_IN:
497 /* match set member (or non_member) */
498 /* args: <skip> <set> */
499 TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
500 if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000501 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000502 pattern += pattern[0];
503 ptr++;
504 break;
505
506 case SRE_OP_GROUP:
507 /* match backreference */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000508 TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000509 i = pattern[0];
510 {
Guido van Rossumb700df92000-03-31 14:59:30 +0000511 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
512 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
513 if (!p || !e || e < p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000514 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000515 while (p < e) {
516 if (ptr >= end || *ptr != *p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000517 goto failure;
518 p++; ptr++;
519 }
520 }
521 pattern++;
522 break;
523
524 case SRE_OP_GROUP_IGNORE:
525 /* match backreference */
526 TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
527 i = pattern[0];
528 {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000529 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
530 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000531 if (!p || !e || e < p)
532 goto failure;
533 while (p < e) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000534 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000535 state->lower(*ptr) != state->lower(*p))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000536 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000537 p++; ptr++;
538 }
539 }
540 pattern++;
541 break;
542
543 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000544 TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000545 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000546 state->lower(*ptr) != state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000547 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000548 pattern++;
549 ptr++;
550 break;
551
552 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000553 TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000554 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000555 state->lower(*ptr) == state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000556 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000557 pattern++;
558 ptr++;
559 break;
560
561 case SRE_OP_IN_IGNORE:
562 TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
563 if (ptr >= end
Fredrik Lundh0640e112000-06-30 13:55:15 +0000564 || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr)))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000565 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000566 pattern += pattern[0];
567 ptr++;
568 break;
569
570 case SRE_OP_MARK:
571 /* set mark */
572 /* args: <mark> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000573 TRACE(("%8d: set mark %d\n", PTR(ptr), pattern[0]));
574 if (state->lastmark < pattern[0])
575 state->lastmark = pattern[0];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000576 if (!mark) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000577 mark = mark_copy;
578 memcpy(mark, state->mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000579 }
580 state->mark[pattern[0]] = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000581 pattern++;
582 break;
583
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000584 case SRE_OP_INDEX:
585 /* set index */
586 /* args: <index> */
587 TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
Fredrik Lundh6f013982000-07-03 18:44:21 +0000588 state->lastindex = pattern[0];
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000589 pattern++;
590 break;
591
Guido van Rossumb700df92000-03-31 14:59:30 +0000592 case SRE_OP_JUMP:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000593 case SRE_OP_INFO:
Guido van Rossumb700df92000-03-31 14:59:30 +0000594 /* jump forward */
595 /* args: <skip> */
596 TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
597 pattern += pattern[0];
598 break;
599
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000600 case SRE_OP_ASSERT:
601 /* assert subpattern */
Fredrik Lundh6f013982000-07-03 18:44:21 +0000602 /* args: <skip> <back> <pattern> */
603 TRACE(("%8d: assert subpattern %d\n", PTR(ptr), pattern[1]));
604 state->ptr = ptr - pattern[1];
605 if (state->ptr < state->beginning)
606 goto failure;
607 i = SRE_MATCH(state, pattern + 2);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000608 if (i < 0)
609 return i;
610 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000611 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000612 pattern += pattern[0];
Guido van Rossumb700df92000-03-31 14:59:30 +0000613 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000614
615 case SRE_OP_ASSERT_NOT:
616 /* assert not subpattern */
617 /* args: <skip> <pattern> */
Fredrik Lundh6f013982000-07-03 18:44:21 +0000618 TRACE(("%8d: assert not subpattern %d\n", PTR(ptr), pattern[1]));
619 state->ptr = ptr - pattern[1];
620 if (state->ptr < state->beginning)
621 goto failure;
622 i = SRE_MATCH(state, pattern + 2);
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000623 if (i < 0)
624 return i;
625 if (i)
626 goto failure;
627 pattern += pattern[0];
628 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000629
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000630#if 0
Guido van Rossumb700df92000-03-31 14:59:30 +0000631 case SRE_OP_MAX_REPEAT_ONE:
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000632 /* match repeated sequence (maximizing regexp) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000633
634 /* this operator only works if the repeated item is
635 exactly one character wide, and we're not already
636 collecting backtracking points. for other cases,
637 use the MAX_REPEAT operator instead */
638
Guido van Rossumb700df92000-03-31 14:59:30 +0000639 /* args: <skip> <min> <max> <step> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000640 TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
641 pattern[1], pattern[2]));
642
643 count = 0;
644
645 if (pattern[3] == SRE_OP_ANY) {
646 /* repeated wildcard. skip to the end of the target
647 string, and backtrack from there */
648 /* FIXME: must look for line endings */
649 if (ptr + pattern[1] > end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000650 goto failure; /* cannot match */
Guido van Rossumb700df92000-03-31 14:59:30 +0000651 count = pattern[2];
652 if (count > end - ptr)
653 count = end - ptr;
654 ptr += count;
655
656 } else if (pattern[3] == SRE_OP_LITERAL) {
657 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000658 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000659 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000660 if (ptr >= end || (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000661 break;
662 ptr++;
663 count++;
664 }
665
666 } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
667 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000668 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000669 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000670 if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000671 break;
672 ptr++;
673 count++;
674 }
675
676 } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
677 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000678 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000679 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000680 if (ptr >= end || (SRE_CODE) ptr[0] == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000681 break;
682 ptr++;
683 count++;
684 }
685
686 } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
687 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000688 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000689 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000690 if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000691 break;
692 ptr++;
693 count++;
694 }
695
696 } else if (pattern[3] == SRE_OP_IN) {
697 /* repeated set */
698 while (count < (int) pattern[2]) {
699 if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
700 break;
701 ptr++;
702 count++;
703 }
704
705 } else {
706 /* repeated single character pattern */
707 state->ptr = ptr;
708 while (count < (int) pattern[2]) {
709 i = SRE_MATCH(state, pattern + 3);
710 if (i < 0)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000711 return i;
712 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000713 break;
714 count++;
715 }
716 state->ptr = ptr;
717 ptr += count;
718 }
719
720 /* when we arrive here, count contains the number of
721 matches, and ptr points to the tail of the target
722 string. check if the rest of the pattern matches, and
723 backtrack if not. */
724
Guido van Rossumb700df92000-03-31 14:59:30 +0000725 TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
726
727 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000728 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000729
730 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
731 /* tail is empty. we're finished */
732 TRACE(("%8d: tail is empty\n", PTR(ptr)));
733 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000734 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000735
736 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000737 /* tail starts with a literal. skip positions where
738 the rest of the pattern cannot possibly match */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000739 SRE_CODE chr = pattern[pattern[0]+1];
Guido van Rossumb700df92000-03-31 14:59:30 +0000740 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
741 for (;;) {
742 TRACE(("%8d: scan for tail match\n", PTR(ptr)));
743 while (count >= (int) pattern[1] &&
744 (ptr >= end || *ptr != chr)) {
745 ptr--;
746 count--;
747 }
748 TRACE(("%8d: check tail\n", PTR(ptr)));
749 if (count < (int) pattern[1])
750 break;
751 state->ptr = ptr;
752 i = SRE_MATCH(state, pattern + pattern[0]);
753 if (i > 0) {
754 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000755 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000756 }
757 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
758 ptr--;
759 count--;
760 }
761
762 } else {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000763 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +0000764 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
765 while (count >= (int) pattern[1]) {
766 state->ptr = ptr;
767 i = SRE_MATCH(state, pattern + pattern[0]);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000768 if (i < 0)
769 return i;
770 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000771 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000772 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000773 }
774 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
775 ptr--;
776 count--;
777 }
778 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000779 goto failure;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000780#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000781
782 case SRE_OP_MAX_REPEAT:
783 /* match repeated sequence (maximizing regexp). repeated
784 group should end with a MAX_UNTIL code */
785
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000786 /* args: <skip> <min> <max> <item> */
787
788 TRACE(("%8d: max repeat (%d %d)\n", PTR(ptr),
Guido van Rossumb700df92000-03-31 14:59:30 +0000789 pattern[1], pattern[2]));
790
791 count = 0;
792 state->ptr = ptr;
793
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000794 /* match minimum number of items */
795 while (count < (int) pattern[1]) {
796 i = SRE_MATCH(state, pattern + 3);
797 if (i < 0)
798 return i;
799 if (!i)
800 goto failure;
801 if (state->ptr == ptr) {
802 /* if the match was successful but empty, set the
803 count to max and terminate the scanning loop */
804 count = (int) pattern[2];
805 break;
806 }
807 count++;
808 ptr = state->ptr;
809 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000810
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000811 TRACE(("%8d: found %d leading items\n", PTR(ptr), count));
Guido van Rossumb700df92000-03-31 14:59:30 +0000812
813 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000814 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000815
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000816 /* match maximum number of items, pushing alternate end
817 points to the stack */
Guido van Rossumb700df92000-03-31 14:59:30 +0000818
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000819 while (pattern[2] == 65535 || count < (int) pattern[2]) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000820 state->stackbase = stack;
821 i = SRE_MATCH(state, pattern + 3);
822 state->stackbase = stackbase; /* rewind */
823 if (i < 0)
824 return i;
825 if (!i)
826 break;
827 if (state->ptr == ptr) {
828 count = (int) pattern[2];
829 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000830 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000831 /* this position was valid; add it to the retry
832 stack */
833 if (stack >= state->stacksize) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000834 i = stack_extend(state, stack + 1,
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000835 stackbase + pattern[2]);
836 if (i < 0)
837 return i; /* out of memory */
838 }
839 TRACE(("%8d: stack[%d] = %d\n", PTR(ptr), stack, PTR(ptr)));
840 state->stack[stack].ptr = ptr;
841 state->stack[stack].pattern = pattern + pattern[0];
842 stack++;
843 /* move forward */
844 ptr = state->ptr;
845 count++;
Guido van Rossumb700df92000-03-31 14:59:30 +0000846 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000847
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000848 /* when we get here, count is the number of successful
849 matches, and ptr points to the tail. */
Guido van Rossumb700df92000-03-31 14:59:30 +0000850
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000851 TRACE(("%8d: skip +%d\n", PTR(ptr), pattern[0]));
852
853 pattern += pattern[0];
854 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000855
856 case SRE_OP_MIN_REPEAT:
857 /* match repeated sequence (minimizing regexp) */
858 TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
859 pattern[1], pattern[2]));
860 count = 0;
861 state->ptr = ptr;
862 /* match minimum number of items */
863 while (count < (int) pattern[1]) {
864 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000865 if (i < 0)
866 return i;
867 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000868 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000869 count++;
870 }
871 /* move forward until the tail matches. */
872 while (count <= (int) pattern[2]) {
873 ptr = state->ptr;
874 i = SRE_MATCH(state, pattern + pattern[0]);
875 if (i > 0) {
876 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000877 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000878 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000879 state->ptr = ptr; /* backtrack */
880 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000881 if (i < 0)
882 return i;
883 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000884 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000885 count++;
886 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000887 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000888
Guido van Rossumb700df92000-03-31 14:59:30 +0000889 case SRE_OP_BRANCH:
890 /* match one of several subpatterns */
891 /* format: <branch> <size> <head> ... <null> <tail> */
892 TRACE(("%8d: branch\n", PTR(ptr)));
893 while (*pattern) {
894 if (pattern[1] != SRE_OP_LITERAL ||
Fredrik Lundh0640e112000-06-30 13:55:15 +0000895 (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000896 TRACE(("%8d: branch check\n", PTR(ptr)));
897 state->ptr = ptr;
898 i = SRE_MATCH(state, pattern + 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000899 if (i < 0)
900 return i;
901 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000902 TRACE(("%8d: branch succeeded\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000903 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000904 }
905 }
906 pattern += *pattern;
907 }
908 TRACE(("%8d: branch failed\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000909 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000910
911 case SRE_OP_REPEAT:
912 /* TEMPLATE: match repeated sequence (no backtracking) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000913 /* args: <skip> <min> <max> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000914 TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
915 count = 0;
916 state->ptr = ptr;
917 while (count < (int) pattern[2]) {
918 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000919 if (i < 0)
920 return i;
921 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000922 break;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000923 if (state->ptr == ptr) {
924 count = (int) pattern[2];
925 break;
926 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000927 count++;
928 }
929 if (count <= (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000930 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000931 TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
932 pattern += pattern[0];
933 ptr = state->ptr;
934 break;
935
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000936 default:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000937 TRACE(("%8d: unknown opcode %d\n", PTR(ptr), pattern[-1]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000938 return SRE_ERROR_ILLEGAL;
939 }
940 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000941
942 failure:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000943 if (stack-- > stackbase) {
944 ptr = state->stack[stack].ptr;
945 pattern = state->stack[stack].pattern;
946 TRACE(("%8d: retry (%d)\n", PTR(ptr), stack));
947 goto retry;
948 }
949 TRACE(("%8d: leave (failure)\n", PTR(ptr)));
950 state->stackbase = stackbase;
951 state->lastmark = lastmark;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000952 if (mark)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000953 memcpy(state->mark, mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000954 return 0;
955
956 success:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000957 TRACE(("%8d: leave (success)\n", PTR(ptr)));
958 state->stackbase = stackbase;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000959 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000960}
961
962LOCAL(int)
963SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
964{
965 SRE_CHAR* ptr = state->start;
966 SRE_CHAR* end = state->end;
967 int status = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +0000968 int prefix_len;
969 SRE_CODE* prefix = NULL;
970 SRE_CODE* charset = NULL;
971 SRE_CODE* overlap = NULL;
972 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000973
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000974 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000975 /* optimization info block */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000976 /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
977
978 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000979
980 if (pattern[3] > 0) {
981 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +0000982 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000983 end -= pattern[3]-1;
984 if (end <= ptr)
985 end = ptr+1;
986 }
987
Fredrik Lundh3562f112000-07-02 12:00:07 +0000988 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000989 /* pattern starts with a known prefix */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000990 prefix_len = pattern[5];
991 prefix = pattern + 6;
992 overlap = prefix + prefix_len - 1;
993 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000994 /* pattern starts with a character from a known set */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000995 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000996
997 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000998 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000999
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001000#if defined(USE_FAST_SEARCH)
Fredrik Lundh3562f112000-07-02 12:00:07 +00001001 if (prefix && overlap && prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001002 /* pattern starts with a known prefix. use the overlap
1003 table to skip forward as fast as we possibly can */
1004 int i = 0;
1005 end = state->end;
1006 while (ptr < end) {
1007 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001008 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001009 if (!i)
1010 break;
1011 else
1012 i = overlap[i];
1013 } else {
1014 if (++i == prefix_len) {
1015 /* found a potential match */
1016 TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
1017 state->start = ptr - prefix_len + 1;
1018 state->ptr = ptr + 1;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001019 if (flags & SRE_INFO_LITERAL)
1020 return 1; /* we got all of it */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001021 status = SRE_MATCH(state, pattern + 2*prefix_len);
1022 if (status != 0)
1023 return status;
1024 /* close but no cigar -- try again */
1025 i = overlap[i];
1026 }
1027 break;
1028 }
1029
1030 }
1031 ptr++;
1032 }
1033 return 0;
1034 }
1035#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001036
Fredrik Lundh3562f112000-07-02 12:00:07 +00001037 if (pattern[0] == SRE_OP_LITERAL) {
1038 /* pattern starts with a literal character. this is used
1039 for short prefixes, and if fast search is disabled */
Fredrik Lundh0640e112000-06-30 13:55:15 +00001040 SRE_CODE chr = pattern[1];
Guido van Rossumb700df92000-03-31 14:59:30 +00001041 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001042 while (ptr < end && (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +00001043 ptr++;
1044 if (ptr == end)
1045 return 0;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001046 TRACE(("%8d: === SEARCH === literal\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001047 state->start = ptr;
1048 state->ptr = ++ptr;
1049 status = SRE_MATCH(state, pattern + 2);
1050 if (status != 0)
1051 break;
1052 }
Fredrik Lundh3562f112000-07-02 12:00:07 +00001053 } else if (charset) {
1054 /* pattern starts with a character from a known set */
1055 for (;;) {
1056 while (ptr < end && !SRE_MEMBER(charset, ptr[0]))
1057 ptr++;
1058 if (ptr == end)
1059 return 0;
1060 TRACE(("%8d: === SEARCH === charset\n", PTR(ptr)));
1061 state->start = ptr;
1062 state->ptr = ptr;
1063 status = SRE_MATCH(state, pattern);
1064 if (status != 0)
1065 break;
1066 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001067 } else
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001068 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +00001069 while (ptr <= end) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001070 TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001071 state->start = state->ptr = ptr++;
1072 status = SRE_MATCH(state, pattern);
1073 if (status != 0)
1074 break;
1075 }
1076
1077 return status;
1078}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001079
Guido van Rossumb700df92000-03-31 14:59:30 +00001080
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001081#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001082
1083/* -------------------------------------------------------------------- */
1084/* factories and destructors */
1085
1086/* see sre.h for object declarations */
1087
1088staticforward PyTypeObject Pattern_Type;
1089staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001090staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001091
1092static PyObject *
1093_compile(PyObject* self_, PyObject* args)
1094{
1095 /* "compile" pattern descriptor to pattern object */
1096
1097 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001098 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001099
1100 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001101 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001102 PyObject* code;
1103 int groups = 0;
1104 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001105 PyObject* indexgroup = NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001106 if (!PyArg_ParseTuple(args, "OiO|iOO", &pattern, &flags, &code,
Fredrik Lundhc2301732000-07-02 22:25:39 +00001107 &groups, &groupindex, &indexgroup))
Guido van Rossumb700df92000-03-31 14:59:30 +00001108 return NULL;
1109
Fredrik Lundh6f013982000-07-03 18:44:21 +00001110 code = PySequence_Fast(code, "code argument must be a sequence");
1111 if (!code)
Guido van Rossumb700df92000-03-31 14:59:30 +00001112 return NULL;
1113
Fredrik Lundh6f013982000-07-03 18:44:21 +00001114 n = PySequence_Length(code);
1115
1116 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n);
1117 if (!self) {
1118 Py_DECREF(code);
1119 return NULL;
1120 }
1121
1122 for (i = 0; i < n; i++) {
1123 PyObject *o = PySequence_Fast_GET_ITEM(code, i);
1124 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1125 }
1126
1127 Py_DECREF(code);
1128
1129 if (PyErr_Occurred())
1130 return NULL;
1131
Guido van Rossumb700df92000-03-31 14:59:30 +00001132 Py_INCREF(pattern);
1133 self->pattern = pattern;
1134
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001135 self->flags = flags;
1136
Guido van Rossumb700df92000-03-31 14:59:30 +00001137 self->groups = groups;
1138
1139 Py_XINCREF(groupindex);
1140 self->groupindex = groupindex;
1141
Fredrik Lundhc2301732000-07-02 22:25:39 +00001142 Py_XINCREF(indexgroup);
1143 self->indexgroup = indexgroup;
1144
Guido van Rossumb700df92000-03-31 14:59:30 +00001145 return (PyObject*) self;
1146}
1147
1148static PyObject *
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001149sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001150{
1151 return Py_BuildValue("i", sizeof(SRE_CODE));
1152}
1153
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001154static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001155sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001156{
1157 int character, flags;
1158 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1159 return NULL;
1160 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001161 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001162#if defined(HAVE_UNICODE)
1163 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001164 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001165#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001166 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001167}
1168
Guido van Rossumb700df92000-03-31 14:59:30 +00001169LOCAL(PyObject*)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001170state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001171{
1172 /* prepare state object */
1173
1174 PyBufferProcs *buffer;
1175 int i, count;
1176 void* ptr;
1177
1178 PyObject* string;
1179 int start = 0;
1180 int end = INT_MAX;
1181 if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
1182 return NULL;
1183
1184 /* get pointer to string buffer */
1185 buffer = string->ob_type->tp_as_buffer;
1186 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1187 buffer->bf_getsegcount(string, NULL) != 1) {
1188 PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
1189 return NULL;
1190 }
1191
1192 /* determine buffer size */
1193 count = buffer->bf_getreadbuffer(string, 0, &ptr);
1194 if (count < 0) {
1195 /* sanity check */
1196 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1197 return NULL;
1198 }
1199
1200 /* determine character size */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001201#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001202 state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001203#else
1204 state->charsize = 1;
1205#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001206
1207 count /= state->charsize;
1208
1209 /* adjust boundaries */
1210 if (start < 0)
1211 start = 0;
1212 else if (start > count)
1213 start = count;
1214
1215 if (end < 0)
1216 end = 0;
1217 else if (end > count)
1218 end = count;
1219
1220 state->beginning = ptr;
1221
1222 state->start = (void*) ((char*) ptr + start * state->charsize);
1223 state->end = (void*) ((char*) ptr + end * state->charsize);
1224
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001225 state->lastmark = 0;
1226
Guido van Rossumb700df92000-03-31 14:59:30 +00001227 /* FIXME: dynamic! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001228 for (i = 0; i < SRE_MARK_SIZE; i++)
Guido van Rossumb700df92000-03-31 14:59:30 +00001229 state->mark[i] = NULL;
1230
Fredrik Lundh6f013982000-07-03 18:44:21 +00001231 state->lastindex = -1;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001232
Guido van Rossumb700df92000-03-31 14:59:30 +00001233 state->stack = NULL;
1234 state->stackbase = 0;
1235 state->stacksize = 0;
1236
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001237 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001238 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001239#if defined(HAVE_UNICODE)
1240 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001241 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001242#endif
1243 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001244 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001245
Guido van Rossumb700df92000-03-31 14:59:30 +00001246 return string;
1247}
1248
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001249LOCAL(void)
1250state_fini(SRE_STATE* state)
1251{
1252 stack_free(state);
1253}
1254
1255LOCAL(PyObject*)
1256state_getslice(SRE_STATE* state, int index, PyObject* string)
1257{
1258 index = (index - 1) * 2;
1259
1260 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
1261 Py_INCREF(Py_None);
1262 return Py_None;
1263 }
1264
1265 return PySequence_GetSlice(
1266 string,
1267 ((char*)state->mark[index] - (char*)state->beginning) /
1268 state->charsize,
1269 ((char*)state->mark[index+1] - (char*)state->beginning) /
1270 state->charsize
1271 );
1272}
1273
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001274static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001275pattern_new_match(PatternObject* pattern, SRE_STATE* state,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001276 PyObject* string, int status)
1277{
1278 /* create match object (from state object) */
1279
1280 MatchObject* match;
1281 int i, j;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001282 char* base;
1283 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001284
1285 if (status > 0) {
1286
1287 /* create match object (with room for extra group marks) */
Fredrik Lundh6f013982000-07-03 18:44:21 +00001288 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
1289 2*(pattern->groups+1));
1290 if (!match)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001291 return NULL;
1292
1293 Py_INCREF(pattern);
1294 match->pattern = pattern;
1295
1296 Py_INCREF(string);
1297 match->string = string;
1298
1299 match->groups = pattern->groups+1;
1300
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001301 base = (char*) state->beginning;
1302 n = state->charsize;
1303
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001304 /* group zero */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001305 match->mark[0] = ((char*) state->start - base) / n;
1306 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001307
1308 /* fill in the rest of the groups */
1309 for (i = j = 0; i < pattern->groups; i++, j+=2)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001310 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1311 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1312 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001313 } else
1314 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1315
Fredrik Lundh6f013982000-07-03 18:44:21 +00001316 match->lastindex = state->lastindex;
1317
1318 match->pos = ((char*) state->start - base) / n;
1319 match->endpos = ((char*) state->end - base) / n;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001320
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001321 return (PyObject*) match;
1322
1323 } else if (status < 0) {
1324
1325 /* internal error */
1326 PyErr_SetString(
1327 PyExc_RuntimeError, "internal error in regular expression engine"
1328 );
1329 return NULL;
1330
1331 }
1332
1333 Py_INCREF(Py_None);
1334 return Py_None;
1335}
1336
1337static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001338pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001339{
1340 /* create search state object */
1341
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001342 ScannerObject* self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001343 PyObject* string;
1344
1345 /* create match object (with room for extra group marks) */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001346 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001347 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001348 return NULL;
1349
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001350 string = state_init(&self->state, pattern, args);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001351 if (!string) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00001352 PyObject_Del(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001353 return NULL;
1354 }
1355
1356 Py_INCREF(pattern);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001357 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001358
1359 Py_INCREF(string);
1360 self->string = string;
1361
1362 return (PyObject*) self;
1363}
1364
Guido van Rossumb700df92000-03-31 14:59:30 +00001365static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001366pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001367{
Guido van Rossumb700df92000-03-31 14:59:30 +00001368 Py_XDECREF(self->pattern);
1369 Py_XDECREF(self->groupindex);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001370 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001371}
1372
1373static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001374pattern_match(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001375{
1376 SRE_STATE state;
1377 PyObject* string;
1378 int status;
1379
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001380 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001381 if (!string)
1382 return NULL;
1383
1384 state.ptr = state.start;
1385
1386 if (state.charsize == 1) {
1387 status = sre_match(&state, PatternObject_GetCode(self));
1388 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001389#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001390 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001391#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001392 }
1393
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001394 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001395
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001396 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001397}
1398
1399static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001400pattern_search(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001401{
1402 SRE_STATE state;
1403 PyObject* string;
1404 int status;
1405
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001406 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001407 if (!string)
1408 return NULL;
1409
1410 if (state.charsize == 1) {
1411 status = sre_search(&state, PatternObject_GetCode(self));
1412 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001413#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001414 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001415#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001416 }
1417
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001418 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001419
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001420 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001421}
1422
1423static PyObject*
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001424call(char* function, PyObject* args)
1425{
1426 PyObject* name;
1427 PyObject* module;
1428 PyObject* func;
1429 PyObject* result;
1430
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001431 name = PyString_FromString(MODULE);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001432 if (!name)
1433 return NULL;
1434 module = PyImport_Import(name);
1435 Py_DECREF(name);
1436 if (!module)
1437 return NULL;
1438 func = PyObject_GetAttrString(module, function);
1439 Py_DECREF(module);
1440 if (!func)
1441 return NULL;
1442 result = PyObject_CallObject(func, args);
1443 Py_DECREF(func);
1444 Py_DECREF(args);
1445 return result;
1446}
1447
1448static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001449pattern_sub(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001450{
1451 PyObject* template;
1452 PyObject* string;
1453 PyObject* count;
1454 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1455 return NULL;
1456
1457 /* delegate to Python code */
1458 return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
1459}
1460
1461static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001462pattern_subn(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001463{
1464 PyObject* template;
1465 PyObject* string;
1466 PyObject* count;
1467 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1468 return NULL;
1469
1470 /* delegate to Python code */
1471 return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
1472}
1473
1474static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001475pattern_split(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001476{
1477 PyObject* string;
1478 PyObject* maxsplit;
1479 if (!PyArg_ParseTuple(args, "OO", &string, &maxsplit))
1480 return NULL;
1481
1482 /* delegate to Python code */
1483 return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
1484}
1485
1486static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001487pattern_findall(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001488{
Guido van Rossumb700df92000-03-31 14:59:30 +00001489 SRE_STATE state;
1490 PyObject* string;
1491 PyObject* list;
1492 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001493 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001494
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001495 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001496 if (!string)
1497 return NULL;
1498
1499 list = PyList_New(0);
1500
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001501 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001502
1503 PyObject* item;
1504
1505 state.ptr = state.start;
1506
1507 if (state.charsize == 1) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001508 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +00001509 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001510#if defined(HAVE_UNICODE)
1511 status = sre_usearch(&state, PatternObject_GetCode(self));
1512#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001513 }
1514
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001515 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001516
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001517 /* don't bother to build a match object */
1518 switch (self->groups) {
1519 case 0:
1520 item = PySequence_GetSlice(
1521 string,
1522 ((char*) state.start - (char*) state.beginning) /
1523 state.charsize,
1524 ((char*) state.ptr - (char*) state.beginning) /
1525 state.charsize);
1526 if (!item)
1527 goto error;
1528 break;
1529 case 1:
1530 item = state_getslice(&state, 1, string);
1531 if (!item)
1532 goto error;
1533 break;
1534 default:
1535 item = PyTuple_New(self->groups);
1536 if (!item)
1537 goto error;
1538 for (i = 0; i < self->groups; i++) {
1539 PyObject* o = state_getslice(&state, i+1, string);
1540 if (!o) {
1541 Py_DECREF(item);
1542 goto error;
1543 }
1544 PyTuple_SET_ITEM(item, i, o);
1545 }
1546 break;
1547 }
1548
1549 if (PyList_Append(list, item) < 0) {
1550 Py_DECREF(item);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001551 goto error;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001552 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001553
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001554 if (state.ptr == state.start)
1555 state.start = (void*) ((char*) state.ptr + state.charsize);
1556 else
1557 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001558
1559 } else {
1560
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001561 if (status == 0)
1562 break;
1563
Guido van Rossumb700df92000-03-31 14:59:30 +00001564 /* internal error */
1565 PyErr_SetString(
1566 PyExc_RuntimeError,
1567 "internal error in regular expression engine"
1568 );
1569 goto error;
1570
1571 }
1572 }
1573
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001574 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001575 return list;
1576
1577error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001578 Py_DECREF(list);
1579 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001580 return NULL;
1581
1582}
1583
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001584static PyMethodDef pattern_methods[] = {
1585 {"match", (PyCFunction) pattern_match, 1},
1586 {"search", (PyCFunction) pattern_search, 1},
1587 {"sub", (PyCFunction) pattern_sub, 1},
1588 {"subn", (PyCFunction) pattern_subn, 1},
1589 {"split", (PyCFunction) pattern_split, 1},
1590 {"findall", (PyCFunction) pattern_findall, 1},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001591 /* experimental */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001592 {"scanner", (PyCFunction) pattern_scanner, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001593 {NULL, NULL}
1594};
1595
1596static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001597pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001598{
1599 PyObject* res;
1600
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001601 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001602
1603 if (res)
1604 return res;
1605
1606 PyErr_Clear();
1607
1608 /* attributes */
1609 if (!strcmp(name, "pattern")) {
1610 Py_INCREF(self->pattern);
1611 return self->pattern;
1612 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001613
1614 if (!strcmp(name, "flags"))
1615 return Py_BuildValue("i", self->flags);
1616
Fredrik Lundh01016fe2000-06-30 00:27:46 +00001617 if (!strcmp(name, "groups"))
1618 return Py_BuildValue("i", self->groups);
1619
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001620 if (!strcmp(name, "groupindex") && self->groupindex) {
1621 Py_INCREF(self->groupindex);
1622 return self->groupindex;
1623 }
1624
Guido van Rossumb700df92000-03-31 14:59:30 +00001625 PyErr_SetString(PyExc_AttributeError, name);
1626 return NULL;
1627}
1628
1629statichere PyTypeObject Pattern_Type = {
1630 PyObject_HEAD_INIT(NULL)
Fredrik Lundh6f013982000-07-03 18:44:21 +00001631 0, "SRE_Pattern",
1632 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001633 (destructor)pattern_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001634 0, /*tp_print*/
Fredrik Lundh6f013982000-07-03 18:44:21 +00001635 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001636};
1637
1638/* -------------------------------------------------------------------- */
1639/* match methods */
1640
1641static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001642match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001643{
1644 Py_XDECREF(self->string);
1645 Py_DECREF(self->pattern);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001646 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001647}
1648
1649static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001650match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001651{
1652 if (index < 0 || index >= self->groups) {
1653 /* raise IndexError if we were given a bad group number */
1654 PyErr_SetString(
1655 PyExc_IndexError,
1656 "no such group"
1657 );
1658 return NULL;
1659 }
1660
Fredrik Lundh6f013982000-07-03 18:44:21 +00001661 index *= 2;
1662
1663 if (self->string == Py_None || self->mark[index] < 0) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001664 /* return default value if the string or group is undefined */
1665 Py_INCREF(def);
1666 return def;
Guido van Rossumb700df92000-03-31 14:59:30 +00001667 }
1668
1669 return PySequence_GetSlice(
Fredrik Lundh6f013982000-07-03 18:44:21 +00001670 self->string, self->mark[index], self->mark[index+1]
Guido van Rossumb700df92000-03-31 14:59:30 +00001671 );
1672}
1673
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001674static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001675match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001676{
Fredrik Lundh6f013982000-07-03 18:44:21 +00001677 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001678
1679 if (PyInt_Check(index))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001680 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001681
Fredrik Lundh6f013982000-07-03 18:44:21 +00001682 i = -1;
1683
1684 if (self->pattern->groupindex) {
1685 index = PyObject_GetItem(self->pattern->groupindex, index);
1686 if (index) {
1687 if (PyInt_Check(index))
1688 i = (int) PyInt_AS_LONG(index);
1689 Py_DECREF(index);
1690 } else
1691 PyErr_Clear();
1692 }
1693
1694 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001695}
1696
1697static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001698match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001699{
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001700 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001701}
1702
1703static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001704match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001705{
1706 PyObject* result;
1707 int i, size;
1708
1709 size = PyTuple_GET_SIZE(args);
1710
1711 switch (size) {
1712 case 0:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001713 result = match_getslice(self, Py_False, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001714 break;
1715 case 1:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001716 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001717 break;
1718 default:
1719 /* fetch multiple items */
1720 result = PyTuple_New(size);
1721 if (!result)
1722 return NULL;
1723 for (i = 0; i < size; i++) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001724 PyObject* item = match_getslice(
1725 self, PyTuple_GET_ITEM(args, i), Py_None
1726 );
Guido van Rossumb700df92000-03-31 14:59:30 +00001727 if (!item) {
1728 Py_DECREF(result);
1729 return NULL;
1730 }
1731 PyTuple_SET_ITEM(result, i, item);
1732 }
1733 break;
1734 }
1735 return result;
1736}
1737
1738static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001739match_groups(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001740{
1741 PyObject* result;
1742 int index;
1743
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001744 PyObject* def = Py_None;
1745 if (!PyArg_ParseTuple(args, "|O", &def))
1746 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001747
Guido van Rossumb700df92000-03-31 14:59:30 +00001748 result = PyTuple_New(self->groups-1);
1749 if (!result)
1750 return NULL;
1751
1752 for (index = 1; index < self->groups; index++) {
1753 PyObject* item;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001754 item = match_getslice_by_index(self, index, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001755 if (!item) {
1756 Py_DECREF(result);
1757 return NULL;
1758 }
1759 PyTuple_SET_ITEM(result, index-1, item);
1760 }
1761
1762 return result;
1763}
1764
1765static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001766match_groupdict(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001767{
1768 PyObject* result;
1769 PyObject* keys;
1770 int index;
1771
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001772 PyObject* def = Py_None;
1773 if (!PyArg_ParseTuple(args, "|O", &def))
1774 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001775
Guido van Rossumb700df92000-03-31 14:59:30 +00001776 result = PyDict_New();
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001777 if (!result || !self->pattern->groupindex)
Guido van Rossumb700df92000-03-31 14:59:30 +00001778 return result;
1779
1780 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001781 if (!keys) {
1782 Py_DECREF(result);
Guido van Rossumb700df92000-03-31 14:59:30 +00001783 return NULL;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001784 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001785
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001786 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001787 PyObject* key;
1788 PyObject* item;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001789 key = PyList_GET_ITEM(keys, index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001790 if (!key) {
1791 Py_DECREF(keys);
1792 Py_DECREF(result);
1793 return NULL;
1794 }
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001795 item = match_getslice(self, key, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001796 if (!item) {
1797 Py_DECREF(key);
1798 Py_DECREF(keys);
1799 Py_DECREF(result);
1800 return NULL;
1801 }
1802 /* FIXME: <fl> this can fail, right? */
1803 PyDict_SetItem(result, key, item);
1804 }
1805
1806 Py_DECREF(keys);
1807
1808 return result;
1809}
1810
1811static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001812match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001813{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001814 int index;
1815
1816 PyObject* index_ = Py_False;
1817 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001818 return NULL;
1819
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001820 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001821
Guido van Rossumb700df92000-03-31 14:59:30 +00001822 if (index < 0 || index >= self->groups) {
1823 PyErr_SetString(
1824 PyExc_IndexError,
1825 "no such group"
1826 );
1827 return NULL;
1828 }
1829
1830 if (self->mark[index*2] < 0) {
1831 Py_INCREF(Py_None);
1832 return Py_None;
1833 }
1834
1835 return Py_BuildValue("i", self->mark[index*2]);
1836}
1837
1838static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001839match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001840{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001841 int index;
1842
1843 PyObject* index_ = Py_False;
1844 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001845 return NULL;
1846
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001847 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001848
Guido van Rossumb700df92000-03-31 14:59:30 +00001849 if (index < 0 || index >= self->groups) {
1850 PyErr_SetString(
1851 PyExc_IndexError,
1852 "no such group"
1853 );
1854 return NULL;
1855 }
1856
1857 if (self->mark[index*2] < 0) {
1858 Py_INCREF(Py_None);
1859 return Py_None;
1860 }
1861
1862 return Py_BuildValue("i", self->mark[index*2+1]);
1863}
1864
1865static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001866match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001867{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001868 int index;
1869
1870 PyObject* index_ = Py_False;
1871 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001872 return NULL;
1873
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001874 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001875
Guido van Rossumb700df92000-03-31 14:59:30 +00001876 if (index < 0 || index >= self->groups) {
1877 PyErr_SetString(
1878 PyExc_IndexError,
1879 "no such group"
1880 );
1881 return NULL;
1882 }
1883
1884 if (self->mark[index*2] < 0) {
1885 Py_INCREF(Py_None);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001886 Py_INCREF(Py_None);
1887 return Py_BuildValue("OO", Py_None, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001888 }
1889
1890 return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
1891}
1892
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001893static PyMethodDef match_methods[] = {
1894 {"group", (PyCFunction) match_group, 1},
1895 {"start", (PyCFunction) match_start, 1},
1896 {"end", (PyCFunction) match_end, 1},
1897 {"span", (PyCFunction) match_span, 1},
1898 {"groups", (PyCFunction) match_groups, 1},
1899 {"groupdict", (PyCFunction) match_groupdict, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001900 {NULL, NULL}
1901};
1902
1903static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001904match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001905{
1906 PyObject* res;
1907
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001908 res = Py_FindMethod(match_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001909 if (res)
1910 return res;
1911
1912 PyErr_Clear();
1913
Fredrik Lundhc2301732000-07-02 22:25:39 +00001914 if (!strcmp(name, "lastindex")) {
1915 /* experimental */
Fredrik Lundh6f013982000-07-03 18:44:21 +00001916 if (self->lastindex >= 0)
1917 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00001918 Py_INCREF(Py_None);
1919 return Py_None;
1920 }
1921
1922 if (!strcmp(name, "lastgroup")) {
1923 /* experimental */
Fredrik Lundh6f013982000-07-03 18:44:21 +00001924 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00001925 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00001926 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00001927 );
1928 if (result)
1929 return result;
1930 PyErr_Clear();
1931 }
1932 Py_INCREF(Py_None);
1933 return Py_None;
1934 }
1935
Guido van Rossumb700df92000-03-31 14:59:30 +00001936 if (!strcmp(name, "string")) {
1937 Py_INCREF(self->string);
1938 return self->string;
1939 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001940
Guido van Rossumb700df92000-03-31 14:59:30 +00001941 if (!strcmp(name, "re")) {
1942 Py_INCREF(self->pattern);
1943 return (PyObject*) self->pattern;
1944 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001945
Guido van Rossumb700df92000-03-31 14:59:30 +00001946 if (!strcmp(name, "pos"))
Fredrik Lundh6f013982000-07-03 18:44:21 +00001947 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001948
Guido van Rossumb700df92000-03-31 14:59:30 +00001949 if (!strcmp(name, "endpos"))
Fredrik Lundh6f013982000-07-03 18:44:21 +00001950 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00001951
1952 PyErr_SetString(PyExc_AttributeError, name);
1953 return NULL;
1954}
1955
1956/* FIXME: implement setattr("string", None) as a special case (to
1957 detach the associated string, if any */
1958
1959statichere PyTypeObject Match_Type = {
1960 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001961 0, "SRE_Match",
Fredrik Lundh6f013982000-07-03 18:44:21 +00001962 sizeof(MatchObject), sizeof(int),
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001963 (destructor)match_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001964 0, /*tp_print*/
Fredrik Lundh6f013982000-07-03 18:44:21 +00001965 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001966};
1967
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001968/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001969/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001970
1971static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001972scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001973{
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001974 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001975 Py_DECREF(self->string);
1976 Py_DECREF(self->pattern);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001977 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001978}
1979
1980static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001981scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001982{
1983 SRE_STATE* state = &self->state;
1984 PyObject* match;
1985 int status;
1986
1987 state->ptr = state->start;
1988
1989 if (state->charsize == 1) {
1990 status = sre_match(state, PatternObject_GetCode(self->pattern));
1991 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001992#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001993 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001994#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001995 }
1996
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001997 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001998 state, self->string, status);
1999
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002000 if (status == 0 || state->ptr == state->start)
2001 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002002 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002003 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002004
2005 return match;
2006}
2007
2008
2009static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002010scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002011{
2012 SRE_STATE* state = &self->state;
2013 PyObject* match;
2014 int status;
2015
2016 state->ptr = state->start;
2017
2018 if (state->charsize == 1) {
2019 status = sre_search(state, PatternObject_GetCode(self->pattern));
2020 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002021#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002022 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002023#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002024 }
2025
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002026 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002027 state, self->string, status);
2028
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002029 if (status == 0 || state->ptr == state->start)
2030 state->start = (void*) ((char*) state->ptr + state->charsize);
2031 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002032 state->start = state->ptr;
2033
2034 return match;
2035}
2036
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002037static PyMethodDef scanner_methods[] = {
2038 {"match", (PyCFunction) scanner_match, 0},
2039 {"search", (PyCFunction) scanner_search, 0},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002040 {NULL, NULL}
2041};
2042
2043static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002044scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002045{
2046 PyObject* res;
2047
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002048 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002049 if (res)
2050 return res;
2051
2052 PyErr_Clear();
2053
2054 /* attributes */
2055 if (!strcmp(name, "pattern")) {
2056 Py_INCREF(self->pattern);
2057 return self->pattern;
2058 }
2059
2060 PyErr_SetString(PyExc_AttributeError, name);
2061 return NULL;
2062}
2063
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002064statichere PyTypeObject Scanner_Type = {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002065 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002066 0, "SRE_Scanner",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002067 sizeof(ScannerObject), 0,
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002068 (destructor)scanner_dealloc, /*tp_dealloc*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002069 0, /*tp_print*/
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002070 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002071};
2072
Guido van Rossumb700df92000-03-31 14:59:30 +00002073static PyMethodDef _functions[] = {
2074 {"compile", _compile, 1},
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002075 {"getcodesize", sre_codesize, 1},
Fredrik Lundhb389df32000-06-29 12:48:37 +00002076 {"getlower", sre_getlower, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00002077 {NULL, NULL}
2078};
2079
2080void
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002081#if defined(WIN32)
Guido van Rossumb700df92000-03-31 14:59:30 +00002082__declspec(dllexport)
2083#endif
2084init_sre()
2085{
2086 /* Patch object types */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002087 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002088 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002089
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002090 Py_InitModule("_" MODULE, _functions);
Guido van Rossumb700df92000-03-31 14:59:30 +00002091}
2092
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002093#endif /* !defined(SRE_RECURSIVE) */