blob: cac480d780258065e282ecd81695ba4eaac4c582 [file] [log] [blame]
Guido van Rossumb700df92000-03-31 14:59:30 +00001/* -*- Mode: C; tab-width: 4 -*-
2 *
3 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00004 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00005 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00006 *
7 * partial history:
Fredrik Lundh436c3d52000-06-29 08:58:44 +00008 * 99-10-24 fl created (based on existing template matcher code)
Guido van Rossumb700df92000-03-31 14:59:30 +00009 * 99-11-13 fl added categories, branching, and more (0.2)
10 * 99-11-16 fl some tweaks to compile on non-Windows platforms
11 * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
Fredrik Lundh436c3d52000-06-29 08:58:44 +000012 * 00-02-28 fl tons of changes (not all to the better ;-) (0.4)
13 * 00-03-06 fl first alpha, sort of (0.5)
14 * 00-03-14 fl removed most compatibility stuff (0.6)
15 * 00-05-10 fl towards third alpha (0.8.2)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000016 * 00-05-13 fl added experimental scanner stuff (0.8.3)
Fredrik Lundh436c3d52000-06-29 08:58:44 +000017 * 00-05-27 fl final bug hunt (0.8.4)
18 * 00-06-21 fl less bugs, more taste (0.8.5)
19 * 00-06-25 fl major changes to better deal with nested repeats (0.9)
20 * 00-06-28 fl fixed findall (0.9.1)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000021 * 00-06-29 fl fixed split, added more scanner features (0.9.2)
Fredrik Lundhc13222c2000-07-01 23:49:14 +000022 * 00-06-30 fl added fast search optimization (0.9.3)
Fredrik Lundh0640e112000-06-30 13:55:15 +000023 * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +000024 * 00-07-02 fl added charset optimizations, etc (0.9.5)
Fredrik Lundh6f013982000-07-03 18:44:21 +000025 * 00-07-03 fl store code in pattern object, lookbehind, etc
Guido van Rossumb700df92000-03-31 14:59:30 +000026 *
27 * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
28 *
Guido van Rossumb700df92000-03-31 14:59:30 +000029 * Portions of this engine have been developed in cooperation with
Fredrik Lundh22d25462000-07-01 17:50:59 +000030 * CNRI. Hewlett-Packard provided funding for 2.0 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000031 * other compatibility work.
32 */
33
34#ifndef SRE_RECURSIVE
35
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +000036char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000037
38#include "Python.h"
39
40#include "sre.h"
41
Guido van Rossumb700df92000-03-31 14:59:30 +000042#if defined(HAVE_LIMITS_H)
43#include <limits.h>
44#else
45#define INT_MAX 2147483647
46#endif
47
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000048#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000049
Fredrik Lundh436c3d52000-06-29 08:58:44 +000050/* name of this module, minus the leading underscore */
51#define MODULE "sre"
52
Guido van Rossumb700df92000-03-31 14:59:30 +000053/* defining this one enables tracing */
54#undef DEBUG
55
Fredrik Lundh436c3d52000-06-29 08:58:44 +000056#if PY_VERSION_HEX >= 0x01060000
Fredrik Lundh22d25462000-07-01 17:50:59 +000057/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000058#define HAVE_UNICODE
59#endif
60
Fredrik Lundh29c08be2000-06-29 23:33:12 +000061/* optional features */
62#define USE_FAST_SEARCH
63
Fredrik Lundh80946112000-06-29 18:03:25 +000064#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000065#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
66/* fastest possible local call under MSVC */
67#define LOCAL(type) static __inline type __fastcall
68#else
Fredrik Lundh29c08be2000-06-29 23:33:12 +000069#define LOCAL(type) static inline type
Guido van Rossumb700df92000-03-31 14:59:30 +000070#endif
71
72/* error codes */
73#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
74#define SRE_ERROR_MEMORY -9 /* out of memory */
75
Fredrik Lundh436c3d52000-06-29 08:58:44 +000076#if defined(DEBUG)
Guido van Rossumb700df92000-03-31 14:59:30 +000077#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000078#else
79#define TRACE(v)
80#endif
81
Fredrik Lundh436c3d52000-06-29 08:58:44 +000082#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
Guido van Rossumb700df92000-03-31 14:59:30 +000083
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000084/* -------------------------------------------------------------------- */
85/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000086
Fredrik Lundh436c3d52000-06-29 08:58:44 +000087/* default character predicates (run sre_chars.py to regenerate tables) */
88
89#define SRE_DIGIT_MASK 1
90#define SRE_SPACE_MASK 2
91#define SRE_LINEBREAK_MASK 4
92#define SRE_ALNUM_MASK 8
93#define SRE_WORD_MASK 16
94
95static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
962, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
970, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
9825, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
9924, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1000, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
10124, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
102
Fredrik Lundhb389df32000-06-29 12:48:37 +0000103static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d52000-06-29 08:58:44 +000010410, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
10527, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
10644, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
10761, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
109122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
110106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
111120, 121, 122, 123, 124, 125, 126, 127 };
112
Fredrik Lundhb389df32000-06-29 12:48:37 +0000113static unsigned int sre_lower(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000114{
Fredrik Lundhb389df32000-06-29 12:48:37 +0000115 return ((ch) < 128 ? sre_char_lower[ch] : ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000116}
117
118#define SRE_IS_DIGIT(ch)\
119 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
120#define SRE_IS_SPACE(ch)\
121 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
122#define SRE_IS_LINEBREAK(ch)\
123 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
124#define SRE_IS_ALNUM(ch)\
125 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
126#define SRE_IS_WORD(ch)\
127 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000128
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000129/* locale-specific character predicates */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000130
Fredrik Lundhb389df32000-06-29 12:48:37 +0000131static unsigned int sre_lower_locale(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000132{
133 return ((ch) < 256 ? tolower((ch)) : ch);
134}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000135#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
136#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
137#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
138#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
139#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
140
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000141/* unicode-specific character predicates */
142
143#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000144static unsigned int sre_lower_unicode(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000145{
146 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
147}
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000148#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
149#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
150#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000151#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000152#define SRE_UNI_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
153#endif
154
Guido van Rossumb700df92000-03-31 14:59:30 +0000155LOCAL(int)
156sre_category(SRE_CODE category, unsigned int ch)
157{
158 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000159
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000160 case SRE_CATEGORY_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000161 return SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000162 case SRE_CATEGORY_NOT_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000163 return !SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000164 case SRE_CATEGORY_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000165 return SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000166 case SRE_CATEGORY_NOT_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000167 return !SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000168 case SRE_CATEGORY_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000169 return SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000170 case SRE_CATEGORY_NOT_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000171 return !SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000172 case SRE_CATEGORY_LINEBREAK:
173 return SRE_IS_LINEBREAK(ch);
174 case SRE_CATEGORY_NOT_LINEBREAK:
175 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000176
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000177 case SRE_CATEGORY_LOC_WORD:
178 return SRE_LOC_IS_WORD(ch);
179 case SRE_CATEGORY_LOC_NOT_WORD:
180 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000181
182#if defined(HAVE_UNICODE)
183 case SRE_CATEGORY_UNI_DIGIT:
184 return SRE_UNI_IS_DIGIT(ch);
185 case SRE_CATEGORY_UNI_NOT_DIGIT:
186 return !SRE_UNI_IS_DIGIT(ch);
187 case SRE_CATEGORY_UNI_SPACE:
188 return SRE_UNI_IS_SPACE(ch);
189 case SRE_CATEGORY_UNI_NOT_SPACE:
190 return !SRE_UNI_IS_SPACE(ch);
191 case SRE_CATEGORY_UNI_WORD:
192 return SRE_UNI_IS_WORD(ch);
193 case SRE_CATEGORY_UNI_NOT_WORD:
194 return !SRE_UNI_IS_WORD(ch);
195 case SRE_CATEGORY_UNI_LINEBREAK:
196 return SRE_UNI_IS_LINEBREAK(ch);
197 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
198 return !SRE_UNI_IS_LINEBREAK(ch);
199#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000200 }
201 return 0;
202}
203
204/* helpers */
205
206LOCAL(int)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000207stack_free(SRE_STATE* state)
Guido van Rossumb700df92000-03-31 14:59:30 +0000208{
209 if (state->stack) {
210 TRACE(("release stack\n"));
211 free(state->stack);
212 state->stack = NULL;
213 }
214 state->stacksize = 0;
215 return 0;
216}
217
218static int /* shouldn't be LOCAL */
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000219stack_extend(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000220{
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000221 SRE_STACK* stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000222 int stacksize;
223
224 /* grow the stack to a suitable size; we need at least lo entries,
225 at most hi entries. if for some reason hi is lower than lo, lo
226 wins */
227
228 stacksize = state->stacksize;
229
230 if (stacksize == 0) {
231 /* create new stack */
232 stacksize = 512;
233 if (stacksize < lo)
234 stacksize = lo;
235 else if (stacksize > hi)
236 stacksize = hi;
237 TRACE(("allocate stack %d\n", stacksize));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000238 stack = malloc(sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000239 } else {
240 /* grow the stack (typically by a factor of two) */
241 while (stacksize < lo)
242 stacksize = 2 * stacksize;
243 /* FIXME: <fl> could trim size if it's larger than lo, and
244 much larger than hi */
245 TRACE(("grow stack to %d\n", stacksize));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000246 stack = realloc(state->stack, sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000247 }
248
249 if (!stack) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000250 stack_free(state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000251 return SRE_ERROR_MEMORY;
252 }
253
254 state->stack = stack;
255 state->stacksize = stacksize;
256
257 return 0;
258}
259
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000260/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000261
262#define SRE_CHAR unsigned char
263#define SRE_AT sre_at
264#define SRE_MEMBER sre_member
265#define SRE_MATCH sre_match
266#define SRE_SEARCH sre_search
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000267
268#if defined(HAVE_UNICODE)
269
Guido van Rossumb700df92000-03-31 14:59:30 +0000270#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000271#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000272#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000273
Guido van Rossumb700df92000-03-31 14:59:30 +0000274#undef SRE_SEARCH
275#undef SRE_MATCH
276#undef SRE_MEMBER
277#undef SRE_AT
278#undef SRE_CHAR
279
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000280/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000281
282#define SRE_CHAR Py_UNICODE
283#define SRE_AT sre_uat
284#define SRE_MEMBER sre_umember
285#define SRE_MATCH sre_umatch
286#define SRE_SEARCH sre_usearch
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000287#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000288
289#endif /* SRE_RECURSIVE */
290
291/* -------------------------------------------------------------------- */
292/* String matching engine */
293
294/* the following section is compiled twice, with different character
295 settings */
296
297LOCAL(int)
298SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
299{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000300 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000301
302 int this, that;
303
304 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000305
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000306 case SRE_AT_BEGINNING:
Guido van Rossum29530882000-04-10 17:06:55 +0000307 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000308
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000309 case SRE_AT_BEGINNING_LINE:
310 return ((void*) ptr == state->beginning ||
311 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000312
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000313 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000314 return (((void*) (ptr+1) == state->end &&
315 SRE_IS_LINEBREAK((int) ptr[0])) ||
316 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000317
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000318 case SRE_AT_END_LINE:
319 return ((void*) ptr == state->end ||
320 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000321
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000322 case SRE_AT_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000323 if (state->beginning == state->end)
324 return 0;
325 that = ((void*) ptr > state->beginning) ?
326 SRE_IS_WORD((int) ptr[-1]) : 0;
327 this = ((void*) ptr < state->end) ?
328 SRE_IS_WORD((int) ptr[0]) : 0;
329 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000330
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000331 case SRE_AT_NON_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000332 if (state->beginning == state->end)
333 return 0;
334 that = ((void*) ptr > state->beginning) ?
335 SRE_IS_WORD((int) ptr[-1]) : 0;
336 this = ((void*) ptr < state->end) ?
337 SRE_IS_WORD((int) ptr[0]) : 0;
338 return this == that;
339 }
340
341 return 0;
342}
343
344LOCAL(int)
Fredrik Lundh0640e112000-06-30 13:55:15 +0000345SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000346{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000347 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000348
349 int ok = 1;
350
351 for (;;) {
352 switch (*set++) {
353
354 case SRE_OP_NEGATE:
355 ok = !ok;
356 break;
357
358 case SRE_OP_FAILURE:
359 return !ok;
360
361 case SRE_OP_LITERAL:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000362 /* args: <literal> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000363 if (ch == set[0])
Guido van Rossumb700df92000-03-31 14:59:30 +0000364 return ok;
365 set++;
366 break;
367
368 case SRE_OP_RANGE:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000369 /* args: <lower> <upper> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000370 if (set[0] <= ch && ch <= set[1])
Guido van Rossumb700df92000-03-31 14:59:30 +0000371 return ok;
372 set += 2;
373 break;
374
Fredrik Lundh3562f112000-07-02 12:00:07 +0000375 case SRE_OP_CHARSET:
376 /* args: <bitmap> (16 bits per code word) */
377 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
378 return ok;
379 set += 16;
380 break;
381
Guido van Rossumb700df92000-03-31 14:59:30 +0000382 case SRE_OP_CATEGORY:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000383 /* args: <category> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000384 if (sre_category(set[0], (int) ch))
385 return ok;
386 set += 1;
387 break;
388
389 default:
Fredrik Lundh80946112000-06-29 18:03:25 +0000390 /* internal error -- there's not much we can do about it
391 here, so let's just pretend it didn't match... */
Guido van Rossumb700df92000-03-31 14:59:30 +0000392 return 0;
393 }
394 }
395}
396
397LOCAL(int)
398SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
399{
400 /* check if string matches the given pattern. returns -1 for
401 error, 0 for failure, and 1 for success */
402
403 SRE_CHAR* end = state->end;
404 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000405 int stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000406 int stackbase;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000407 int lastmark;
Guido van Rossumb700df92000-03-31 14:59:30 +0000408 int i, count;
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000409 SRE_STACK* sp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000410
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000411 /* FIXME: this is a hack! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000412 void* mark_copy[SRE_MARK_SIZE];
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000413 void* mark = NULL;
414
415 TRACE(("%8d: enter\n", PTR(ptr)));
416
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000417 if (pattern[0] == SRE_OP_INFO) {
418 /* optimization info block */
419 /* args: <1=skip> <2=flags> <3=min> ... */
420 if (pattern[3] && (end - ptr) < pattern[3]) {
421 TRACE(("reject (got %d chars, need %d)\n",
422 (end - ptr), pattern[3]));
423 return 0;
424 }
425 pattern += pattern[1] + 1;
426 }
427
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000428 stackbase = stack = state->stackbase;
429 lastmark = state->lastmark;
430
431 retry:
Guido van Rossumb700df92000-03-31 14:59:30 +0000432
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000433 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000434
435 switch (*pattern++) {
436
437 case SRE_OP_FAILURE:
438 /* immediate failure */
439 TRACE(("%8d: failure\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000440 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000441
442 case SRE_OP_SUCCESS:
443 /* end of pattern */
444 TRACE(("%8d: success\n", PTR(ptr)));
445 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000446 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000447
448 case SRE_OP_AT:
449 /* match at given position */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000450 /* args: <at> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000451 TRACE(("%8d: position %d\n", PTR(ptr), *pattern));
Guido van Rossumb700df92000-03-31 14:59:30 +0000452 if (!SRE_AT(state, ptr, *pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000453 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000454 pattern++;
455 break;
456
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000457 case SRE_OP_CATEGORY:
458 /* match at given category */
459 /* args: <category> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000460 TRACE(("%8d: category %d [category %d]\n", PTR(ptr),
461 *ptr, *pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000462 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
463 goto failure;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000464 TRACE(("%8d: category ok\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000465 pattern++;
466 ptr++;
467 break;
468
Guido van Rossumb700df92000-03-31 14:59:30 +0000469 case SRE_OP_LITERAL:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000470 /* match literal string */
Guido van Rossumb700df92000-03-31 14:59:30 +0000471 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000472 TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));
473 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000474 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000475 pattern++;
476 ptr++;
477 break;
478
479 case SRE_OP_NOT_LITERAL:
480 /* match anything that is not literal character */
481 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000482 TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));
483 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000484 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000485 pattern++;
486 ptr++;
487 break;
488
489 case SRE_OP_ANY:
490 /* match anything */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000491 TRACE(("%8d: anything\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000492 if (ptr >= end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000493 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000494 ptr++;
495 break;
496
497 case SRE_OP_IN:
498 /* match set member (or non_member) */
499 /* args: <skip> <set> */
500 TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
501 if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000502 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000503 pattern += pattern[0];
504 ptr++;
505 break;
506
507 case SRE_OP_GROUP:
508 /* match backreference */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000509 TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000510 i = pattern[0];
511 {
Guido van Rossumb700df92000-03-31 14:59:30 +0000512 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
513 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
514 if (!p || !e || e < p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000515 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000516 while (p < e) {
517 if (ptr >= end || *ptr != *p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000518 goto failure;
519 p++; ptr++;
520 }
521 }
522 pattern++;
523 break;
524
525 case SRE_OP_GROUP_IGNORE:
526 /* match backreference */
527 TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
528 i = pattern[0];
529 {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000530 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
531 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000532 if (!p || !e || e < p)
533 goto failure;
534 while (p < e) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000535 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000536 state->lower(*ptr) != state->lower(*p))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000537 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000538 p++; ptr++;
539 }
540 }
541 pattern++;
542 break;
543
544 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000545 TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000546 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000547 state->lower(*ptr) != state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000548 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000549 pattern++;
550 ptr++;
551 break;
552
553 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000554 TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000555 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000556 state->lower(*ptr) == state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000557 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000558 pattern++;
559 ptr++;
560 break;
561
562 case SRE_OP_IN_IGNORE:
563 TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
564 if (ptr >= end
Fredrik Lundh0640e112000-06-30 13:55:15 +0000565 || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr)))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000566 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000567 pattern += pattern[0];
568 ptr++;
569 break;
570
571 case SRE_OP_MARK:
572 /* set mark */
573 /* args: <mark> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000574 TRACE(("%8d: set mark %d\n", PTR(ptr), pattern[0]));
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000575 if (state->lastmark < pattern[0]+1)
576 state->lastmark = pattern[0]+1;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000577 if (!mark) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000578 mark = mark_copy;
579 memcpy(mark, state->mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000580 }
581 state->mark[pattern[0]] = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000582 pattern++;
583 break;
584
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000585 case SRE_OP_INDEX:
586 /* set index */
587 /* args: <index> */
588 TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
Fredrik Lundh6f013982000-07-03 18:44:21 +0000589 state->lastindex = pattern[0];
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000590 pattern++;
591 break;
592
Guido van Rossumb700df92000-03-31 14:59:30 +0000593 case SRE_OP_JUMP:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000594 case SRE_OP_INFO:
Guido van Rossumb700df92000-03-31 14:59:30 +0000595 /* jump forward */
596 /* args: <skip> */
597 TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
598 pattern += pattern[0];
599 break;
600
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000601 case SRE_OP_ASSERT:
602 /* assert subpattern */
Fredrik Lundh6f013982000-07-03 18:44:21 +0000603 /* args: <skip> <back> <pattern> */
604 TRACE(("%8d: assert subpattern %d\n", PTR(ptr), pattern[1]));
605 state->ptr = ptr - pattern[1];
606 if (state->ptr < state->beginning)
607 goto failure;
608 i = SRE_MATCH(state, pattern + 2);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000609 if (i < 0)
610 return i;
611 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000612 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000613 pattern += pattern[0];
Guido van Rossumb700df92000-03-31 14:59:30 +0000614 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000615
616 case SRE_OP_ASSERT_NOT:
617 /* assert not subpattern */
618 /* args: <skip> <pattern> */
Fredrik Lundh6f013982000-07-03 18:44:21 +0000619 TRACE(("%8d: assert not subpattern %d\n", PTR(ptr), pattern[1]));
620 state->ptr = ptr - pattern[1];
621 if (state->ptr < state->beginning)
622 goto failure;
623 i = SRE_MATCH(state, pattern + 2);
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000624 if (i < 0)
625 return i;
626 if (i)
627 goto failure;
628 pattern += pattern[0];
629 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000630
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000631#if 0
Guido van Rossumb700df92000-03-31 14:59:30 +0000632 case SRE_OP_MAX_REPEAT_ONE:
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000633 /* match repeated sequence (maximizing regexp) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000634
635 /* this operator only works if the repeated item is
636 exactly one character wide, and we're not already
637 collecting backtracking points. for other cases,
638 use the MAX_REPEAT operator instead */
639
Guido van Rossumb700df92000-03-31 14:59:30 +0000640 /* args: <skip> <min> <max> <step> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000641 TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
642 pattern[1], pattern[2]));
643
644 count = 0;
645
646 if (pattern[3] == SRE_OP_ANY) {
647 /* repeated wildcard. skip to the end of the target
648 string, and backtrack from there */
649 /* FIXME: must look for line endings */
650 if (ptr + pattern[1] > end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000651 goto failure; /* cannot match */
Guido van Rossumb700df92000-03-31 14:59:30 +0000652 count = pattern[2];
653 if (count > end - ptr)
654 count = end - ptr;
655 ptr += count;
656
657 } else if (pattern[3] == SRE_OP_LITERAL) {
658 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000659 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000660 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000661 if (ptr >= end || (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000662 break;
663 ptr++;
664 count++;
665 }
666
667 } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
668 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000669 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000670 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000671 if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000672 break;
673 ptr++;
674 count++;
675 }
676
677 } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
678 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000679 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000680 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000681 if (ptr >= end || (SRE_CODE) ptr[0] == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000682 break;
683 ptr++;
684 count++;
685 }
686
687 } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
688 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000689 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000690 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000691 if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000692 break;
693 ptr++;
694 count++;
695 }
696
697 } else if (pattern[3] == SRE_OP_IN) {
698 /* repeated set */
699 while (count < (int) pattern[2]) {
700 if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
701 break;
702 ptr++;
703 count++;
704 }
705
706 } else {
707 /* repeated single character pattern */
708 state->ptr = ptr;
709 while (count < (int) pattern[2]) {
710 i = SRE_MATCH(state, pattern + 3);
711 if (i < 0)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000712 return i;
713 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000714 break;
715 count++;
716 }
717 state->ptr = ptr;
718 ptr += count;
719 }
720
721 /* when we arrive here, count contains the number of
722 matches, and ptr points to the tail of the target
723 string. check if the rest of the pattern matches, and
724 backtrack if not. */
725
Guido van Rossumb700df92000-03-31 14:59:30 +0000726 TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
727
728 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000729 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000730
731 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
732 /* tail is empty. we're finished */
733 TRACE(("%8d: tail is empty\n", PTR(ptr)));
734 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000735 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000736
737 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000738 /* tail starts with a literal. skip positions where
739 the rest of the pattern cannot possibly match */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000740 SRE_CODE chr = pattern[pattern[0]+1];
Guido van Rossumb700df92000-03-31 14:59:30 +0000741 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
742 for (;;) {
743 TRACE(("%8d: scan for tail match\n", PTR(ptr)));
744 while (count >= (int) pattern[1] &&
745 (ptr >= end || *ptr != chr)) {
746 ptr--;
747 count--;
748 }
749 TRACE(("%8d: check tail\n", PTR(ptr)));
750 if (count < (int) pattern[1])
751 break;
752 state->ptr = ptr;
753 i = SRE_MATCH(state, pattern + pattern[0]);
754 if (i > 0) {
755 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000756 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000757 }
758 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
759 ptr--;
760 count--;
761 }
762
763 } else {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000764 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +0000765 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
766 while (count >= (int) pattern[1]) {
767 state->ptr = ptr;
768 i = SRE_MATCH(state, pattern + pattern[0]);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000769 if (i < 0)
770 return i;
771 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000772 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000773 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000774 }
775 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
776 ptr--;
777 count--;
778 }
779 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000780 goto failure;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000781#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000782
783 case SRE_OP_MAX_REPEAT:
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000784 /* match repeated sequence (maximizing regexp) */
785 /* args: <skip> <1=min> <2=max> <3=save> <4=item> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000786
787 TRACE(("%8d: max repeat (%d %d)\n", PTR(ptr),
Guido van Rossumb700df92000-03-31 14:59:30 +0000788 pattern[1], pattern[2]));
789
790 count = 0;
791 state->ptr = ptr;
792
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000793 /* match minimum number of items */
794 while (count < (int) pattern[1]) {
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000795 i = SRE_MATCH(state, pattern + 4);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000796 if (i < 0)
797 return i;
798 if (!i)
799 goto failure;
800 if (state->ptr == ptr) {
801 /* if the match was successful but empty, set the
802 count to max and terminate the scanning loop */
803 count = (int) pattern[2];
804 break;
805 }
806 count++;
807 ptr = state->ptr;
808 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000809
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000810 TRACE(("%8d: found %d leading items\n", PTR(ptr), count));
Guido van Rossumb700df92000-03-31 14:59:30 +0000811
812 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000813 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000814
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000815 /* match maximum number of items, pushing alternate end
816 points to the stack */
Guido van Rossumb700df92000-03-31 14:59:30 +0000817
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000818 while (pattern[2] == 65535 || count < (int) pattern[2]) {
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000819 void *mark0, *mark1;
820 if (pattern[3] != 65535) {
821 mark0 = state->mark[pattern[3]];
822 mark1 = state->mark[pattern[3]+1];
823 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000824 state->stackbase = stack;
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000825 i = SRE_MATCH(state, pattern + 4);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000826 state->stackbase = stackbase; /* rewind */
827 if (i < 0)
828 return i;
829 if (!i)
830 break;
831 if (state->ptr == ptr) {
832 count = (int) pattern[2];
833 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000834 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000835 /* this position was valid; add it to the retry
836 stack */
837 if (stack >= state->stacksize) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000838 i = stack_extend(state, stack + 1,
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000839 stackbase + pattern[2]);
840 if (i < 0)
841 return i; /* out of memory */
842 }
843 TRACE(("%8d: stack[%d] = %d\n", PTR(ptr), stack, PTR(ptr)));
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000844 sp = state->stack + stack;
845 sp->ptr = ptr;
846 sp->pattern = pattern + pattern[0];
847 sp->mark = pattern[3];
848 if (pattern[3] != 65535) {
849 sp->mark0 = mark0;
850 sp->mark1 = mark1;
851 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000852 stack++;
853 /* move forward */
854 ptr = state->ptr;
855 count++;
Guido van Rossumb700df92000-03-31 14:59:30 +0000856 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000857
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000858 /* when we get here, count is the number of successful
859 matches, and ptr points to the tail. */
Guido van Rossumb700df92000-03-31 14:59:30 +0000860
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000861 TRACE(("%8d: skip +%d\n", PTR(ptr), pattern[0]));
862
863 pattern += pattern[0];
864 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000865
866 case SRE_OP_MIN_REPEAT:
867 /* match repeated sequence (minimizing regexp) */
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000868 /* args: <skip> <1=min> <2=max> <3=save> <4=item> */
869
Guido van Rossumb700df92000-03-31 14:59:30 +0000870 TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
871 pattern[1], pattern[2]));
872 count = 0;
873 state->ptr = ptr;
874 /* match minimum number of items */
875 while (count < (int) pattern[1]) {
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000876 i = SRE_MATCH(state, pattern + 4);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000877 if (i < 0)
878 return i;
879 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000880 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000881 count++;
882 }
883 /* move forward until the tail matches. */
884 while (count <= (int) pattern[2]) {
885 ptr = state->ptr;
886 i = SRE_MATCH(state, pattern + pattern[0]);
887 if (i > 0) {
888 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000889 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000890 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000891 state->ptr = ptr; /* backtrack */
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000892 i = SRE_MATCH(state, pattern + 4);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000893 if (i < 0)
894 return i;
895 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000896 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000897 count++;
898 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000899 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000900
Guido van Rossumb700df92000-03-31 14:59:30 +0000901 case SRE_OP_BRANCH:
902 /* match one of several subpatterns */
903 /* format: <branch> <size> <head> ... <null> <tail> */
904 TRACE(("%8d: branch\n", PTR(ptr)));
905 while (*pattern) {
906 if (pattern[1] != SRE_OP_LITERAL ||
Fredrik Lundh0640e112000-06-30 13:55:15 +0000907 (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000908 TRACE(("%8d: branch check\n", PTR(ptr)));
909 state->ptr = ptr;
910 i = SRE_MATCH(state, pattern + 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000911 if (i < 0)
912 return i;
913 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000914 TRACE(("%8d: branch succeeded\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000915 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000916 }
917 }
918 pattern += *pattern;
919 }
920 TRACE(("%8d: branch failed\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000921 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000922
923 case SRE_OP_REPEAT:
924 /* TEMPLATE: match repeated sequence (no backtracking) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000925 /* args: <skip> <min> <max> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000926 TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
927 count = 0;
928 state->ptr = ptr;
929 while (count < (int) pattern[2]) {
930 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000931 if (i < 0)
932 return i;
933 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000934 break;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000935 if (state->ptr == ptr) {
936 count = (int) pattern[2];
937 break;
938 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000939 count++;
940 }
941 if (count <= (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000942 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000943 TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
944 pattern += pattern[0];
945 ptr = state->ptr;
946 break;
947
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000948 default:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000949 TRACE(("%8d: unknown opcode %d\n", PTR(ptr), pattern[-1]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000950 return SRE_ERROR_ILLEGAL;
951 }
952 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000953
954 failure:
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000955 TRACE(("%8d: leave (failure)\n", PTR(ptr)));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000956 if (stack-- > stackbase) {
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000957 sp = state->stack + stack;
958 ptr = sp->ptr;
959 pattern = sp->pattern;
960 if (sp->mark != 65535) {
961 state->mark[sp->mark] = sp->mark0;
962 state->mark[sp->mark+1] = sp->mark1;
963 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000964 TRACE(("%8d: retry (%d)\n", PTR(ptr), stack));
965 goto retry;
966 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000967 state->lastmark = lastmark;
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000968 state->stackbase = stackbase;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000969 if (mark)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000970 memcpy(state->mark, mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000971 return 0;
972
973 success:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000974 TRACE(("%8d: leave (success)\n", PTR(ptr)));
975 state->stackbase = stackbase;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000976 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000977}
978
979LOCAL(int)
980SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
981{
982 SRE_CHAR* ptr = state->start;
983 SRE_CHAR* end = state->end;
984 int status = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +0000985 int prefix_len;
986 SRE_CODE* prefix = NULL;
987 SRE_CODE* charset = NULL;
988 SRE_CODE* overlap = NULL;
989 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000990
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000991 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000992 /* optimization info block */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000993 /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
994
995 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000996
997 if (pattern[3] > 0) {
998 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +0000999 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001000 end -= pattern[3]-1;
1001 if (end <= ptr)
1002 end = ptr+1;
1003 }
1004
Fredrik Lundh3562f112000-07-02 12:00:07 +00001005 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001006 /* pattern starts with a known prefix */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001007 prefix_len = pattern[5];
1008 prefix = pattern + 6;
1009 overlap = prefix + prefix_len - 1;
1010 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001011 /* pattern starts with a character from a known set */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001012 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001013
1014 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001015 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001016
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001017#if defined(USE_FAST_SEARCH)
Fredrik Lundh3562f112000-07-02 12:00:07 +00001018 if (prefix && overlap && prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001019 /* pattern starts with a known prefix. use the overlap
1020 table to skip forward as fast as we possibly can */
1021 int i = 0;
1022 end = state->end;
1023 while (ptr < end) {
1024 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001025 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001026 if (!i)
1027 break;
1028 else
1029 i = overlap[i];
1030 } else {
1031 if (++i == prefix_len) {
1032 /* found a potential match */
1033 TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
1034 state->start = ptr - prefix_len + 1;
1035 state->ptr = ptr + 1;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001036 if (flags & SRE_INFO_LITERAL)
1037 return 1; /* we got all of it */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001038 status = SRE_MATCH(state, pattern + 2*prefix_len);
1039 if (status != 0)
1040 return status;
1041 /* close but no cigar -- try again */
1042 i = overlap[i];
1043 }
1044 break;
1045 }
1046
1047 }
1048 ptr++;
1049 }
1050 return 0;
1051 }
1052#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001053
Fredrik Lundh3562f112000-07-02 12:00:07 +00001054 if (pattern[0] == SRE_OP_LITERAL) {
1055 /* pattern starts with a literal character. this is used
1056 for short prefixes, and if fast search is disabled */
Fredrik Lundh0640e112000-06-30 13:55:15 +00001057 SRE_CODE chr = pattern[1];
Guido van Rossumb700df92000-03-31 14:59:30 +00001058 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001059 while (ptr < end && (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +00001060 ptr++;
1061 if (ptr == end)
1062 return 0;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001063 TRACE(("%8d: === SEARCH === literal\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001064 state->start = ptr;
1065 state->ptr = ++ptr;
1066 status = SRE_MATCH(state, pattern + 2);
1067 if (status != 0)
1068 break;
1069 }
Fredrik Lundh3562f112000-07-02 12:00:07 +00001070 } else if (charset) {
1071 /* pattern starts with a character from a known set */
1072 for (;;) {
1073 while (ptr < end && !SRE_MEMBER(charset, ptr[0]))
1074 ptr++;
1075 if (ptr == end)
1076 return 0;
1077 TRACE(("%8d: === SEARCH === charset\n", PTR(ptr)));
1078 state->start = ptr;
1079 state->ptr = ptr;
1080 status = SRE_MATCH(state, pattern);
1081 if (status != 0)
1082 break;
1083 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001084 } else
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001085 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +00001086 while (ptr <= end) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001087 TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001088 state->start = state->ptr = ptr++;
1089 status = SRE_MATCH(state, pattern);
1090 if (status != 0)
1091 break;
1092 }
1093
1094 return status;
1095}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001096
Guido van Rossumb700df92000-03-31 14:59:30 +00001097
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001098#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001099
1100/* -------------------------------------------------------------------- */
1101/* factories and destructors */
1102
1103/* see sre.h for object declarations */
1104
1105staticforward PyTypeObject Pattern_Type;
1106staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001107staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001108
1109static PyObject *
1110_compile(PyObject* self_, PyObject* args)
1111{
1112 /* "compile" pattern descriptor to pattern object */
1113
1114 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001115 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001116
1117 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001118 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001119 PyObject* code;
1120 int groups = 0;
1121 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001122 PyObject* indexgroup = NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001123 if (!PyArg_ParseTuple(args, "OiO|iOO", &pattern, &flags, &code,
Fredrik Lundhc2301732000-07-02 22:25:39 +00001124 &groups, &groupindex, &indexgroup))
Guido van Rossumb700df92000-03-31 14:59:30 +00001125 return NULL;
1126
Fredrik Lundh6f013982000-07-03 18:44:21 +00001127 code = PySequence_Fast(code, "code argument must be a sequence");
1128 if (!code)
Guido van Rossumb700df92000-03-31 14:59:30 +00001129 return NULL;
1130
Fredrik Lundh6f013982000-07-03 18:44:21 +00001131 n = PySequence_Length(code);
1132
1133 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n);
1134 if (!self) {
1135 Py_DECREF(code);
1136 return NULL;
1137 }
1138
1139 for (i = 0; i < n; i++) {
1140 PyObject *o = PySequence_Fast_GET_ITEM(code, i);
1141 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1142 }
1143
1144 Py_DECREF(code);
1145
1146 if (PyErr_Occurred())
1147 return NULL;
1148
Guido van Rossumb700df92000-03-31 14:59:30 +00001149 Py_INCREF(pattern);
1150 self->pattern = pattern;
1151
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001152 self->flags = flags;
1153
Guido van Rossumb700df92000-03-31 14:59:30 +00001154 self->groups = groups;
1155
1156 Py_XINCREF(groupindex);
1157 self->groupindex = groupindex;
1158
Fredrik Lundhc2301732000-07-02 22:25:39 +00001159 Py_XINCREF(indexgroup);
1160 self->indexgroup = indexgroup;
1161
Guido van Rossumb700df92000-03-31 14:59:30 +00001162 return (PyObject*) self;
1163}
1164
1165static PyObject *
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001166sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001167{
1168 return Py_BuildValue("i", sizeof(SRE_CODE));
1169}
1170
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001171static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001172sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001173{
1174 int character, flags;
1175 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1176 return NULL;
1177 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001178 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001179#if defined(HAVE_UNICODE)
1180 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001181 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001182#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001183 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001184}
1185
Guido van Rossumb700df92000-03-31 14:59:30 +00001186LOCAL(PyObject*)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001187state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001188{
1189 /* prepare state object */
1190
1191 PyBufferProcs *buffer;
1192 int i, count;
1193 void* ptr;
1194
1195 PyObject* string;
1196 int start = 0;
1197 int end = INT_MAX;
1198 if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
1199 return NULL;
1200
1201 /* get pointer to string buffer */
1202 buffer = string->ob_type->tp_as_buffer;
1203 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1204 buffer->bf_getsegcount(string, NULL) != 1) {
1205 PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
1206 return NULL;
1207 }
1208
1209 /* determine buffer size */
1210 count = buffer->bf_getreadbuffer(string, 0, &ptr);
1211 if (count < 0) {
1212 /* sanity check */
1213 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1214 return NULL;
1215 }
1216
1217 /* determine character size */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001218#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001219 state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001220#else
1221 state->charsize = 1;
1222#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001223
1224 count /= state->charsize;
1225
1226 /* adjust boundaries */
1227 if (start < 0)
1228 start = 0;
1229 else if (start > count)
1230 start = count;
1231
1232 if (end < 0)
1233 end = 0;
1234 else if (end > count)
1235 end = count;
1236
1237 state->beginning = ptr;
1238
1239 state->start = (void*) ((char*) ptr + start * state->charsize);
1240 state->end = (void*) ((char*) ptr + end * state->charsize);
1241
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001242 state->lastmark = 0;
1243
Guido van Rossumb700df92000-03-31 14:59:30 +00001244 /* FIXME: dynamic! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001245 for (i = 0; i < SRE_MARK_SIZE; i++)
Guido van Rossumb700df92000-03-31 14:59:30 +00001246 state->mark[i] = NULL;
1247
Fredrik Lundh6f013982000-07-03 18:44:21 +00001248 state->lastindex = -1;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001249
Guido van Rossumb700df92000-03-31 14:59:30 +00001250 state->stack = NULL;
1251 state->stackbase = 0;
1252 state->stacksize = 0;
1253
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001254 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001255 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001256#if defined(HAVE_UNICODE)
1257 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001258 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001259#endif
1260 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001261 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001262
Guido van Rossumb700df92000-03-31 14:59:30 +00001263 return string;
1264}
1265
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001266LOCAL(void)
1267state_fini(SRE_STATE* state)
1268{
1269 stack_free(state);
1270}
1271
1272LOCAL(PyObject*)
1273state_getslice(SRE_STATE* state, int index, PyObject* string)
1274{
1275 index = (index - 1) * 2;
1276
1277 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
1278 Py_INCREF(Py_None);
1279 return Py_None;
1280 }
1281
1282 return PySequence_GetSlice(
1283 string,
1284 ((char*)state->mark[index] - (char*)state->beginning) /
1285 state->charsize,
1286 ((char*)state->mark[index+1] - (char*)state->beginning) /
1287 state->charsize
1288 );
1289}
1290
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001291static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001292pattern_new_match(PatternObject* pattern, SRE_STATE* state,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001293 PyObject* string, int status)
1294{
1295 /* create match object (from state object) */
1296
1297 MatchObject* match;
1298 int i, j;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001299 char* base;
1300 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001301
1302 if (status > 0) {
1303
1304 /* create match object (with room for extra group marks) */
Fredrik Lundh6f013982000-07-03 18:44:21 +00001305 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
1306 2*(pattern->groups+1));
1307 if (!match)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001308 return NULL;
1309
1310 Py_INCREF(pattern);
1311 match->pattern = pattern;
1312
1313 Py_INCREF(string);
1314 match->string = string;
1315
1316 match->groups = pattern->groups+1;
1317
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001318 base = (char*) state->beginning;
1319 n = state->charsize;
1320
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001321 /* group zero */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001322 match->mark[0] = ((char*) state->start - base) / n;
1323 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001324
1325 /* fill in the rest of the groups */
1326 for (i = j = 0; i < pattern->groups; i++, j+=2)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001327 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1328 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1329 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001330 } else
1331 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1332
Fredrik Lundh6f013982000-07-03 18:44:21 +00001333 match->lastindex = state->lastindex;
1334
1335 match->pos = ((char*) state->start - base) / n;
1336 match->endpos = ((char*) state->end - base) / n;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001337
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001338 return (PyObject*) match;
1339
1340 } else if (status < 0) {
1341
1342 /* internal error */
1343 PyErr_SetString(
1344 PyExc_RuntimeError, "internal error in regular expression engine"
1345 );
1346 return NULL;
1347
1348 }
1349
1350 Py_INCREF(Py_None);
1351 return Py_None;
1352}
1353
1354static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001355pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001356{
1357 /* create search state object */
1358
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001359 ScannerObject* self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001360 PyObject* string;
1361
1362 /* create match object (with room for extra group marks) */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001363 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001364 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001365 return NULL;
1366
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001367 string = state_init(&self->state, pattern, args);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001368 if (!string) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00001369 PyObject_Del(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001370 return NULL;
1371 }
1372
1373 Py_INCREF(pattern);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001374 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001375
1376 Py_INCREF(string);
1377 self->string = string;
1378
1379 return (PyObject*) self;
1380}
1381
Guido van Rossumb700df92000-03-31 14:59:30 +00001382static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001383pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001384{
Guido van Rossumb700df92000-03-31 14:59:30 +00001385 Py_XDECREF(self->pattern);
1386 Py_XDECREF(self->groupindex);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001387 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001388}
1389
1390static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001391pattern_match(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001392{
1393 SRE_STATE state;
1394 PyObject* string;
1395 int status;
1396
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001397 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001398 if (!string)
1399 return NULL;
1400
1401 state.ptr = state.start;
1402
1403 if (state.charsize == 1) {
1404 status = sre_match(&state, PatternObject_GetCode(self));
1405 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001406#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001407 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001408#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001409 }
1410
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001411 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001412
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001413 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001414}
1415
1416static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001417pattern_search(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001418{
1419 SRE_STATE state;
1420 PyObject* string;
1421 int status;
1422
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001423 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001424 if (!string)
1425 return NULL;
1426
1427 if (state.charsize == 1) {
1428 status = sre_search(&state, PatternObject_GetCode(self));
1429 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001430#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001431 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001432#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001433 }
1434
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001435 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001436
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001437 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001438}
1439
1440static PyObject*
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001441call(char* function, PyObject* args)
1442{
1443 PyObject* name;
1444 PyObject* module;
1445 PyObject* func;
1446 PyObject* result;
1447
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001448 name = PyString_FromString(MODULE);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001449 if (!name)
1450 return NULL;
1451 module = PyImport_Import(name);
1452 Py_DECREF(name);
1453 if (!module)
1454 return NULL;
1455 func = PyObject_GetAttrString(module, function);
1456 Py_DECREF(module);
1457 if (!func)
1458 return NULL;
1459 result = PyObject_CallObject(func, args);
1460 Py_DECREF(func);
1461 Py_DECREF(args);
1462 return result;
1463}
1464
1465static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001466pattern_sub(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001467{
1468 PyObject* template;
1469 PyObject* string;
1470 PyObject* count;
1471 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1472 return NULL;
1473
1474 /* delegate to Python code */
1475 return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
1476}
1477
1478static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001479pattern_subn(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001480{
1481 PyObject* template;
1482 PyObject* string;
1483 PyObject* count;
1484 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1485 return NULL;
1486
1487 /* delegate to Python code */
1488 return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
1489}
1490
1491static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001492pattern_split(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001493{
1494 PyObject* string;
1495 PyObject* maxsplit;
1496 if (!PyArg_ParseTuple(args, "OO", &string, &maxsplit))
1497 return NULL;
1498
1499 /* delegate to Python code */
1500 return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
1501}
1502
1503static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001504pattern_findall(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001505{
Guido van Rossumb700df92000-03-31 14:59:30 +00001506 SRE_STATE state;
1507 PyObject* string;
1508 PyObject* list;
1509 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001510 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001511
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001512 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001513 if (!string)
1514 return NULL;
1515
1516 list = PyList_New(0);
1517
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001518 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001519
1520 PyObject* item;
1521
1522 state.ptr = state.start;
1523
1524 if (state.charsize == 1) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001525 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +00001526 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001527#if defined(HAVE_UNICODE)
1528 status = sre_usearch(&state, PatternObject_GetCode(self));
1529#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001530 }
1531
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001532 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001533
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001534 /* don't bother to build a match object */
1535 switch (self->groups) {
1536 case 0:
1537 item = PySequence_GetSlice(
1538 string,
1539 ((char*) state.start - (char*) state.beginning) /
1540 state.charsize,
1541 ((char*) state.ptr - (char*) state.beginning) /
1542 state.charsize);
1543 if (!item)
1544 goto error;
1545 break;
1546 case 1:
1547 item = state_getslice(&state, 1, string);
1548 if (!item)
1549 goto error;
1550 break;
1551 default:
1552 item = PyTuple_New(self->groups);
1553 if (!item)
1554 goto error;
1555 for (i = 0; i < self->groups; i++) {
1556 PyObject* o = state_getslice(&state, i+1, string);
1557 if (!o) {
1558 Py_DECREF(item);
1559 goto error;
1560 }
1561 PyTuple_SET_ITEM(item, i, o);
1562 }
1563 break;
1564 }
1565
1566 if (PyList_Append(list, item) < 0) {
1567 Py_DECREF(item);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001568 goto error;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001569 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001570
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001571 if (state.ptr == state.start)
1572 state.start = (void*) ((char*) state.ptr + state.charsize);
1573 else
1574 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001575
1576 } else {
1577
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001578 if (status == 0)
1579 break;
1580
Guido van Rossumb700df92000-03-31 14:59:30 +00001581 /* internal error */
1582 PyErr_SetString(
1583 PyExc_RuntimeError,
1584 "internal error in regular expression engine"
1585 );
1586 goto error;
1587
1588 }
1589 }
1590
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001591 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001592 return list;
1593
1594error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001595 Py_DECREF(list);
1596 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001597 return NULL;
1598
1599}
1600
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001601static PyMethodDef pattern_methods[] = {
1602 {"match", (PyCFunction) pattern_match, 1},
1603 {"search", (PyCFunction) pattern_search, 1},
1604 {"sub", (PyCFunction) pattern_sub, 1},
1605 {"subn", (PyCFunction) pattern_subn, 1},
1606 {"split", (PyCFunction) pattern_split, 1},
1607 {"findall", (PyCFunction) pattern_findall, 1},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001608 /* experimental */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001609 {"scanner", (PyCFunction) pattern_scanner, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001610 {NULL, NULL}
1611};
1612
1613static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001614pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001615{
1616 PyObject* res;
1617
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001618 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001619
1620 if (res)
1621 return res;
1622
1623 PyErr_Clear();
1624
1625 /* attributes */
1626 if (!strcmp(name, "pattern")) {
1627 Py_INCREF(self->pattern);
1628 return self->pattern;
1629 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001630
1631 if (!strcmp(name, "flags"))
1632 return Py_BuildValue("i", self->flags);
1633
Fredrik Lundh01016fe2000-06-30 00:27:46 +00001634 if (!strcmp(name, "groups"))
1635 return Py_BuildValue("i", self->groups);
1636
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001637 if (!strcmp(name, "groupindex") && self->groupindex) {
1638 Py_INCREF(self->groupindex);
1639 return self->groupindex;
1640 }
1641
Guido van Rossumb700df92000-03-31 14:59:30 +00001642 PyErr_SetString(PyExc_AttributeError, name);
1643 return NULL;
1644}
1645
1646statichere PyTypeObject Pattern_Type = {
1647 PyObject_HEAD_INIT(NULL)
Fredrik Lundh6f013982000-07-03 18:44:21 +00001648 0, "SRE_Pattern",
1649 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001650 (destructor)pattern_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001651 0, /*tp_print*/
Fredrik Lundh6f013982000-07-03 18:44:21 +00001652 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001653};
1654
1655/* -------------------------------------------------------------------- */
1656/* match methods */
1657
1658static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001659match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001660{
1661 Py_XDECREF(self->string);
1662 Py_DECREF(self->pattern);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001663 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001664}
1665
1666static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001667match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001668{
1669 if (index < 0 || index >= self->groups) {
1670 /* raise IndexError if we were given a bad group number */
1671 PyErr_SetString(
1672 PyExc_IndexError,
1673 "no such group"
1674 );
1675 return NULL;
1676 }
1677
Fredrik Lundh6f013982000-07-03 18:44:21 +00001678 index *= 2;
1679
1680 if (self->string == Py_None || self->mark[index] < 0) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001681 /* return default value if the string or group is undefined */
1682 Py_INCREF(def);
1683 return def;
Guido van Rossumb700df92000-03-31 14:59:30 +00001684 }
1685
1686 return PySequence_GetSlice(
Fredrik Lundh6f013982000-07-03 18:44:21 +00001687 self->string, self->mark[index], self->mark[index+1]
Guido van Rossumb700df92000-03-31 14:59:30 +00001688 );
1689}
1690
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001691static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001692match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001693{
Fredrik Lundh6f013982000-07-03 18:44:21 +00001694 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001695
1696 if (PyInt_Check(index))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001697 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001698
Fredrik Lundh6f013982000-07-03 18:44:21 +00001699 i = -1;
1700
1701 if (self->pattern->groupindex) {
1702 index = PyObject_GetItem(self->pattern->groupindex, index);
1703 if (index) {
1704 if (PyInt_Check(index))
1705 i = (int) PyInt_AS_LONG(index);
1706 Py_DECREF(index);
1707 } else
1708 PyErr_Clear();
1709 }
1710
1711 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001712}
1713
1714static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001715match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001716{
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001717 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001718}
1719
1720static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001721match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001722{
1723 PyObject* result;
1724 int i, size;
1725
1726 size = PyTuple_GET_SIZE(args);
1727
1728 switch (size) {
1729 case 0:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001730 result = match_getslice(self, Py_False, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001731 break;
1732 case 1:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001733 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001734 break;
1735 default:
1736 /* fetch multiple items */
1737 result = PyTuple_New(size);
1738 if (!result)
1739 return NULL;
1740 for (i = 0; i < size; i++) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001741 PyObject* item = match_getslice(
1742 self, PyTuple_GET_ITEM(args, i), Py_None
1743 );
Guido van Rossumb700df92000-03-31 14:59:30 +00001744 if (!item) {
1745 Py_DECREF(result);
1746 return NULL;
1747 }
1748 PyTuple_SET_ITEM(result, i, item);
1749 }
1750 break;
1751 }
1752 return result;
1753}
1754
1755static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001756match_groups(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001757{
1758 PyObject* result;
1759 int index;
1760
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001761 PyObject* def = Py_None;
1762 if (!PyArg_ParseTuple(args, "|O", &def))
1763 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001764
Guido van Rossumb700df92000-03-31 14:59:30 +00001765 result = PyTuple_New(self->groups-1);
1766 if (!result)
1767 return NULL;
1768
1769 for (index = 1; index < self->groups; index++) {
1770 PyObject* item;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001771 item = match_getslice_by_index(self, index, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001772 if (!item) {
1773 Py_DECREF(result);
1774 return NULL;
1775 }
1776 PyTuple_SET_ITEM(result, index-1, item);
1777 }
1778
1779 return result;
1780}
1781
1782static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001783match_groupdict(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001784{
1785 PyObject* result;
1786 PyObject* keys;
1787 int index;
1788
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001789 PyObject* def = Py_None;
1790 if (!PyArg_ParseTuple(args, "|O", &def))
1791 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001792
Guido van Rossumb700df92000-03-31 14:59:30 +00001793 result = PyDict_New();
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001794 if (!result || !self->pattern->groupindex)
Guido van Rossumb700df92000-03-31 14:59:30 +00001795 return result;
1796
1797 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001798 if (!keys) {
1799 Py_DECREF(result);
Guido van Rossumb700df92000-03-31 14:59:30 +00001800 return NULL;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001801 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001802
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001803 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001804 PyObject* key;
1805 PyObject* item;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001806 key = PyList_GET_ITEM(keys, index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001807 if (!key) {
1808 Py_DECREF(keys);
1809 Py_DECREF(result);
1810 return NULL;
1811 }
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001812 item = match_getslice(self, key, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001813 if (!item) {
1814 Py_DECREF(key);
1815 Py_DECREF(keys);
1816 Py_DECREF(result);
1817 return NULL;
1818 }
1819 /* FIXME: <fl> this can fail, right? */
1820 PyDict_SetItem(result, key, item);
1821 }
1822
1823 Py_DECREF(keys);
1824
1825 return result;
1826}
1827
1828static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001829match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001830{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001831 int index;
1832
1833 PyObject* index_ = Py_False;
1834 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001835 return NULL;
1836
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001837 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001838
Guido van Rossumb700df92000-03-31 14:59:30 +00001839 if (index < 0 || index >= self->groups) {
1840 PyErr_SetString(
1841 PyExc_IndexError,
1842 "no such group"
1843 );
1844 return NULL;
1845 }
1846
1847 if (self->mark[index*2] < 0) {
1848 Py_INCREF(Py_None);
1849 return Py_None;
1850 }
1851
1852 return Py_BuildValue("i", self->mark[index*2]);
1853}
1854
1855static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001856match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001857{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001858 int index;
1859
1860 PyObject* index_ = Py_False;
1861 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001862 return NULL;
1863
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001864 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001865
Guido van Rossumb700df92000-03-31 14:59:30 +00001866 if (index < 0 || index >= self->groups) {
1867 PyErr_SetString(
1868 PyExc_IndexError,
1869 "no such group"
1870 );
1871 return NULL;
1872 }
1873
1874 if (self->mark[index*2] < 0) {
1875 Py_INCREF(Py_None);
1876 return Py_None;
1877 }
1878
1879 return Py_BuildValue("i", self->mark[index*2+1]);
1880}
1881
1882static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001883match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001884{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001885 int index;
1886
1887 PyObject* index_ = Py_False;
1888 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001889 return NULL;
1890
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001891 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001892
Guido van Rossumb700df92000-03-31 14:59:30 +00001893 if (index < 0 || index >= self->groups) {
1894 PyErr_SetString(
1895 PyExc_IndexError,
1896 "no such group"
1897 );
1898 return NULL;
1899 }
1900
1901 if (self->mark[index*2] < 0) {
1902 Py_INCREF(Py_None);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001903 Py_INCREF(Py_None);
1904 return Py_BuildValue("OO", Py_None, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001905 }
1906
1907 return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
1908}
1909
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001910static PyMethodDef match_methods[] = {
1911 {"group", (PyCFunction) match_group, 1},
1912 {"start", (PyCFunction) match_start, 1},
1913 {"end", (PyCFunction) match_end, 1},
1914 {"span", (PyCFunction) match_span, 1},
1915 {"groups", (PyCFunction) match_groups, 1},
1916 {"groupdict", (PyCFunction) match_groupdict, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001917 {NULL, NULL}
1918};
1919
1920static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001921match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001922{
1923 PyObject* res;
1924
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001925 res = Py_FindMethod(match_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001926 if (res)
1927 return res;
1928
1929 PyErr_Clear();
1930
Fredrik Lundhc2301732000-07-02 22:25:39 +00001931 if (!strcmp(name, "lastindex")) {
1932 /* experimental */
Fredrik Lundh6f013982000-07-03 18:44:21 +00001933 if (self->lastindex >= 0)
1934 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00001935 Py_INCREF(Py_None);
1936 return Py_None;
1937 }
1938
1939 if (!strcmp(name, "lastgroup")) {
1940 /* experimental */
Fredrik Lundh6f013982000-07-03 18:44:21 +00001941 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00001942 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00001943 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00001944 );
1945 if (result)
1946 return result;
1947 PyErr_Clear();
1948 }
1949 Py_INCREF(Py_None);
1950 return Py_None;
1951 }
1952
Guido van Rossumb700df92000-03-31 14:59:30 +00001953 if (!strcmp(name, "string")) {
1954 Py_INCREF(self->string);
1955 return self->string;
1956 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957
Guido van Rossumb700df92000-03-31 14:59:30 +00001958 if (!strcmp(name, "re")) {
1959 Py_INCREF(self->pattern);
1960 return (PyObject*) self->pattern;
1961 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001962
Guido van Rossumb700df92000-03-31 14:59:30 +00001963 if (!strcmp(name, "pos"))
Fredrik Lundh6f013982000-07-03 18:44:21 +00001964 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001965
Guido van Rossumb700df92000-03-31 14:59:30 +00001966 if (!strcmp(name, "endpos"))
Fredrik Lundh6f013982000-07-03 18:44:21 +00001967 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00001968
1969 PyErr_SetString(PyExc_AttributeError, name);
1970 return NULL;
1971}
1972
1973/* FIXME: implement setattr("string", None) as a special case (to
1974 detach the associated string, if any */
1975
1976statichere PyTypeObject Match_Type = {
1977 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001978 0, "SRE_Match",
Fredrik Lundh6f013982000-07-03 18:44:21 +00001979 sizeof(MatchObject), sizeof(int),
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001980 (destructor)match_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001981 0, /*tp_print*/
Fredrik Lundh6f013982000-07-03 18:44:21 +00001982 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001983};
1984
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001985/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001986/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001987
1988static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001989scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001990{
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001991 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001992 Py_DECREF(self->string);
1993 Py_DECREF(self->pattern);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001994 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001995}
1996
1997static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001998scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001999{
2000 SRE_STATE* state = &self->state;
2001 PyObject* match;
2002 int status;
2003
2004 state->ptr = state->start;
2005
2006 if (state->charsize == 1) {
2007 status = sre_match(state, PatternObject_GetCode(self->pattern));
2008 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002009#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002010 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002011#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002012 }
2013
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002014 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002015 state, self->string, status);
2016
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002017 if (status == 0 || state->ptr == state->start)
2018 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002019 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002020 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002021
2022 return match;
2023}
2024
2025
2026static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002027scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002028{
2029 SRE_STATE* state = &self->state;
2030 PyObject* match;
2031 int status;
2032
2033 state->ptr = state->start;
2034
2035 if (state->charsize == 1) {
2036 status = sre_search(state, PatternObject_GetCode(self->pattern));
2037 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002038#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002039 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002040#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002041 }
2042
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002043 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002044 state, self->string, status);
2045
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002046 if (status == 0 || state->ptr == state->start)
2047 state->start = (void*) ((char*) state->ptr + state->charsize);
2048 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002049 state->start = state->ptr;
2050
2051 return match;
2052}
2053
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002054static PyMethodDef scanner_methods[] = {
2055 {"match", (PyCFunction) scanner_match, 0},
2056 {"search", (PyCFunction) scanner_search, 0},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002057 {NULL, NULL}
2058};
2059
2060static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002061scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002062{
2063 PyObject* res;
2064
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002065 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002066 if (res)
2067 return res;
2068
2069 PyErr_Clear();
2070
2071 /* attributes */
2072 if (!strcmp(name, "pattern")) {
2073 Py_INCREF(self->pattern);
2074 return self->pattern;
2075 }
2076
2077 PyErr_SetString(PyExc_AttributeError, name);
2078 return NULL;
2079}
2080
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002081statichere PyTypeObject Scanner_Type = {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002082 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002083 0, "SRE_Scanner",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002084 sizeof(ScannerObject), 0,
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002085 (destructor)scanner_dealloc, /*tp_dealloc*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002086 0, /*tp_print*/
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002087 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002088};
2089
Guido van Rossumb700df92000-03-31 14:59:30 +00002090static PyMethodDef _functions[] = {
2091 {"compile", _compile, 1},
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002092 {"getcodesize", sre_codesize, 1},
Fredrik Lundhb389df32000-06-29 12:48:37 +00002093 {"getlower", sre_getlower, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00002094 {NULL, NULL}
2095};
2096
2097void
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002098#if defined(WIN32)
Guido van Rossumb700df92000-03-31 14:59:30 +00002099__declspec(dllexport)
2100#endif
2101init_sre()
2102{
2103 /* Patch object types */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002104 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002105 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002106
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002107 Py_InitModule("_" MODULE, _functions);
Guido van Rossumb700df92000-03-31 14:59:30 +00002108}
2109
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002110#endif /* !defined(SRE_RECURSIVE) */