blob: 7206b9570e2c02c5acf1b2a8bdf80cebf44c7691 [file] [log] [blame]
/* -*- Mode: C; tab-width: 4 -*-
 *
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 99-10-24 fl  created (based on existing template matcher code)
 * 99-11-13 fl  added categories, branching, and more (0.2)
 * 99-11-16 fl  some tweaks to compile on non-Windows platforms
 * 99-12-18 fl  non-literals, generic maximizing repeat (0.3)
 * 00-02-28 fl  tons of changes (not all to the better ;-) (0.4)
 * 00-03-06 fl  first alpha, sort of (0.5)
 * 00-03-14 fl  removed most compatibility stuff (0.6)
 * 00-05-10 fl  towards third alpha (0.8.2)
 * 00-05-13 fl  added experimental scanner stuff (0.8.3)
 * 00-05-27 fl  final bug hunt (0.8.4)
 * 00-06-21 fl  less bugs, more taste (0.8.5)
 * 00-06-25 fl  major changes to better deal with nested repeats (0.9)
 * 00-06-28 fl  fixed findall (0.9.1)
 * 00-06-29 fl  fixed split, added more scanner features (0.9.2)
 * 00-06-30 fl  added fast search optimization (0.9.3)
 * 00-06-30 fl  added assert (lookahead) primitives, etc (0.9.4)
 *
 * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 2.0 integration and
 * other compatibility work.
 */
31
32#ifndef SRE_RECURSIVE
33
/* version and copyright banner embedded in the compiled module */
char copyright[] =
    " SRE 0.9.4 Copyright (c) 1997-2000 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000035
36#include "Python.h"
37
38#include "sre.h"
39
Guido van Rossumb700df92000-03-31 14:59:30 +000040#if defined(HAVE_LIMITS_H)
41#include <limits.h>
42#else
43#define INT_MAX 2147483647
44#endif
45
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000046#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000047
/* module name, without the leading underscore */
#define MODULE "sre"

/* tracing is switched off here; define DEBUG instead to enable it */
#undef DEBUG

/* unicode support is enabled by default under 1.6a1 and later */
#if PY_VERSION_HEX >= 0x01060000
#define HAVE_UNICODE
#endif

/* optional features */
#define USE_FAST_SEARCH

/* LOCAL(type) introduces a file-local inline function returning type */
#ifdef _MSC_VER
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#define LOCAL(type) static inline type
#endif
69
/* error codes (negative, so they cannot be confused with the 0/1
   results used for "no match"/"match") */
#define SRE_ERROR_ILLEGAL (-1) /* illegal opcode */
#define SRE_ERROR_MEMORY (-9) /* out of memory */

/* TRACE takes one parenthesized printf argument list, as in
   TRACE(("value %d\n", v)); it expands to nothing unless DEBUG is
   defined */
#ifndef DEBUG
#define TRACE(v)
#else
#define TRACE(v) printf v
#endif

/* distance, in SRE_CHAR units, from state->beginning to ptr; needs a
   variable named "state" in scope at the point of use */
#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
Guido van Rossumb700df92000-03-31 14:59:30 +000081
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000082/* -------------------------------------------------------------------- */
83/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000084
/* default character predicates (run sre_chars.py to regenerate tables) */

/* bit flags stored per character in sre_char_info */
#define SRE_DIGIT_MASK 0x01
#define SRE_SPACE_MASK 0x02
#define SRE_LINEBREAK_MASK 0x04
#define SRE_ALNUM_MASK 0x08
#define SRE_WORD_MASK 0x10

/* flag set for each 7-bit character code, sixteen codes per row */
static char sre_char_info[128] = {
    /*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  32 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  48 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /*  64 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /*  96 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};
100
/* lower-case mapping for 7-bit character codes, sixteen codes per row;
   only 'A'..'Z' (65..90) map to something other than themselves */
static char sre_char_lower[128] = {
    /*   0 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /*  16 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /*  32 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /*  48 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /*  64 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
              109, 110, 111,
    /*  80 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
              93, 94, 95,
    /*  96 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
              109, 110, 111,
    /* 112 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
              124, 125, 126, 127
};

/* return the lower-case form of ch according to the table above;
   codes of 128 and up are returned unchanged */
static unsigned int sre_lower(unsigned int ch)
{
    if (ch >= 128)
        return ch;
    return sre_char_lower[ch];
}
115
/* table-driven predicates: each yields the matching flag bit from
   sre_char_info for codes below 128, and 0 for anything else */
#define SRE_CHAR_FLAG(ch, mask)\
    ((ch) < 128 ? (sre_char_info[(ch)] & (mask)) : 0)

#define SRE_IS_DIGIT(ch) SRE_CHAR_FLAG((ch), SRE_DIGIT_MASK)
#define SRE_IS_SPACE(ch) SRE_CHAR_FLAG((ch), SRE_SPACE_MASK)
#define SRE_IS_LINEBREAK(ch) SRE_CHAR_FLAG((ch), SRE_LINEBREAK_MASK)
#define SRE_IS_ALNUM(ch) SRE_CHAR_FLAG((ch), SRE_ALNUM_MASK)
#define SRE_IS_WORD(ch) SRE_CHAR_FLAG((ch), SRE_WORD_MASK)
Guido van Rossumb700df92000-03-31 14:59:30 +0000126
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000127/* locale-specific character predicates */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000128
/* lower-case ch with the C library's tolower(); codes of 256 and up
   are returned unchanged */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int) tolower((int) ch);
}
/* <ctype.h> based predicates; codes of 256 and up never match */
#define SRE_LOC_IS_DIGIT(ch) ((ch) >= 256 ? 0 : isdigit((ch)))
#define SRE_LOC_IS_SPACE(ch) ((ch) >= 256 ? 0 : isspace((ch)))
#define SRE_LOC_IS_LINEBREAK(ch) ('\n' == (ch))
#define SRE_LOC_IS_ALNUM(ch) ((ch) >= 256 ? 0 : isalnum((ch)))
#define SRE_LOC_IS_WORD(ch) ((ch) == '_' || SRE_LOC_IS_ALNUM((ch)))
138
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000139/* unicode-specific character predicates */
140
141#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000142static unsigned int sre_lower_unicode(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000143{
144 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
145}
Fredrik Lundh22d25462000-07-01 17:50:59 +0000146
#if !defined(Py_UNICODE_ISALNUM)
/* FIXME: workaround. should be fixed in unicodectype.c */
/* fallback used when the headers do not provide Py_UNICODE_ISALNUM:
   a character counts as alphanumeric if it is lower case, upper case,
   title case or a digit */
#define Py_UNICODE_ISALNUM(ch)\
    (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISUPPER(ch) ||\
     Py_UNICODE_ISTITLE(ch) || Py_UNICODE_ISDIGIT(ch))
#endif

/* unicode predicates; each narrows ch to Py_UNICODE and defers to the
   corresponding Py_UNICODE_* classifier */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
/* a word character is a unicode alphanumeric or an underscore.  this
   must go through SRE_UNI_IS_ALNUM: the table-driven SRE_IS_ALNUM only
   knows 7-bit codes and would reject every non-ASCII letter or digit */
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
159#endif
160
Guido van Rossumb700df92000-03-31 14:59:30 +0000161LOCAL(int)
162sre_category(SRE_CODE category, unsigned int ch)
163{
164 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000165
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000166 case SRE_CATEGORY_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000167 return SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000168 case SRE_CATEGORY_NOT_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000169 return !SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000170 case SRE_CATEGORY_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000171 return SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000172 case SRE_CATEGORY_NOT_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000173 return !SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000174 case SRE_CATEGORY_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000175 return SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000176 case SRE_CATEGORY_NOT_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000177 return !SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000178 case SRE_CATEGORY_LINEBREAK:
179 return SRE_IS_LINEBREAK(ch);
180 case SRE_CATEGORY_NOT_LINEBREAK:
181 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000182
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000183 case SRE_CATEGORY_LOC_WORD:
184 return SRE_LOC_IS_WORD(ch);
185 case SRE_CATEGORY_LOC_NOT_WORD:
186 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000187
188#if defined(HAVE_UNICODE)
189 case SRE_CATEGORY_UNI_DIGIT:
190 return SRE_UNI_IS_DIGIT(ch);
191 case SRE_CATEGORY_UNI_NOT_DIGIT:
192 return !SRE_UNI_IS_DIGIT(ch);
193 case SRE_CATEGORY_UNI_SPACE:
194 return SRE_UNI_IS_SPACE(ch);
195 case SRE_CATEGORY_UNI_NOT_SPACE:
196 return !SRE_UNI_IS_SPACE(ch);
197 case SRE_CATEGORY_UNI_WORD:
198 return SRE_UNI_IS_WORD(ch);
199 case SRE_CATEGORY_UNI_NOT_WORD:
200 return !SRE_UNI_IS_WORD(ch);
201 case SRE_CATEGORY_UNI_LINEBREAK:
202 return SRE_UNI_IS_LINEBREAK(ch);
203 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
204 return !SRE_UNI_IS_LINEBREAK(ch);
205#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000206 }
207 return 0;
208}
209
210/* helpers */
211
212LOCAL(int)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000213stack_free(SRE_STATE* state)
Guido van Rossumb700df92000-03-31 14:59:30 +0000214{
215 if (state->stack) {
216 TRACE(("release stack\n"));
217 free(state->stack);
218 state->stack = NULL;
219 }
220 state->stacksize = 0;
221 return 0;
222}
223
224static int /* shouldn't be LOCAL */
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000225stack_extend(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000226{
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000227 SRE_STACK* stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000228 int stacksize;
229
230 /* grow the stack to a suitable size; we need at least lo entries,
231 at most hi entries. if for some reason hi is lower than lo, lo
232 wins */
233
234 stacksize = state->stacksize;
235
236 if (stacksize == 0) {
237 /* create new stack */
238 stacksize = 512;
239 if (stacksize < lo)
240 stacksize = lo;
241 else if (stacksize > hi)
242 stacksize = hi;
243 TRACE(("allocate stack %d\n", stacksize));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000244 stack = malloc(sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000245 } else {
246 /* grow the stack (typically by a factor of two) */
247 while (stacksize < lo)
248 stacksize = 2 * stacksize;
249 /* FIXME: <fl> could trim size if it's larger than lo, and
250 much larger than hi */
251 TRACE(("grow stack to %d\n", stacksize));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000252 stack = realloc(state->stack, sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000253 }
254
255 if (!stack) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000256 stack_free(state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000257 return SRE_ERROR_MEMORY;
258 }
259
260 state->stack = stack;
261 state->stacksize = stacksize;
262
263 return 0;
264}
265
/* generate 8-bit version */

/* names used by the engine section further down for its character
   type and entry points */
#define SRE_CHAR unsigned char
#define SRE_AT sre_at
#define SRE_MEMBER sre_member
#define SRE_MATCH sre_match
#define SRE_SEARCH sre_search

#ifdef HAVE_UNICODE

/* include this file into itself with the 8-bit names in effect;
   SRE_RECURSIVE makes the nested pass skip everything guarded by
   #ifndef SRE_RECURSIVE */
#define SRE_RECURSIVE
#include "_sre.c"
#undef SRE_RECURSIVE

#undef SRE_SEARCH
#undef SRE_MATCH
#undef SRE_MEMBER
#undef SRE_AT
#undef SRE_CHAR

/* generate 16-bit unicode version */

/* rebind the same names for the unicode pass */
#define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat
#define SRE_MEMBER sre_umember
#define SRE_MATCH sre_umatch
#define SRE_SEARCH sre_usearch
#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000294
295#endif /* SRE_RECURSIVE */
296
297/* -------------------------------------------------------------------- */
298/* String matching engine */
299
300/* the following section is compiled twice, with different character
301 settings */
302
303LOCAL(int)
304SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
305{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000306 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000307
308 int this, that;
309
310 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000311
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000312 case SRE_AT_BEGINNING:
Guido van Rossum29530882000-04-10 17:06:55 +0000313 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000314
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000315 case SRE_AT_BEGINNING_LINE:
316 return ((void*) ptr == state->beginning ||
317 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000318
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000319 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000320 return (((void*) (ptr+1) == state->end &&
321 SRE_IS_LINEBREAK((int) ptr[0])) ||
322 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000323
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000324 case SRE_AT_END_LINE:
325 return ((void*) ptr == state->end ||
326 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000327
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000328 case SRE_AT_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000329 if (state->beginning == state->end)
330 return 0;
331 that = ((void*) ptr > state->beginning) ?
332 SRE_IS_WORD((int) ptr[-1]) : 0;
333 this = ((void*) ptr < state->end) ?
334 SRE_IS_WORD((int) ptr[0]) : 0;
335 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000336
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000337 case SRE_AT_NON_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000338 if (state->beginning == state->end)
339 return 0;
340 that = ((void*) ptr > state->beginning) ?
341 SRE_IS_WORD((int) ptr[-1]) : 0;
342 this = ((void*) ptr < state->end) ?
343 SRE_IS_WORD((int) ptr[0]) : 0;
344 return this == that;
345 }
346
347 return 0;
348}
349
350LOCAL(int)
Fredrik Lundh0640e112000-06-30 13:55:15 +0000351SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000352{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000353 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000354
355 int ok = 1;
356
357 for (;;) {
358 switch (*set++) {
359
360 case SRE_OP_NEGATE:
361 ok = !ok;
362 break;
363
364 case SRE_OP_FAILURE:
365 return !ok;
366
367 case SRE_OP_LITERAL:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000368 /* args: <literal> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000369 if (ch == set[0])
Guido van Rossumb700df92000-03-31 14:59:30 +0000370 return ok;
371 set++;
372 break;
373
374 case SRE_OP_RANGE:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000375 /* args: <lower> <upper> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000376 if (set[0] <= ch && ch <= set[1])
Guido van Rossumb700df92000-03-31 14:59:30 +0000377 return ok;
378 set += 2;
379 break;
380
381 case SRE_OP_CATEGORY:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000382 /* args: <category> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000383 if (sre_category(set[0], (int) ch))
384 return ok;
385 set += 1;
386 break;
387
388 default:
Fredrik Lundh80946112000-06-29 18:03:25 +0000389 /* internal error -- there's not much we can do about it
390 here, so let's just pretend it didn't match... */
Guido van Rossumb700df92000-03-31 14:59:30 +0000391 return 0;
392 }
393 }
394}
395
396LOCAL(int)
397SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
398{
399 /* check if string matches the given pattern. returns -1 for
400 error, 0 for failure, and 1 for success */
401
402 SRE_CHAR* end = state->end;
403 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000404 int stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000405 int stackbase;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000406 int lastmark;
Guido van Rossumb700df92000-03-31 14:59:30 +0000407 int i, count;
408
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000409 /* FIXME: this is a hack! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000410 void* mark_copy[SRE_MARK_SIZE];
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000411 void* mark = NULL;
412
413 TRACE(("%8d: enter\n", PTR(ptr)));
414
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000415 if (pattern[0] == SRE_OP_INFO) {
416 /* optimization info block */
417 /* args: <1=skip> <2=flags> <3=min> ... */
418 if (pattern[3] && (end - ptr) < pattern[3]) {
419 TRACE(("reject (got %d chars, need %d)\n",
420 (end - ptr), pattern[3]));
421 return 0;
422 }
423 pattern += pattern[1] + 1;
424 }
425
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000426 stackbase = stack = state->stackbase;
427 lastmark = state->lastmark;
428
429 retry:
Guido van Rossumb700df92000-03-31 14:59:30 +0000430
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000431 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000432
433 switch (*pattern++) {
434
435 case SRE_OP_FAILURE:
436 /* immediate failure */
437 TRACE(("%8d: failure\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000438 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000439
440 case SRE_OP_SUCCESS:
441 /* end of pattern */
442 TRACE(("%8d: success\n", PTR(ptr)));
443 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000444 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000445
446 case SRE_OP_AT:
447 /* match at given position */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000448 /* args: <at> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000449 TRACE(("%8d: position %d\n", PTR(ptr), *pattern));
Guido van Rossumb700df92000-03-31 14:59:30 +0000450 if (!SRE_AT(state, ptr, *pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000451 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000452 pattern++;
453 break;
454
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000455 case SRE_OP_CATEGORY:
456 /* match at given category */
457 /* args: <category> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000458 TRACE(("%8d: category %d [category %d]\n", PTR(ptr),
459 *ptr, *pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000460 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
461 goto failure;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000462 TRACE(("%8d: category ok\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000463 pattern++;
464 ptr++;
465 break;
466
Guido van Rossumb700df92000-03-31 14:59:30 +0000467 case SRE_OP_LITERAL:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000468 /* match literal string */
Guido van Rossumb700df92000-03-31 14:59:30 +0000469 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000470 TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));
471 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000472 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000473 pattern++;
474 ptr++;
475 break;
476
477 case SRE_OP_NOT_LITERAL:
478 /* match anything that is not literal character */
479 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000480 TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));
481 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000482 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000483 pattern++;
484 ptr++;
485 break;
486
487 case SRE_OP_ANY:
488 /* match anything */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000489 TRACE(("%8d: anything\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000490 if (ptr >= end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000491 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000492 ptr++;
493 break;
494
495 case SRE_OP_IN:
496 /* match set member (or non_member) */
497 /* args: <skip> <set> */
498 TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
499 if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000500 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000501 pattern += pattern[0];
502 ptr++;
503 break;
504
505 case SRE_OP_GROUP:
506 /* match backreference */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000507 TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000508 i = pattern[0];
509 {
Guido van Rossumb700df92000-03-31 14:59:30 +0000510 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
511 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
512 if (!p || !e || e < p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000513 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000514 while (p < e) {
515 if (ptr >= end || *ptr != *p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000516 goto failure;
517 p++; ptr++;
518 }
519 }
520 pattern++;
521 break;
522
523 case SRE_OP_GROUP_IGNORE:
524 /* match backreference */
525 TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
526 i = pattern[0];
527 {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000528 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
529 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000530 if (!p || !e || e < p)
531 goto failure;
532 while (p < e) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000533 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000534 state->lower(*ptr) != state->lower(*p))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000535 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000536 p++; ptr++;
537 }
538 }
539 pattern++;
540 break;
541
542 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000543 TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000544 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000545 state->lower(*ptr) != state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000546 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000547 pattern++;
548 ptr++;
549 break;
550
551 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000552 TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000553 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000554 state->lower(*ptr) == state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000555 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000556 pattern++;
557 ptr++;
558 break;
559
560 case SRE_OP_IN_IGNORE:
561 TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
562 if (ptr >= end
Fredrik Lundh0640e112000-06-30 13:55:15 +0000563 || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr)))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000564 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000565 pattern += pattern[0];
566 ptr++;
567 break;
568
569 case SRE_OP_MARK:
570 /* set mark */
571 /* args: <mark> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000572 TRACE(("%8d: set mark %d\n", PTR(ptr), pattern[0]));
573 if (state->lastmark < pattern[0])
574 state->lastmark = pattern[0];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000575 if (!mark) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000576 mark = mark_copy;
577 memcpy(mark, state->mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000578 }
579 state->mark[pattern[0]] = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000580 pattern++;
581 break;
582
583 case SRE_OP_JUMP:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000584 case SRE_OP_INFO:
Guido van Rossumb700df92000-03-31 14:59:30 +0000585 /* jump forward */
586 /* args: <skip> */
587 TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
588 pattern += pattern[0];
589 break;
590
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000591 case SRE_OP_ASSERT:
592 /* assert subpattern */
Guido van Rossumb700df92000-03-31 14:59:30 +0000593 /* args: <skip> <pattern> */
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000594 TRACE(("%8d: assert subpattern\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000595 state->ptr = ptr;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000596 i = SRE_MATCH(state, pattern + 1);
597 if (i < 0)
598 return i;
599 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000600 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000601 pattern += pattern[0];
Guido van Rossumb700df92000-03-31 14:59:30 +0000602 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000603
604 case SRE_OP_ASSERT_NOT:
605 /* assert not subpattern */
606 /* args: <skip> <pattern> */
607 TRACE(("%8d: assert not subpattern\n", PTR(ptr)));
608 state->ptr = ptr;
609 i = SRE_MATCH(state, pattern + 1);
610 if (i < 0)
611 return i;
612 if (i)
613 goto failure;
614 pattern += pattern[0];
615 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000616
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000617#if 0
Guido van Rossumb700df92000-03-31 14:59:30 +0000618 case SRE_OP_MAX_REPEAT_ONE:
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000619 /* match repeated sequence (maximizing regexp) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000620
621 /* this operator only works if the repeated item is
622 exactly one character wide, and we're not already
623 collecting backtracking points. for other cases,
624 use the MAX_REPEAT operator instead */
625
Guido van Rossumb700df92000-03-31 14:59:30 +0000626 /* args: <skip> <min> <max> <step> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000627 TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
628 pattern[1], pattern[2]));
629
630 count = 0;
631
632 if (pattern[3] == SRE_OP_ANY) {
633 /* repeated wildcard. skip to the end of the target
634 string, and backtrack from there */
635 /* FIXME: must look for line endings */
636 if (ptr + pattern[1] > end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000637 goto failure; /* cannot match */
Guido van Rossumb700df92000-03-31 14:59:30 +0000638 count = pattern[2];
639 if (count > end - ptr)
640 count = end - ptr;
641 ptr += count;
642
643 } else if (pattern[3] == SRE_OP_LITERAL) {
644 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000645 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000646 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000647 if (ptr >= end || (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000648 break;
649 ptr++;
650 count++;
651 }
652
653 } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
654 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000655 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000656 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000657 if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000658 break;
659 ptr++;
660 count++;
661 }
662
663 } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
664 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000665 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000666 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000667 if (ptr >= end || (SRE_CODE) ptr[0] == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000668 break;
669 ptr++;
670 count++;
671 }
672
673 } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
674 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000675 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000676 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000677 if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000678 break;
679 ptr++;
680 count++;
681 }
682
683 } else if (pattern[3] == SRE_OP_IN) {
684 /* repeated set */
685 while (count < (int) pattern[2]) {
686 if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
687 break;
688 ptr++;
689 count++;
690 }
691
692 } else {
693 /* repeated single character pattern */
694 state->ptr = ptr;
695 while (count < (int) pattern[2]) {
696 i = SRE_MATCH(state, pattern + 3);
697 if (i < 0)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000698 return i;
699 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000700 break;
701 count++;
702 }
703 state->ptr = ptr;
704 ptr += count;
705 }
706
707 /* when we arrive here, count contains the number of
708 matches, and ptr points to the tail of the target
709 string. check if the rest of the pattern matches, and
710 backtrack if not. */
711
Guido van Rossumb700df92000-03-31 14:59:30 +0000712 TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
713
714 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000715 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000716
717 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
718 /* tail is empty. we're finished */
719 TRACE(("%8d: tail is empty\n", PTR(ptr)));
720 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000721 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000722
723 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000724 /* tail starts with a literal. skip positions where
725 the rest of the pattern cannot possibly match */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000726 SRE_CODE chr = pattern[pattern[0]+1];
Guido van Rossumb700df92000-03-31 14:59:30 +0000727 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
728 for (;;) {
729 TRACE(("%8d: scan for tail match\n", PTR(ptr)));
730 while (count >= (int) pattern[1] &&
731 (ptr >= end || *ptr != chr)) {
732 ptr--;
733 count--;
734 }
735 TRACE(("%8d: check tail\n", PTR(ptr)));
736 if (count < (int) pattern[1])
737 break;
738 state->ptr = ptr;
739 i = SRE_MATCH(state, pattern + pattern[0]);
740 if (i > 0) {
741 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000742 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000743 }
744 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
745 ptr--;
746 count--;
747 }
748
749 } else {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000750 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +0000751 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
752 while (count >= (int) pattern[1]) {
753 state->ptr = ptr;
754 i = SRE_MATCH(state, pattern + pattern[0]);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000755 if (i < 0)
756 return i;
757 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000758 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000759 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000760 }
761 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
762 ptr--;
763 count--;
764 }
765 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000766 goto failure;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000767#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000768
769 case SRE_OP_MAX_REPEAT:
770 /* match repeated sequence (maximizing regexp). repeated
771 group should end with a MAX_UNTIL code */
772
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000773 /* args: <skip> <min> <max> <item> */
774
775 TRACE(("%8d: max repeat (%d %d)\n", PTR(ptr),
Guido van Rossumb700df92000-03-31 14:59:30 +0000776 pattern[1], pattern[2]));
777
778 count = 0;
779 state->ptr = ptr;
780
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000781 /* match minimum number of items */
782 while (count < (int) pattern[1]) {
783 i = SRE_MATCH(state, pattern + 3);
784 if (i < 0)
785 return i;
786 if (!i)
787 goto failure;
788 if (state->ptr == ptr) {
789 /* if the match was successful but empty, set the
790 count to max and terminate the scanning loop */
791 count = (int) pattern[2];
792 break;
793 }
794 count++;
795 ptr = state->ptr;
796 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000797
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000798 TRACE(("%8d: found %d leading items\n", PTR(ptr), count));
Guido van Rossumb700df92000-03-31 14:59:30 +0000799
800 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000801 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000802
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000803 /* match maximum number of items, pushing alternate end
804 points to the stack */
Guido van Rossumb700df92000-03-31 14:59:30 +0000805
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000806 while (pattern[2] == 32767 || count < (int) pattern[2]) {
807 state->stackbase = stack;
808 i = SRE_MATCH(state, pattern + 3);
809 state->stackbase = stackbase; /* rewind */
810 if (i < 0)
811 return i;
812 if (!i)
813 break;
814 if (state->ptr == ptr) {
815 count = (int) pattern[2];
816 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000817 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000818 /* this position was valid; add it to the retry
819 stack */
820 if (stack >= state->stacksize) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000821 i = stack_extend(state, stack + 1,
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000822 stackbase + pattern[2]);
823 if (i < 0)
824 return i; /* out of memory */
825 }
826 TRACE(("%8d: stack[%d] = %d\n", PTR(ptr), stack, PTR(ptr)));
827 state->stack[stack].ptr = ptr;
828 state->stack[stack].pattern = pattern + pattern[0];
829 stack++;
830 /* move forward */
831 ptr = state->ptr;
832 count++;
Guido van Rossumb700df92000-03-31 14:59:30 +0000833 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000834
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000835 /* when we get here, count is the number of successful
836 matches, and ptr points to the tail. */
Guido van Rossumb700df92000-03-31 14:59:30 +0000837
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000838 TRACE(("%8d: skip +%d\n", PTR(ptr), pattern[0]));
839
840 pattern += pattern[0];
841 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000842
843 case SRE_OP_MIN_REPEAT:
844 /* match repeated sequence (minimizing regexp) */
845 TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
846 pattern[1], pattern[2]));
847 count = 0;
848 state->ptr = ptr;
849 /* match minimum number of items */
850 while (count < (int) pattern[1]) {
851 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000852 if (i < 0)
853 return i;
854 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000855 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000856 count++;
857 }
858 /* move forward until the tail matches. */
859 while (count <= (int) pattern[2]) {
860 ptr = state->ptr;
861 i = SRE_MATCH(state, pattern + pattern[0]);
862 if (i > 0) {
863 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000864 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000865 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000866 state->ptr = ptr; /* backtrack */
867 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000868 if (i < 0)
869 return i;
870 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000871 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000872 count++;
873 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000874 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000875
Guido van Rossumb700df92000-03-31 14:59:30 +0000876 case SRE_OP_BRANCH:
877 /* match one of several subpatterns */
878 /* format: <branch> <size> <head> ... <null> <tail> */
879 TRACE(("%8d: branch\n", PTR(ptr)));
880 while (*pattern) {
881 if (pattern[1] != SRE_OP_LITERAL ||
Fredrik Lundh0640e112000-06-30 13:55:15 +0000882 (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000883 TRACE(("%8d: branch check\n", PTR(ptr)));
884 state->ptr = ptr;
885 i = SRE_MATCH(state, pattern + 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000886 if (i < 0)
887 return i;
888 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000889 TRACE(("%8d: branch succeeded\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000890 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000891 }
892 }
893 pattern += *pattern;
894 }
895 TRACE(("%8d: branch failed\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000896 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000897
898 case SRE_OP_REPEAT:
899 /* TEMPLATE: match repeated sequence (no backtracking) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000900 /* args: <skip> <min> <max> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000901 TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
902 count = 0;
903 state->ptr = ptr;
904 while (count < (int) pattern[2]) {
905 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000906 if (i < 0)
907 return i;
908 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000909 break;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000910 if (state->ptr == ptr) {
911 count = (int) pattern[2];
912 break;
913 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000914 count++;
915 }
916 if (count <= (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000917 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000918 TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
919 pattern += pattern[0];
920 ptr = state->ptr;
921 break;
922
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000923 default:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000924 TRACE(("%8d: unknown opcode %d\n", PTR(ptr), pattern[-1]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000925 return SRE_ERROR_ILLEGAL;
926 }
927 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000928
929 failure:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000930 if (stack-- > stackbase) {
931 ptr = state->stack[stack].ptr;
932 pattern = state->stack[stack].pattern;
933 TRACE(("%8d: retry (%d)\n", PTR(ptr), stack));
934 goto retry;
935 }
936 TRACE(("%8d: leave (failure)\n", PTR(ptr)));
937 state->stackbase = stackbase;
938 state->lastmark = lastmark;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000939 if (mark)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000940 memcpy(state->mark, mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000941 return 0;
942
943 success:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000944 TRACE(("%8d: leave (success)\n", PTR(ptr)));
945 state->stackbase = stackbase;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000946 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000947}
948
949LOCAL(int)
950SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
951{
952 SRE_CHAR* ptr = state->start;
953 SRE_CHAR* end = state->end;
954 int status = 0;
Fredrik Lundh80946112000-06-29 18:03:25 +0000955 int prefix_len = 0;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000956 SRE_CODE* prefix;
957 SRE_CODE* overlap;
958 int literal = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000959
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000960 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000961 /* optimization info block */
962 /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix> <6=data...> */
963
964 if (pattern[3] > 0) {
965 /* adjust end point (but make sure we leave at least one
966 character in there) */
967 end -= pattern[3]-1;
968 if (end <= ptr)
969 end = ptr+1;
970 }
971
972 literal = pattern[2];
973
974 prefix = pattern + 6;
975 prefix_len = pattern[5];
976
977 overlap = prefix + prefix_len - 1;
978
979 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000980 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000981
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000982#if defined(USE_FAST_SEARCH)
983 if (prefix_len > 1) {
984 /* pattern starts with a known prefix. use the overlap
985 table to skip forward as fast as we possibly can */
986 int i = 0;
987 end = state->end;
988 while (ptr < end) {
989 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000990 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000991 if (!i)
992 break;
993 else
994 i = overlap[i];
995 } else {
996 if (++i == prefix_len) {
997 /* found a potential match */
998 TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
999 state->start = ptr - prefix_len + 1;
1000 state->ptr = ptr + 1;
1001 if (literal)
1002 return 1; /* all of it */
1003 status = SRE_MATCH(state, pattern + 2*prefix_len);
1004 if (status != 0)
1005 return status;
1006 /* close but no cigar -- try again */
1007 i = overlap[i];
1008 }
1009 break;
1010 }
1011
1012 }
1013 ptr++;
1014 }
1015 return 0;
1016 }
1017#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001018
Guido van Rossumb700df92000-03-31 14:59:30 +00001019 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001020 /* pattern starts with a literal character. this is used for
1021 short prefixes, and if fast search is disabled*/
Fredrik Lundh0640e112000-06-30 13:55:15 +00001022 SRE_CODE chr = pattern[1];
Guido van Rossumb700df92000-03-31 14:59:30 +00001023 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001024 while (ptr < end && (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +00001025 ptr++;
1026 if (ptr == end)
1027 return 0;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001028 TRACE(("%8d: === SEARCH === literal\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001029 state->start = ptr;
1030 state->ptr = ++ptr;
1031 status = SRE_MATCH(state, pattern + 2);
1032 if (status != 0)
1033 break;
1034 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001035 } else
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001036 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +00001037 while (ptr <= end) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001038 TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001039 state->start = state->ptr = ptr++;
1040 status = SRE_MATCH(state, pattern);
1041 if (status != 0)
1042 break;
1043 }
1044
1045 return status;
1046}
1047
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001048#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001049
1050/* -------------------------------------------------------------------- */
1051/* factories and destructors */
1052
1053/* see sre.h for object declarations */
1054
1055staticforward PyTypeObject Pattern_Type;
1056staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001057staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001058
1059static PyObject *
1060_compile(PyObject* self_, PyObject* args)
1061{
1062 /* "compile" pattern descriptor to pattern object */
1063
1064 PatternObject* self;
1065
1066 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001067 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001068 PyObject* code;
1069 int groups = 0;
1070 PyObject* groupindex = NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001071 if (!PyArg_ParseTuple(args, "OiO!|iO", &pattern, &flags,
1072 &PyString_Type, &code,
1073 &groups, &groupindex))
Guido van Rossumb700df92000-03-31 14:59:30 +00001074 return NULL;
1075
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001076 self = PyObject_NEW(PatternObject, &Pattern_Type);
Guido van Rossumb700df92000-03-31 14:59:30 +00001077 if (self == NULL)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001078
Guido van Rossumb700df92000-03-31 14:59:30 +00001079 return NULL;
1080
1081 Py_INCREF(pattern);
1082 self->pattern = pattern;
1083
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001084 self->flags = flags;
1085
Guido van Rossumb700df92000-03-31 14:59:30 +00001086 Py_INCREF(code);
1087 self->code = code;
1088
1089 self->groups = groups;
1090
1091 Py_XINCREF(groupindex);
1092 self->groupindex = groupindex;
1093
1094 return (PyObject*) self;
1095}
1096
1097static PyObject *
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001098sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001099{
1100 return Py_BuildValue("i", sizeof(SRE_CODE));
1101}
1102
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001103static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001104sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001105{
1106 int character, flags;
1107 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1108 return NULL;
1109 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001110 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001111#if defined(HAVE_UNICODE)
1112 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001113 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001114#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001115 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001116}
1117
Guido van Rossumb700df92000-03-31 14:59:30 +00001118LOCAL(PyObject*)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001119state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001120{
1121 /* prepare state object */
1122
1123 PyBufferProcs *buffer;
1124 int i, count;
1125 void* ptr;
1126
1127 PyObject* string;
1128 int start = 0;
1129 int end = INT_MAX;
1130 if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
1131 return NULL;
1132
1133 /* get pointer to string buffer */
1134 buffer = string->ob_type->tp_as_buffer;
1135 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1136 buffer->bf_getsegcount(string, NULL) != 1) {
1137 PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
1138 return NULL;
1139 }
1140
1141 /* determine buffer size */
1142 count = buffer->bf_getreadbuffer(string, 0, &ptr);
1143 if (count < 0) {
1144 /* sanity check */
1145 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1146 return NULL;
1147 }
1148
1149 /* determine character size */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001150#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001151 state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001152#else
1153 state->charsize = 1;
1154#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001155
1156 count /= state->charsize;
1157
1158 /* adjust boundaries */
1159 if (start < 0)
1160 start = 0;
1161 else if (start > count)
1162 start = count;
1163
1164 if (end < 0)
1165 end = 0;
1166 else if (end > count)
1167 end = count;
1168
1169 state->beginning = ptr;
1170
1171 state->start = (void*) ((char*) ptr + start * state->charsize);
1172 state->end = (void*) ((char*) ptr + end * state->charsize);
1173
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001174 state->lastmark = 0;
1175
Guido van Rossumb700df92000-03-31 14:59:30 +00001176 /* FIXME: dynamic! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001177 for (i = 0; i < SRE_MARK_SIZE; i++)
Guido van Rossumb700df92000-03-31 14:59:30 +00001178 state->mark[i] = NULL;
1179
1180 state->stack = NULL;
1181 state->stackbase = 0;
1182 state->stacksize = 0;
1183
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001184 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001185 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001186#if defined(HAVE_UNICODE)
1187 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001188 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001189#endif
1190 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001191 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001192
Guido van Rossumb700df92000-03-31 14:59:30 +00001193 return string;
1194}
1195
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001196LOCAL(void)
1197state_fini(SRE_STATE* state)
1198{
1199 stack_free(state);
1200}
1201
1202LOCAL(PyObject*)
1203state_getslice(SRE_STATE* state, int index, PyObject* string)
1204{
1205 index = (index - 1) * 2;
1206
1207 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
1208 Py_INCREF(Py_None);
1209 return Py_None;
1210 }
1211
1212 return PySequence_GetSlice(
1213 string,
1214 ((char*)state->mark[index] - (char*)state->beginning) /
1215 state->charsize,
1216 ((char*)state->mark[index+1] - (char*)state->beginning) /
1217 state->charsize
1218 );
1219}
1220
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001221static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001222pattern_new_match(PatternObject* pattern, SRE_STATE* state,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001223 PyObject* string, int status)
1224{
1225 /* create match object (from state object) */
1226
1227 MatchObject* match;
1228 int i, j;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001229 char* base;
1230 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001231
1232 if (status > 0) {
1233
1234 /* create match object (with room for extra group marks) */
1235 match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
1236 if (match == NULL)
1237 return NULL;
1238
1239 Py_INCREF(pattern);
1240 match->pattern = pattern;
1241
1242 Py_INCREF(string);
1243 match->string = string;
1244
1245 match->groups = pattern->groups+1;
1246
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001247 base = (char*) state->beginning;
1248 n = state->charsize;
1249
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001250 /* group zero */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001251 match->mark[0] = ((char*) state->start - base) / n;
1252 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001253
1254 /* fill in the rest of the groups */
1255 for (i = j = 0; i < pattern->groups; i++, j+=2)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001256 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1257 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1258 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001259 } else
1260 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1261
1262 return (PyObject*) match;
1263
1264 } else if (status < 0) {
1265
1266 /* internal error */
1267 PyErr_SetString(
1268 PyExc_RuntimeError, "internal error in regular expression engine"
1269 );
1270 return NULL;
1271
1272 }
1273
1274 Py_INCREF(Py_None);
1275 return Py_None;
1276}
1277
1278static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001279pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001280{
1281 /* create search state object */
1282
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001283 ScannerObject* self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001284 PyObject* string;
1285
1286 /* create match object (with room for extra group marks) */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001287 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001288 if (self == NULL)
1289 return NULL;
1290
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001291 string = state_init(&self->state, pattern, args);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001292 if (!string) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001293 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001294 return NULL;
1295 }
1296
1297 Py_INCREF(pattern);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001298 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001299
1300 Py_INCREF(string);
1301 self->string = string;
1302
1303 return (PyObject*) self;
1304}
1305
Guido van Rossumb700df92000-03-31 14:59:30 +00001306static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001307pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001308{
1309 Py_XDECREF(self->code);
1310 Py_XDECREF(self->pattern);
1311 Py_XDECREF(self->groupindex);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001312 PyMem_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001313}
1314
1315static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001316pattern_match(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001317{
1318 SRE_STATE state;
1319 PyObject* string;
1320 int status;
1321
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001322 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001323 if (!string)
1324 return NULL;
1325
1326 state.ptr = state.start;
1327
1328 if (state.charsize == 1) {
1329 status = sre_match(&state, PatternObject_GetCode(self));
1330 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001331#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001332 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001333#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001334 }
1335
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001336 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001337
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001338 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001339}
1340
1341static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001342pattern_search(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001343{
1344 SRE_STATE state;
1345 PyObject* string;
1346 int status;
1347
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001348 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001349 if (!string)
1350 return NULL;
1351
1352 if (state.charsize == 1) {
1353 status = sre_search(&state, PatternObject_GetCode(self));
1354 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001355#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001356 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001357#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001358 }
1359
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001360 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001361
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001362 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001363}
1364
1365static PyObject*
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001366call(char* function, PyObject* args)
1367{
1368 PyObject* name;
1369 PyObject* module;
1370 PyObject* func;
1371 PyObject* result;
1372
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001373 name = PyString_FromString(MODULE);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001374 if (!name)
1375 return NULL;
1376 module = PyImport_Import(name);
1377 Py_DECREF(name);
1378 if (!module)
1379 return NULL;
1380 func = PyObject_GetAttrString(module, function);
1381 Py_DECREF(module);
1382 if (!func)
1383 return NULL;
1384 result = PyObject_CallObject(func, args);
1385 Py_DECREF(func);
1386 Py_DECREF(args);
1387 return result;
1388}
1389
1390static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001391pattern_sub(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001392{
1393 PyObject* template;
1394 PyObject* string;
1395 PyObject* count;
1396 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1397 return NULL;
1398
1399 /* delegate to Python code */
1400 return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
1401}
1402
1403static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001404pattern_subn(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001405{
1406 PyObject* template;
1407 PyObject* string;
1408 PyObject* count;
1409 if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
1410 return NULL;
1411
1412 /* delegate to Python code */
1413 return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
1414}
1415
1416static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001417pattern_split(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001418{
1419 PyObject* string;
1420 PyObject* maxsplit;
1421 if (!PyArg_ParseTuple(args, "OO", &string, &maxsplit))
1422 return NULL;
1423
1424 /* delegate to Python code */
1425 return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
1426}
1427
1428static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001429pattern_findall(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001430{
Guido van Rossumb700df92000-03-31 14:59:30 +00001431 SRE_STATE state;
1432 PyObject* string;
1433 PyObject* list;
1434 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001435 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001436
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001437 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001438 if (!string)
1439 return NULL;
1440
1441 list = PyList_New(0);
1442
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001443 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001444
1445 PyObject* item;
1446
1447 state.ptr = state.start;
1448
1449 if (state.charsize == 1) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001450 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +00001451 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001452#if defined(HAVE_UNICODE)
1453 status = sre_usearch(&state, PatternObject_GetCode(self));
1454#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001455 }
1456
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001457 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001458
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001459 /* don't bother to build a match object */
1460 switch (self->groups) {
1461 case 0:
1462 item = PySequence_GetSlice(
1463 string,
1464 ((char*) state.start - (char*) state.beginning) /
1465 state.charsize,
1466 ((char*) state.ptr - (char*) state.beginning) /
1467 state.charsize);
1468 if (!item)
1469 goto error;
1470 break;
1471 case 1:
1472 item = state_getslice(&state, 1, string);
1473 if (!item)
1474 goto error;
1475 break;
1476 default:
1477 item = PyTuple_New(self->groups);
1478 if (!item)
1479 goto error;
1480 for (i = 0; i < self->groups; i++) {
1481 PyObject* o = state_getslice(&state, i+1, string);
1482 if (!o) {
1483 Py_DECREF(item);
1484 goto error;
1485 }
1486 PyTuple_SET_ITEM(item, i, o);
1487 }
1488 break;
1489 }
1490
1491 if (PyList_Append(list, item) < 0) {
1492 Py_DECREF(item);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001493 goto error;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001494 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001495
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001496 if (state.ptr == state.start)
1497 state.start = (void*) ((char*) state.ptr + state.charsize);
1498 else
1499 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001500
1501 } else {
1502
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001503 if (status == 0)
1504 break;
1505
Guido van Rossumb700df92000-03-31 14:59:30 +00001506 /* internal error */
1507 PyErr_SetString(
1508 PyExc_RuntimeError,
1509 "internal error in regular expression engine"
1510 );
1511 goto error;
1512
1513 }
1514 }
1515
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001516 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001517 return list;
1518
1519error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001520 Py_DECREF(list);
1521 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001522 return NULL;
1523
1524}
1525
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001526static PyMethodDef pattern_methods[] = {
1527 {"match", (PyCFunction) pattern_match, 1},
1528 {"search", (PyCFunction) pattern_search, 1},
1529 {"sub", (PyCFunction) pattern_sub, 1},
1530 {"subn", (PyCFunction) pattern_subn, 1},
1531 {"split", (PyCFunction) pattern_split, 1},
1532 {"findall", (PyCFunction) pattern_findall, 1},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001533 /* experimental */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001534 {"scanner", (PyCFunction) pattern_scanner, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001535 {NULL, NULL}
1536};
1537
1538static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001539pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001540{
1541 PyObject* res;
1542
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001543 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001544
1545 if (res)
1546 return res;
1547
1548 PyErr_Clear();
1549
1550 /* attributes */
1551 if (!strcmp(name, "pattern")) {
1552 Py_INCREF(self->pattern);
1553 return self->pattern;
1554 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001555
1556 if (!strcmp(name, "flags"))
1557 return Py_BuildValue("i", self->flags);
1558
Fredrik Lundh01016fe2000-06-30 00:27:46 +00001559 if (!strcmp(name, "groups"))
1560 return Py_BuildValue("i", self->groups);
1561
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001562 if (!strcmp(name, "groupindex") && self->groupindex) {
1563 Py_INCREF(self->groupindex);
1564 return self->groupindex;
1565 }
1566
Guido van Rossumb700df92000-03-31 14:59:30 +00001567 PyErr_SetString(PyExc_AttributeError, name);
1568 return NULL;
1569}
1570
1571statichere PyTypeObject Pattern_Type = {
1572 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001573 0, "SRE_Pattern", sizeof(PatternObject), 0,
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001574 (destructor)pattern_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001575 0, /*tp_print*/
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001576 (getattrfunc)pattern_getattr, /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001577};
1578
1579/* -------------------------------------------------------------------- */
1580/* match methods */
1581
1582static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001583match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001584{
1585 Py_XDECREF(self->string);
1586 Py_DECREF(self->pattern);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001587 PyMem_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001588}
1589
1590static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001591match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001592{
1593 if (index < 0 || index >= self->groups) {
1594 /* raise IndexError if we were given a bad group number */
1595 PyErr_SetString(
1596 PyExc_IndexError,
1597 "no such group"
1598 );
1599 return NULL;
1600 }
1601
1602 if (self->string == Py_None || self->mark[index+index] < 0) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001603 /* return default value if the string or group is undefined */
1604 Py_INCREF(def);
1605 return def;
Guido van Rossumb700df92000-03-31 14:59:30 +00001606 }
1607
1608 return PySequence_GetSlice(
1609 self->string, self->mark[index+index], self->mark[index+index+1]
1610 );
1611}
1612
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001613static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001614match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001615{
1616 if (!PyInt_Check(index) && self->pattern->groupindex != NULL) {
1617 /* FIXME: resource leak? */
1618 index = PyObject_GetItem(self->pattern->groupindex, index);
1619 if (!index)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001620 return -1;
Guido van Rossumb700df92000-03-31 14:59:30 +00001621 }
1622
1623 if (PyInt_Check(index))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001624 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001625
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001626 return -1;
1627}
1628
1629static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001630match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001631{
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001632 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001633}
1634
1635static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001636match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001637{
1638 PyObject* result;
1639 int i, size;
1640
1641 size = PyTuple_GET_SIZE(args);
1642
1643 switch (size) {
1644 case 0:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001645 result = match_getslice(self, Py_False, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001646 break;
1647 case 1:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001648 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001649 break;
1650 default:
1651 /* fetch multiple items */
1652 result = PyTuple_New(size);
1653 if (!result)
1654 return NULL;
1655 for (i = 0; i < size; i++) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001656 PyObject* item = match_getslice(
1657 self, PyTuple_GET_ITEM(args, i), Py_None
1658 );
Guido van Rossumb700df92000-03-31 14:59:30 +00001659 if (!item) {
1660 Py_DECREF(result);
1661 return NULL;
1662 }
1663 PyTuple_SET_ITEM(result, i, item);
1664 }
1665 break;
1666 }
1667 return result;
1668}
1669
1670static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001671match_groups(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001672{
1673 PyObject* result;
1674 int index;
1675
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001676 PyObject* def = Py_None;
1677 if (!PyArg_ParseTuple(args, "|O", &def))
1678 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001679
Guido van Rossumb700df92000-03-31 14:59:30 +00001680 result = PyTuple_New(self->groups-1);
1681 if (!result)
1682 return NULL;
1683
1684 for (index = 1; index < self->groups; index++) {
1685 PyObject* item;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001686 item = match_getslice_by_index(self, index, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001687 if (!item) {
1688 Py_DECREF(result);
1689 return NULL;
1690 }
1691 PyTuple_SET_ITEM(result, index-1, item);
1692 }
1693
1694 return result;
1695}
1696
1697static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001698match_groupdict(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001699{
1700 PyObject* result;
1701 PyObject* keys;
1702 int index;
1703
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001704 PyObject* def = Py_None;
1705 if (!PyArg_ParseTuple(args, "|O", &def))
1706 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001707
Guido van Rossumb700df92000-03-31 14:59:30 +00001708 result = PyDict_New();
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001709 if (!result || !self->pattern->groupindex)
Guido van Rossumb700df92000-03-31 14:59:30 +00001710 return result;
1711
1712 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001713 if (!keys) {
1714 Py_DECREF(result);
Guido van Rossumb700df92000-03-31 14:59:30 +00001715 return NULL;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001716 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001717
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001718 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001719 PyObject* key;
1720 PyObject* item;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001721 key = PyList_GET_ITEM(keys, index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001722 if (!key) {
1723 Py_DECREF(keys);
1724 Py_DECREF(result);
1725 return NULL;
1726 }
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001727 item = match_getslice(self, key, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001728 if (!item) {
1729 Py_DECREF(key);
1730 Py_DECREF(keys);
1731 Py_DECREF(result);
1732 return NULL;
1733 }
1734 /* FIXME: <fl> this can fail, right? */
1735 PyDict_SetItem(result, key, item);
1736 }
1737
1738 Py_DECREF(keys);
1739
1740 return result;
1741}
1742
1743static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001744match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001745{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001746 int index;
1747
1748 PyObject* index_ = Py_False;
1749 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001750 return NULL;
1751
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001752 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001753
Guido van Rossumb700df92000-03-31 14:59:30 +00001754 if (index < 0 || index >= self->groups) {
1755 PyErr_SetString(
1756 PyExc_IndexError,
1757 "no such group"
1758 );
1759 return NULL;
1760 }
1761
1762 if (self->mark[index*2] < 0) {
1763 Py_INCREF(Py_None);
1764 return Py_None;
1765 }
1766
1767 return Py_BuildValue("i", self->mark[index*2]);
1768}
1769
1770static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001771match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001772{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001773 int index;
1774
1775 PyObject* index_ = Py_False;
1776 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001777 return NULL;
1778
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001779 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001780
Guido van Rossumb700df92000-03-31 14:59:30 +00001781 if (index < 0 || index >= self->groups) {
1782 PyErr_SetString(
1783 PyExc_IndexError,
1784 "no such group"
1785 );
1786 return NULL;
1787 }
1788
1789 if (self->mark[index*2] < 0) {
1790 Py_INCREF(Py_None);
1791 return Py_None;
1792 }
1793
1794 return Py_BuildValue("i", self->mark[index*2+1]);
1795}
1796
1797static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001798match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001799{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001800 int index;
1801
1802 PyObject* index_ = Py_False;
1803 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001804 return NULL;
1805
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001806 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001807
Guido van Rossumb700df92000-03-31 14:59:30 +00001808 if (index < 0 || index >= self->groups) {
1809 PyErr_SetString(
1810 PyExc_IndexError,
1811 "no such group"
1812 );
1813 return NULL;
1814 }
1815
1816 if (self->mark[index*2] < 0) {
1817 Py_INCREF(Py_None);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001818 Py_INCREF(Py_None);
1819 return Py_BuildValue("OO", Py_None, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001820 }
1821
1822 return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
1823}
1824
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001825static PyMethodDef match_methods[] = {
1826 {"group", (PyCFunction) match_group, 1},
1827 {"start", (PyCFunction) match_start, 1},
1828 {"end", (PyCFunction) match_end, 1},
1829 {"span", (PyCFunction) match_span, 1},
1830 {"groups", (PyCFunction) match_groups, 1},
1831 {"groupdict", (PyCFunction) match_groupdict, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001832 {NULL, NULL}
1833};
1834
1835static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001836match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001837{
1838 PyObject* res;
1839
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001840 res = Py_FindMethod(match_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001841 if (res)
1842 return res;
1843
1844 PyErr_Clear();
1845
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001846 /* attributes */
Guido van Rossumb700df92000-03-31 14:59:30 +00001847 if (!strcmp(name, "string")) {
1848 Py_INCREF(self->string);
1849 return self->string;
1850 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001851
Guido van Rossumb700df92000-03-31 14:59:30 +00001852 if (!strcmp(name, "re")) {
1853 Py_INCREF(self->pattern);
1854 return (PyObject*) self->pattern;
1855 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001856
Guido van Rossumb700df92000-03-31 14:59:30 +00001857 if (!strcmp(name, "pos"))
1858 return Py_BuildValue("i", 0); /* FIXME */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001859
Guido van Rossumb700df92000-03-31 14:59:30 +00001860 if (!strcmp(name, "endpos"))
1861 return Py_BuildValue("i", 0); /* FIXME */
1862
1863 PyErr_SetString(PyExc_AttributeError, name);
1864 return NULL;
1865}
1866
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
1869
1870statichere PyTypeObject Match_Type = {
1871 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001872 0, "SRE_Match",
Guido van Rossumb700df92000-03-31 14:59:30 +00001873 sizeof(MatchObject), /* size of basic object */
1874 sizeof(int), /* space for group item */
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001875 (destructor)match_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001876 0, /*tp_print*/
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001877 (getattrfunc)match_getattr, /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001878};
1879
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001880/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001881/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001882
1883static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001884scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001885{
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001886 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001887 Py_DECREF(self->string);
1888 Py_DECREF(self->pattern);
1889 PyMem_DEL(self);
1890}
1891
1892static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001893scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001894{
1895 SRE_STATE* state = &self->state;
1896 PyObject* match;
1897 int status;
1898
1899 state->ptr = state->start;
1900
1901 if (state->charsize == 1) {
1902 status = sre_match(state, PatternObject_GetCode(self->pattern));
1903 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001904#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001905 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001906#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001907 }
1908
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001909 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001910 state, self->string, status);
1911
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001912 if (status == 0 || state->ptr == state->start)
1913 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001914 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001915 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001916
1917 return match;
1918}
1919
1920
1921static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001922scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001923{
1924 SRE_STATE* state = &self->state;
1925 PyObject* match;
1926 int status;
1927
1928 state->ptr = state->start;
1929
1930 if (state->charsize == 1) {
1931 status = sre_search(state, PatternObject_GetCode(self->pattern));
1932 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001933#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001934 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001935#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001936 }
1937
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001938 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001939 state, self->string, status);
1940
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001941 if (status == 0 || state->ptr == state->start)
1942 state->start = (void*) ((char*) state->ptr + state->charsize);
1943 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001944 state->start = state->ptr;
1945
1946 return match;
1947}
1948
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001949static PyMethodDef scanner_methods[] = {
1950 {"match", (PyCFunction) scanner_match, 0},
1951 {"search", (PyCFunction) scanner_search, 0},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001952 {NULL, NULL}
1953};
1954
1955static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001956scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957{
1958 PyObject* res;
1959
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001960 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001961 if (res)
1962 return res;
1963
1964 PyErr_Clear();
1965
1966 /* attributes */
1967 if (!strcmp(name, "pattern")) {
1968 Py_INCREF(self->pattern);
1969 return self->pattern;
1970 }
1971
1972 PyErr_SetString(PyExc_AttributeError, name);
1973 return NULL;
1974}
1975
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001976statichere PyTypeObject Scanner_Type = {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001977 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001978 0, "SRE_Scanner",
1979 sizeof(ScannerObject), /* size of basic object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001980 0,
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001981 (destructor)scanner_dealloc, /*tp_dealloc*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001982 0, /*tp_print*/
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001983 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001984};
1985
Guido van Rossumb700df92000-03-31 14:59:30 +00001986static PyMethodDef _functions[] = {
1987 {"compile", _compile, 1},
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001988 {"getcodesize", sre_codesize, 1},
Fredrik Lundhb389df32000-06-29 12:48:37 +00001989 {"getlower", sre_getlower, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001990 {NULL, NULL}
1991};
1992
1993void
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001994#if defined(WIN32)
Guido van Rossumb700df92000-03-31 14:59:30 +00001995__declspec(dllexport)
1996#endif
1997init_sre()
1998{
1999 /* Patch object types */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002000 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002001 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002002
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002003 Py_InitModule("_" MODULE, _functions);
Guido van Rossumb700df92000-03-31 14:59:30 +00002004}
2005
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002006#endif /* !defined(SRE_RECURSIVE) */