blob: b87282c1191f69b73d048facfe0a815a472aeb86 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00007 * 99-10-24 fl created (based on existing template matcher code)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00008 * 00-03-06 fl first alpha, sort of (0.5)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00009 * 00-06-30 fl added fast search optimization (0.9.3)
10 * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
11 * 00-07-02 fl added charset optimizations, etc (0.9.5)
12 * 00-07-03 fl store code in pattern object, lookbehind, etc
13 * 00-07-08 fl added regs attribute
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000014 * 00-07-21 fl reset lastindex in scanner methods (0.9.6)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000015 * 00-08-01 fl fixes for 1.6b1 (0.9.8)
Fredrik Lundh96ab4652000-08-03 16:29:50 +000016 * 00-08-03 fl added recursion limit
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000017 * 00-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh58100642000-08-09 09:14:35 +000018 * 00-08-08 fl changed findall to return empty strings instead of None
Guido van Rossumb700df92000-03-31 14:59:30 +000019 *
20 * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
21 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000022 * This version of the SRE library can be redistributed under CNRI's
23 * Python 1.6 license. For any other use, please contact Secret Labs
24 * AB (info@pythonware.com).
25 *
Guido van Rossumb700df92000-03-31 14:59:30 +000026 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000027 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000028 * other compatibility work.
29 */
30
31#ifndef SRE_RECURSIVE
32
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033char copyright[] = " SRE 0.9.8 Copyright (c) 1997-2000 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000034
35#include "Python.h"
36
37#include "sre.h"
38
Guido van Rossumb700df92000-03-31 14:59:30 +000039#if defined(HAVE_LIMITS_H)
40#include <limits.h>
41#else
42#define INT_MAX 2147483647
43#endif
44
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000045#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000046
Fredrik Lundh436c3d52000-06-29 08:58:44 +000047/* name of this module, minus the leading underscore */
48#define MODULE "sre"
49
Guido van Rossumb700df92000-03-31 14:59:30 +000050/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000051#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000052
Fredrik Lundh436c3d52000-06-29 08:58:44 +000053#if PY_VERSION_HEX >= 0x01060000
Fredrik Lundh22d25462000-07-01 17:50:59 +000054/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000055#define HAVE_UNICODE
56#endif
57
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000058/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000059/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000060
Fredrik Lundh96ab4652000-08-03 16:29:50 +000061/* prevent run-away recursion (bad patterns on long strings) */
Fredrik Lundh18c2aa22000-08-07 17:33:38 +000062#if !defined(USE_STACKCHECK)
Fredrik Lundh96ab4652000-08-03 16:29:50 +000063#define USE_RECURSION_LIMIT 10000
Fredrik Lundh18c2aa22000-08-07 17:33:38 +000064#endif
Fredrik Lundh96ab4652000-08-03 16:29:50 +000065
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000067#define USE_FAST_SEARCH
68
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000069/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000070#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000071
72/* -------------------------------------------------------------------- */
73
Fredrik Lundh80946112000-06-29 18:03:25 +000074#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000075#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000076#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000077/* fastest possible local call under MSVC */
78#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000079#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000080#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000081#else
82#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000083#endif
84
85/* error codes */
86#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000087#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000088#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000089#define SRE_ERROR_MEMORY -9 /* out of memory */
90
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000091#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +000092#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000093#else
94#define TRACE(v)
95#endif
96
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000097/* -------------------------------------------------------------------- */
98/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000099
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000100/* default character predicates (run sre_chars.py to regenerate tables) */
101
102#define SRE_DIGIT_MASK 1
103#define SRE_SPACE_MASK 2
104#define SRE_LINEBREAK_MASK 4
105#define SRE_ALNUM_MASK 8
106#define SRE_WORD_MASK 16
107
/* Flag table for the 128 ASCII codes; entry N is the OR of the SRE_*_MASK
   bits that hold for character code N:
     25 = DIGIT | ALNUM | WORD    ('0'..'9')
     24 = ALNUM | WORD            ('A'..'Z', 'a'..'z')
     16 = WORD                    ('_')
      6 = SPACE | LINEBREAK       ('\n')
      2 = SPACE                   ('\t', '\v', '\f', '\r', ' ')
   Codes of 128 and above are not covered by this table. */
static char sre_char_info[128] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /* 0x60 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};
115
/* ASCII lower-casing table: every code maps to itself except 'A'..'Z'
   (65..90), which map to 'a'..'z' (97..122). */
static char sre_char_lower[128] = {
    /* 0x00 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /* 0x10 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /* 0x20 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /* 0x30 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /* 0x40 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
               93, 94, 95,
    /* 0x60 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
               124, 125, 126, 127
};

/* Lower-case a character code using the ASCII table above.  Codes of
   128 and above are returned unchanged. */
static unsigned int sre_lower(unsigned int ch)
{
    if (ch >= 128)
        return ch;
    return sre_char_lower[ch];
}
130
131#define SRE_IS_DIGIT(ch)\
132 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
133#define SRE_IS_SPACE(ch)\
134 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
135#define SRE_IS_LINEBREAK(ch)\
136 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
137#define SRE_IS_ALNUM(ch)\
138 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
139#define SRE_IS_WORD(ch)\
140 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000141
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000142/* locale-specific character predicates */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000143
/* Lower-case a character code through the C library's tolower(), which
   follows the current locale.  Only codes below 256 are handed to
   tolower(); anything wider is returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256)
        return tolower(ch);
    return ch;
}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000148#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
149#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
150#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
151#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
152#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
153
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000154/* unicode-specific character predicates */
155
156#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000157static unsigned int sre_lower_unicode(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000158{
159 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
160}
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000161#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
162#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
163#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000164#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000165#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000166#endif
167
Guido van Rossumb700df92000-03-31 14:59:30 +0000168LOCAL(int)
169sre_category(SRE_CODE category, unsigned int ch)
170{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000171 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000172
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000173 case SRE_CATEGORY_DIGIT:
174 return SRE_IS_DIGIT(ch);
175 case SRE_CATEGORY_NOT_DIGIT:
176 return !SRE_IS_DIGIT(ch);
177 case SRE_CATEGORY_SPACE:
178 return SRE_IS_SPACE(ch);
179 case SRE_CATEGORY_NOT_SPACE:
180 return !SRE_IS_SPACE(ch);
181 case SRE_CATEGORY_WORD:
182 return SRE_IS_WORD(ch);
183 case SRE_CATEGORY_NOT_WORD:
184 return !SRE_IS_WORD(ch);
185 case SRE_CATEGORY_LINEBREAK:
186 return SRE_IS_LINEBREAK(ch);
187 case SRE_CATEGORY_NOT_LINEBREAK:
188 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000189
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000190 case SRE_CATEGORY_LOC_WORD:
191 return SRE_LOC_IS_WORD(ch);
192 case SRE_CATEGORY_LOC_NOT_WORD:
193 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000194
195#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000196 case SRE_CATEGORY_UNI_DIGIT:
197 return SRE_UNI_IS_DIGIT(ch);
198 case SRE_CATEGORY_UNI_NOT_DIGIT:
199 return !SRE_UNI_IS_DIGIT(ch);
200 case SRE_CATEGORY_UNI_SPACE:
201 return SRE_UNI_IS_SPACE(ch);
202 case SRE_CATEGORY_UNI_NOT_SPACE:
203 return !SRE_UNI_IS_SPACE(ch);
204 case SRE_CATEGORY_UNI_WORD:
205 return SRE_UNI_IS_WORD(ch);
206 case SRE_CATEGORY_UNI_NOT_WORD:
207 return !SRE_UNI_IS_WORD(ch);
208 case SRE_CATEGORY_UNI_LINEBREAK:
209 return SRE_UNI_IS_LINEBREAK(ch);
210 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
211 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000212#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000213 }
214 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000215}
216
217/* helpers */
218
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000219static void
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000220mark_fini(SRE_STATE* state)
221{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000222 if (state->mark_stack) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000223 free(state->mark_stack);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000224 state->mark_stack = NULL;
225 }
226 state->mark_stack_size = state->mark_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000227}
228
229static int
230mark_save(SRE_STATE* state, int lo, int hi)
231{
232 void* stack;
233 int size;
234 int minsize, newsize;
235
236 if (hi <= lo)
237 return 0;
238
239 size = (hi - lo) + 1;
240
241 newsize = state->mark_stack_size;
242 minsize = state->mark_stack_base + size;
243
244 if (newsize < minsize) {
245 /* create new stack */
246 if (!newsize) {
247 newsize = 512;
248 if (newsize < minsize)
249 newsize = minsize;
250 TRACE(("allocate stack %d\n", newsize));
251 stack = malloc(sizeof(void*) * newsize);
252 } else {
253 /* grow the stack */
254 while (newsize < minsize)
255 newsize += newsize;
256 TRACE(("grow stack to %d\n", newsize));
257 stack = realloc(state->mark_stack, sizeof(void*) * newsize);
258 }
259 if (!stack) {
260 mark_fini(state);
261 return SRE_ERROR_MEMORY;
262 }
263 state->mark_stack = stack;
264 state->mark_stack_size = newsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000265 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000266
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000267 TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000268
269 memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
270 size * sizeof(void*));
271
272 state->mark_stack_base += size;
273
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000274 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000275}
276
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000277static int
278mark_restore(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000279{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000280 int size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000281
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000282 if (hi <= lo)
283 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000284
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000285 size = (hi - lo) + 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000286
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000287 state->mark_stack_base -= size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000288
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000289 TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
Guido van Rossumb700df92000-03-31 14:59:30 +0000290
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000291 memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
292 size * sizeof(void*));
Guido van Rossumb700df92000-03-31 14:59:30 +0000293
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000294 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000295}
296
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000297/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000298
299#define SRE_CHAR unsigned char
300#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000301#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000302#define SRE_CHARSET sre_charset
303#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000304#define SRE_MATCH sre_match
305#define SRE_SEARCH sre_search
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000306
307#if defined(HAVE_UNICODE)
308
Guido van Rossumb700df92000-03-31 14:59:30 +0000309#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000310#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000311#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000312
Guido van Rossumb700df92000-03-31 14:59:30 +0000313#undef SRE_SEARCH
314#undef SRE_MATCH
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000315#undef SRE_INFO
316#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000317#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000318#undef SRE_AT
319#undef SRE_CHAR
320
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000321/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000322
323#define SRE_CHAR Py_UNICODE
324#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000325#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000326#define SRE_CHARSET sre_ucharset
327#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000328#define SRE_MATCH sre_umatch
329#define SRE_SEARCH sre_usearch
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000330#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000331
332#endif /* SRE_RECURSIVE */
333
334/* -------------------------------------------------------------------- */
335/* String matching engine */
336
337/* the following section is compiled twice, with different character
338 settings */
339
340LOCAL(int)
341SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
342{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000343 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000345 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000346
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000347 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000349 case SRE_AT_BEGINNING:
350 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000351
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000352 case SRE_AT_BEGINNING_LINE:
353 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000354 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000355
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000356 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000357 return (((void*) (ptr+1) == state->end &&
358 SRE_IS_LINEBREAK((int) ptr[0])) ||
359 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000360
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000361 case SRE_AT_END_LINE:
362 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000363 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000364
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000365 case SRE_AT_BOUNDARY:
366 if (state->beginning == state->end)
367 return 0;
368 that = ((void*) ptr > state->beginning) ?
369 SRE_IS_WORD((int) ptr[-1]) : 0;
370 this = ((void*) ptr < state->end) ?
371 SRE_IS_WORD((int) ptr[0]) : 0;
372 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000373
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000374 case SRE_AT_NON_BOUNDARY:
375 if (state->beginning == state->end)
376 return 0;
377 that = ((void*) ptr > state->beginning) ?
378 SRE_IS_WORD((int) ptr[-1]) : 0;
379 this = ((void*) ptr < state->end) ?
380 SRE_IS_WORD((int) ptr[0]) : 0;
381 return this == that;
382 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000383
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000384 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000385}
386
387LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000388SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000389{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000390 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000391
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000392 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000393
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000394 for (;;) {
395 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000396
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000397 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000398 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000399 if (ch == set[0])
400 return ok;
401 set++;
402 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000403
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000404 case SRE_OP_RANGE:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000405 /* <RANGE> <lower> <upper> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000406 if (set[0] <= ch && ch <= set[1])
407 return ok;
408 set += 2;
409 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000410
Fredrik Lundh3562f112000-07-02 12:00:07 +0000411 case SRE_OP_CHARSET:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000412 /* <CHARSET> <bitmap> (16 bits per code word) */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000413 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
414 return ok;
415 set += 16;
416 break;
417
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000418 case SRE_OP_CATEGORY:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000419 /* <CATEGORY> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000420 if (sre_category(set[0], (int) ch))
421 return ok;
422 set += 1;
423 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000424
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000425 case SRE_OP_NEGATE:
426 ok = !ok;
427 break;
428
429 case SRE_OP_FAILURE:
430 return !ok;
431
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000432 default:
433 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000434 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000435 return 0;
436 }
437 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000438}
439
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000440LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level);
441
442LOCAL(int)
443SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
444{
445 SRE_CODE chr;
446 SRE_CHAR* ptr = state->ptr;
447 SRE_CHAR* end = state->end;
448 int i;
449
450 /* adjust end */
451 if (maxcount < end - ptr && maxcount != 65535)
452 end = ptr + maxcount;
453
454 switch (pattern[0]) {
455
456 case SRE_OP_ANY:
457 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000458 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000459 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
460 ptr++;
461 break;
462
463 case SRE_OP_ANY_ALL:
464 /* repeated dot wildcare. skip to the end of the target
465 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000466 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000467 ptr = end;
468 break;
469
470 case SRE_OP_LITERAL:
471 /* repeated literal */
472 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000473 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000474 while (ptr < end && (SRE_CODE) *ptr == chr)
475 ptr++;
476 break;
477
478 case SRE_OP_LITERAL_IGNORE:
479 /* repeated literal */
480 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000481 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000482 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
483 ptr++;
484 break;
485
486 case SRE_OP_NOT_LITERAL:
487 /* repeated non-literal */
488 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000489 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000490 while (ptr < end && (SRE_CODE) *ptr != chr)
491 ptr++;
492 break;
493
494 case SRE_OP_NOT_LITERAL_IGNORE:
495 /* repeated non-literal */
496 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000497 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000498 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
499 ptr++;
500 break;
501
502 case SRE_OP_IN:
503 /* repeated set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000504 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
505 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000506 ptr++;
507 break;
508
509 default:
510 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000511 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000512 while ((SRE_CHAR*) state->ptr < end) {
513 i = SRE_MATCH(state, pattern, level);
514 if (i < 0)
515 return i;
516 if (!i)
517 break;
518 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000519 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
520 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000521 return (SRE_CHAR*) state->ptr - ptr;
522 }
523
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000524 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000525 return ptr - (SRE_CHAR*) state->ptr;
526}
527
Guido van Rossumb700df92000-03-31 14:59:30 +0000528LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000529SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
530{
531 /* check if an SRE_OP_INFO block matches at the current position.
532 returns the number of SRE_CODE objects to skip if successful, 0
533 if no match */
534
535 SRE_CHAR* end = state->end;
536 SRE_CHAR* ptr = state->ptr;
537 int i;
538
539 /* check minimal length */
540 if (pattern[3] && (end - ptr) < pattern[3])
541 return 0;
542
543 /* check known prefix */
544 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
545 /* <length> <skip> <prefix data> <overlap data> */
546 for (i = 0; i < pattern[5]; i++)
547 if ((SRE_CODE) ptr[i] != pattern[7 + i])
548 return 0;
549 return pattern[0] + 2 * pattern[6];
550 }
551 return pattern[0];
552}
553
554LOCAL(int)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000555SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
Guido van Rossumb700df92000-03-31 14:59:30 +0000556{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000557 /* check if string matches the given pattern. returns <0 for
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000558 error, 0 for failure, and 1 for success */
Guido van Rossumb700df92000-03-31 14:59:30 +0000559
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000560 SRE_CHAR* end = state->end;
561 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000562 int i, count;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000563 SRE_REPEAT* rp;
564 int lastmark;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000565 SRE_CODE chr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000566
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000567 SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000568
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000569 TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000570
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000571#if defined(USE_STACKCHECK)
Fredrik Lundh58100642000-08-09 09:14:35 +0000572 if (level % 10 == 0 && PyOS_CheckStack())
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000573 return SRE_ERROR_RECURSION_LIMIT;
574#endif
575
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000576#if defined(USE_RECURSION_LIMIT)
577 if (level > USE_RECURSION_LIMIT)
578 return SRE_ERROR_RECURSION_LIMIT;
579#endif
580
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000581 if (pattern[0] == SRE_OP_INFO) {
582 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000583 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000584 if (pattern[3] && (end - ptr) < pattern[3]) {
585 TRACE(("reject (got %d chars, need %d)\n",
586 (end - ptr), pattern[3]));
587 return 0;
588 }
589 pattern += pattern[1] + 1;
590 }
591
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000592 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000593
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000594 switch (*pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000595
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000596 case SRE_OP_FAILURE:
597 /* immediate failure */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000598 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000599 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000600
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000601 case SRE_OP_SUCCESS:
602 /* end of pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000603 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000604 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000605 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000606
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000607 case SRE_OP_AT:
608 /* match at given position */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000609 /* <AT> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000610 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000611 if (!SRE_AT(state, ptr, *pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000612 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000613 pattern++;
614 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000615
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000616 case SRE_OP_CATEGORY:
617 /* match at given category */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000618 /* <CATEGORY> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000619 TRACE(("|%p|%p|CATEGORY %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000620 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000621 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000622 pattern++;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000623 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000624 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000625
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000626 case SRE_OP_LITERAL:
627 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000628 /* <LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000629 TRACE(("|%p|%p|LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000630 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000631 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000632 pattern++;
633 ptr++;
634 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000635
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000636 case SRE_OP_NOT_LITERAL:
637 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000638 /* <NOT_LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000639 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000640 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000641 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000642 pattern++;
643 ptr++;
644 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000645
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000646 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000647 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000648 /* <ANY> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000649 TRACE(("|%p|%p|ANY\n", pattern, ptr));
Fredrik Lundhe1869832000-08-01 22:47:49 +0000650 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
651 return 0;
652 ptr++;
653 break;
654
655 case SRE_OP_ANY_ALL:
656 /* match anything */
657 /* <ANY_ALL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000658 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000659 if (ptr >= end)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000660 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000661 ptr++;
662 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000663
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000664 case SRE_OP_IN:
665 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000666 /* <IN> <skip> <set> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000667 TRACE(("|%p|%p|IN\n", pattern, ptr));
668 if (ptr >= end || !SRE_CHARSET(pattern + 1, *ptr))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000669 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000670 pattern += pattern[0];
671 ptr++;
672 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000673
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000674 case SRE_OP_GROUPREF:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000675 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000676 TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000677 i = pattern[0];
678 {
679 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
680 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
681 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000682 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000683 while (p < e) {
684 if (ptr >= end || *ptr != *p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000685 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000686 p++; ptr++;
687 }
688 }
689 pattern++;
690 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000691
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000692 case SRE_OP_GROUPREF_IGNORE:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000693 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000694 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000695 i = pattern[0];
696 {
697 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
698 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
699 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000700 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000701 while (p < e) {
702 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000703 state->lower(*ptr) != state->lower(*p))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000704 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000705 p++; ptr++;
706 }
707 }
708 pattern++;
709 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000710
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000711 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000712 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000713 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000714 state->lower(*ptr) != state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000715 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000716 pattern++;
717 ptr++;
718 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000719
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000720 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000721 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000722 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000723 state->lower(*ptr) == state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000724 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000725 pattern++;
726 ptr++;
727 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000728
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000729 case SRE_OP_IN_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000730 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000731 if (ptr >= end
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000732 || !SRE_CHARSET(pattern + 1, (SRE_CODE) state->lower(*ptr)))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000733 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000734 pattern += pattern[0];
735 ptr++;
736 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000737
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000738 case SRE_OP_MARK:
739 /* set mark */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000740 /* <MARK> <gid> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000741 TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000742 i = pattern[0];
743 if (i & 1)
744 state->lastindex = i/2 + 1;
745 if (i > state->lastmark)
746 state->lastmark = i;
747 state->mark[i] = ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000748 pattern++;
749 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000750
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000751 case SRE_OP_JUMP:
752 case SRE_OP_INFO:
753 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000754 /* <JUMP> <offset> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000755 TRACE(("|%p|%p|JUMP %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000756 pattern += pattern[0];
757 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000758
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000759 case SRE_OP_ASSERT:
760 /* assert subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000761 /* <ASSERT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000762 TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000763 state->ptr = ptr - pattern[1];
Fredrik Lundh6f013982000-07-03 18:44:21 +0000764 if (state->ptr < state->beginning)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000765 return 0;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000766 i = SRE_MATCH(state, pattern + 2, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000767 if (i <= 0)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000768 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000769 if (pattern[1] > 0 && state->ptr != ptr)
770 return SRE_ERROR_STATE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000771 pattern += pattern[0];
772 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000773
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000774 case SRE_OP_ASSERT_NOT:
775 /* assert not subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000776 /* <ASSERT_NOT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000777 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000778 state->ptr = ptr - pattern[1];
Fredrik Lundh6f013982000-07-03 18:44:21 +0000779 if (state->ptr < state->beginning)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000780 return 0;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000781 i = SRE_MATCH(state, pattern + 2, level + 1);
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000782 if (i < 0)
783 return i;
784 if (i)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000785 return 0;
786 if (pattern[1] > 0 && state->ptr != ptr)
787 return SRE_ERROR_STATE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000788 pattern += pattern[0];
789 break;
790
791 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000792 /* alternation */
793 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000794 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000795 lastmark = state->lastmark;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000796 for (; pattern[0]; pattern += pattern[0]) {
797 if (pattern[1] == SRE_OP_LITERAL &&
798 (ptr >= end || (SRE_CODE) *ptr != pattern[2]))
799 continue;
800 if (pattern[1] == SRE_OP_IN &&
801 (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr)))
802 continue;
803 state->ptr = ptr;
804 i = SRE_MATCH(state, pattern + 1, level + 1);
805 if (i)
806 return i;
807 if (state->lastmark > lastmark) {
808 memset(
809 state->mark + lastmark + 1, 0,
810 (state->lastmark - lastmark) * sizeof(void*)
811 );
812 state->lastmark = lastmark;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000813 }
814 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000815 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000816
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000817 case SRE_OP_REPEAT_ONE:
818 /* match repeated sequence (maximizing regexp) */
819
820 /* this operator only works if the repeated item is
821 exactly one character wide, and we're not already
822 collecting backtracking points. for other cases,
823 use the MAX_REPEAT operator instead */
824
825 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
826
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000827 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000828 pattern[1], pattern[2]));
829
Fredrik Lundhe1869832000-08-01 22:47:49 +0000830 if (ptr + pattern[1] > end)
831 return 0; /* cannot match */
832
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000833 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000834
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000835 count = SRE_COUNT(state, pattern + 3, pattern[2], level + 1);
836 if (count < 0)
837 return count;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000838
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000839 ptr += count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000840
841 /* when we arrive here, count contains the number of
842 matches, and ptr points to the tail of the target
843 string. check if the rest of the pattern matches,
844 and backtrack if not. */
845
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000846 if (count < (int) pattern[1])
847 return 0;
848
849 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
850 /* tail is empty. we're finished */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000851 state->ptr = ptr;
852 return 1;
853
854 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
855 /* tail starts with a literal. skip positions where
856 the rest of the pattern cannot possibly match */
Fredrik Lundhe1869832000-08-01 22:47:49 +0000857 chr = pattern[pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000858 for (;;) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000859 while (count >= (int) pattern[1] &&
860 (ptr >= end || *ptr != chr)) {
861 ptr--;
862 count--;
863 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000864 if (count < (int) pattern[1])
865 break;
866 state->ptr = ptr;
867 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000868 if (i)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000869 return 1;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000870 ptr--;
871 count--;
872 }
873
874 } else {
875 /* general case */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000876 lastmark = state->lastmark;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000877 while (count >= (int) pattern[1]) {
878 state->ptr = ptr;
879 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000880 if (i)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000881 return 1;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000882 ptr--;
883 count--;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000884 if (state->lastmark > lastmark) {
885 memset(
886 state->mark + lastmark + 1, 0,
887 (state->lastmark - lastmark) * sizeof(void*)
888 );
889 state->lastmark = lastmark;
890 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000891 }
892 }
893 return 0;
894
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000895 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000896 /* create repeat context. all the hard work is done
897 by the UNTIL operator */
898 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000899 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000900 pattern[1], pattern[2]));
901
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000902 rep.count = -1;
903 rep.pattern = pattern;
904
905 /* install new repeat context */
906 rep.prev = state->repeat;
907 state->repeat = &rep;
908
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000909 state->ptr = ptr;
910 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000911
912 state->repeat = rep.prev;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000913
914 return i;
915
916 case SRE_OP_MAX_UNTIL:
917 /* maximizing repeat */
918 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
919
920 /* FIXME: we probably need to deal with zero-width
921 matches in here... */
922
923 rp = state->repeat;
924 if (!rp)
925 return SRE_ERROR_STATE;
926
927 state->ptr = ptr;
928
929 count = rp->count + 1;
930
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000931 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000932
933 if (count < rp->pattern[1]) {
934 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000935 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000936 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000937 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000938 if (i)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000939 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000940 rp->count = count - 1;
941 state->ptr = ptr;
942 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000943 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000944
945 if (count < rp->pattern[2] || rp->pattern[2] == 65535) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000946 /* we may have enough matches, but if we can
947 match another item, do so */
948 rp->count = count;
949 lastmark = state->lastmark;
950 mark_save(state, 0, lastmark);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000951 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000952 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000953 if (i)
954 return i;
955 mark_restore(state, 0, lastmark);
956 rp->count = count - 1;
957 state->ptr = ptr;
958 }
959
960 /* cannot match more repeated items here. make sure the
961 tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000962 state->repeat = rp->prev;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000963 i = SRE_MATCH(state, pattern, level + 1);
964 if (i)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000965 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000966 state->repeat = rp;
967 return 0;
968
969 case SRE_OP_MIN_UNTIL:
970 /* minimizing repeat */
971 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
972
973 rp = state->repeat;
974 if (!rp)
975 return SRE_ERROR_STATE;
976
977 count = rp->count + 1;
978
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000979 TRACE(("|%p|%p|MIN_UNTIL %d\n", pattern, ptr, count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000980
981 state->ptr = ptr;
982
983 if (count < rp->pattern[1]) {
984 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000985 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000986 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000987 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000988 if (i)
989 return i;
990 rp->count = count-1;
991 state->ptr = ptr;
992 return 0;
993 }
994
995 /* see if the tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000996 state->repeat = rp->prev;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000997 i = SRE_MATCH(state, pattern, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000998 if (i) {
999 /* free(rp); */
1000 return i;
1001 }
1002 state->repeat = rp;
1003
1004 if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
1005 return 0;
1006
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001007 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001008 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001009 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001010 if (i)
1011 return i;
1012 rp->count = count - 1;
1013 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001014
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001015 default:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001016 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001017 return SRE_ERROR_ILLEGAL;
1018 }
1019 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001020
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001021 /* shouldn't end up here */
1022 return SRE_ERROR_ILLEGAL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001023}
1024
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001025LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001026SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1027{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001028 SRE_CHAR* ptr = state->start;
1029 SRE_CHAR* end = state->end;
1030 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001031 int prefix_len = 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001032 int prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001033 SRE_CODE* prefix = NULL;
1034 SRE_CODE* charset = NULL;
1035 SRE_CODE* overlap = NULL;
1036 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001037
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001038 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001039 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001040 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001041
1042 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001043
1044 if (pattern[3] > 0) {
1045 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001046 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001047 end -= pattern[3]-1;
1048 if (end <= ptr)
1049 end = ptr+1;
1050 }
1051
Fredrik Lundh3562f112000-07-02 12:00:07 +00001052 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001053 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001054 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001055 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001056 prefix_skip = pattern[6];
1057 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001058 overlap = prefix + prefix_len - 1;
1059 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001060 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001061 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001062 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001063
1064 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001065 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001066
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001067 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1068 TRACE(("charset = %p\n", charset));
1069
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001070#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001071 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001072 /* pattern starts with a known prefix. use the overlap
1073 table to skip forward as fast as we possibly can */
1074 int i = 0;
1075 end = state->end;
1076 while (ptr < end) {
1077 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001078 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001079 if (!i)
1080 break;
1081 else
1082 i = overlap[i];
1083 } else {
1084 if (++i == prefix_len) {
1085 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001086 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1087 state->start = ptr + 1 - prefix_len;
1088 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001089 if (flags & SRE_INFO_LITERAL)
1090 return 1; /* we got all of it */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001091 status = SRE_MATCH(state, pattern + 2*prefix_skip, 1);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001092 if (status != 0)
1093 return status;
1094 /* close but no cigar -- try again */
1095 i = overlap[i];
1096 }
1097 break;
1098 }
1099
1100 }
1101 ptr++;
1102 }
1103 return 0;
1104 }
1105#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001106
Fredrik Lundh3562f112000-07-02 12:00:07 +00001107 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001108 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001109 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001110 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001111 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001112 for (;;) {
1113 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1114 ptr++;
1115 if (ptr == end)
1116 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001117 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001118 state->start = ptr;
1119 state->ptr = ++ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001120 status = SRE_MATCH(state, pattern + 2, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001121 if (status != 0)
1122 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001123 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001124 } else if (charset) {
1125 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001126 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001127 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001128 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001129 ptr++;
1130 if (ptr == end)
1131 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001132 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001133 state->start = ptr;
1134 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001135 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001136 if (status != 0)
1137 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001138 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001139 }
1140 } else
1141 /* general case */
1142 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001143 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001144 state->start = state->ptr = ptr++;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001145 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001146 if (status != 0)
1147 break;
1148 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001149
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001150 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001151}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001152
Guido van Rossumb700df92000-03-31 14:59:30 +00001153
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001154#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001155
1156/* -------------------------------------------------------------------- */
1157/* factories and destructors */
1158
1159/* see sre.h for object declarations */
1160
1161staticforward PyTypeObject Pattern_Type;
1162staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001163staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001164
1165static PyObject *
1166_compile(PyObject* self_, PyObject* args)
1167{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001168 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001169
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001170 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001171 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001172
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001173 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001174 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001175 PyObject* code;
1176 int groups = 0;
1177 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001178 PyObject* indexgroup = NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001179 if (!PyArg_ParseTuple(args, "OiO|iOO", &pattern, &flags, &code,
Fredrik Lundhc2301732000-07-02 22:25:39 +00001180 &groups, &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001181 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001182
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001183 code = PySequence_Fast(code, "code argument must be a sequence");
1184 if (!code)
1185 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001186
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001187#if PY_VERSION_HEX >= 0x01060000
Jeremy Hylton03657cf2000-07-12 13:05:33 +00001188 n = PySequence_Size(code);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001189#else
1190 n = PySequence_Length(code);
1191#endif
Fredrik Lundh6f013982000-07-03 18:44:21 +00001192
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001193 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n);
1194 if (!self) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00001195 Py_DECREF(code);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001196 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001197 }
1198
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001199 for (i = 0; i < n; i++) {
1200 PyObject *o = PySequence_Fast_GET_ITEM(code, i);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001201 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001202 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001203
1204 Py_DECREF(code);
1205
1206 if (PyErr_Occurred())
1207 return NULL;
1208
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001209 Py_INCREF(pattern);
1210 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001211
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001212 self->flags = flags;
1213
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001214 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001215
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001216 Py_XINCREF(groupindex);
1217 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001218
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001219 Py_XINCREF(indexgroup);
1220 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001221
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001222 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001223}
1224
1225static PyObject *
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001226sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001227{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001228 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001229}
1230
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001231static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001232sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001233{
1234 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001235 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001236 return NULL;
1237 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001238 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001239#if defined(HAVE_UNICODE)
1240 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001241 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001242#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001243 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001244}
1245
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001246LOCAL(void)
1247state_reset(SRE_STATE* state)
1248{
1249 int i;
1250
1251 state->lastmark = 0;
1252
1253 /* FIXME: dynamic! */
1254 for (i = 0; i < SRE_MARK_SIZE; i++)
1255 state->mark[i] = NULL;
1256
1257 state->lastindex = -1;
1258
1259 state->repeat = NULL;
1260
1261 mark_fini(state);
1262}
1263
Guido van Rossumb700df92000-03-31 14:59:30 +00001264LOCAL(PyObject*)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001265state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1266 int start, int end)
Guido van Rossumb700df92000-03-31 14:59:30 +00001267{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001268 /* prepare state object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001269
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001270 PyBufferProcs *buffer;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001271 int size, bytes;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001272 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001273
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001274 memset(state, 0, sizeof(SRE_STATE));
1275
1276 state->lastindex = -1;
1277
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001278 /* get pointer to string buffer */
1279 buffer = string->ob_type->tp_as_buffer;
1280 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1281 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001282 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001283 return NULL;
1284 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001285
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001286 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001287 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1288 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001289 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1290 return NULL;
1291 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001292
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001293 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001294
1295#if PY_VERSION_HEX >= 0x01060000
1296 size = PyObject_Size(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001297#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001298 size = PyObject_Length(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001299#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001300
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001301 if (PyString_Check(string) || bytes == size)
1302 state->charsize = 1;
1303#if defined(HAVE_UNICODE)
1304 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
1305 state->charsize = sizeof(Py_UNICODE);
1306#endif
1307 else {
1308 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1309 return NULL;
1310 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001311
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001312 /* adjust boundaries */
1313 if (start < 0)
1314 start = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001315 else if (start > size)
1316 start = size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001317
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001318 if (end < 0)
1319 end = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001320 else if (end > size)
1321 end = size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001322
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001323 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001324
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001325 state->start = (void*) ((char*) ptr + start * state->charsize);
1326 state->end = (void*) ((char*) ptr + end * state->charsize);
1327
1328 Py_INCREF(string);
1329 state->string = string;
1330 state->pos = start;
1331 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001332
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001333 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001334 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001335#if defined(HAVE_UNICODE)
1336 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001337 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001338#endif
1339 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001340 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001341
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001342 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001343}
1344
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001345LOCAL(void)
1346state_fini(SRE_STATE* state)
1347{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001348 Py_XDECREF(state->string);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001349 mark_fini(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001350}
1351
1352LOCAL(PyObject*)
1353state_getslice(SRE_STATE* state, int index, PyObject* string)
1354{
Fredrik Lundh58100642000-08-09 09:14:35 +00001355 int i, j;
1356
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001357 index = (index - 1) * 2;
1358
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001359 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh58100642000-08-09 09:14:35 +00001360 i = j = 0;
1361 } else {
1362 i = ((char*)state->mark[index] - (char*)state->beginning) /
1363 state->charsize;
1364 j = ((char*)state->mark[index+1] - (char*)state->beginning) /
1365 state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001366 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001367
Fredrik Lundh58100642000-08-09 09:14:35 +00001368 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001369}
1370
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001371static void
1372pattern_error(int status)
1373{
1374 switch (status) {
1375 case SRE_ERROR_RECURSION_LIMIT:
1376 PyErr_SetString(
1377 PyExc_RuntimeError,
1378 "maximum recursion limit exceeded"
1379 );
1380 break;
1381 case SRE_ERROR_MEMORY:
1382 PyErr_NoMemory();
1383 break;
1384 default:
1385 /* other error codes indicate compiler/engine bugs */
1386 PyErr_SetString(
1387 PyExc_RuntimeError,
1388 "internal error in regular expression engine"
1389 );
1390 }
1391}
1392
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001393static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001394pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001395{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001396 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001397
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001398 MatchObject* match;
1399 int i, j;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001400 char* base;
1401 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001402
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001403 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001404
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001405 /* create match object (with room for extra group marks) */
1406 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001407 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001408 if (!match)
1409 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001410
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001411 Py_INCREF(pattern);
1412 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001413
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001414 Py_INCREF(state->string);
1415 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001416
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001417 match->regs = NULL;
1418 match->groups = pattern->groups+1;
1419
1420 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001421
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001422 base = (char*) state->beginning;
1423 n = state->charsize;
1424
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001425 match->mark[0] = ((char*) state->start - base) / n;
1426 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001427
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001428 for (i = j = 0; i < pattern->groups; i++, j+=2)
1429 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1430 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1431 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1432 } else
1433 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1434
1435 match->pos = state->pos;
1436 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001437
Fredrik Lundh6f013982000-07-03 18:44:21 +00001438 match->lastindex = state->lastindex;
1439
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001440 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001441
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001442 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001443
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001444 /* no match */
1445 Py_INCREF(Py_None);
1446 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001447
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001448 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001449
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001450 /* internal error */
1451 pattern_error(status);
1452 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001453}
1454
1455static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001456pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001457{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001458 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001459
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001460 ScannerObject* self;
1461
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001462 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001463 int start = 0;
1464 int end = INT_MAX;
1465 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1466 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001467
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001468 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001469 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001470 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001471 return NULL;
1472
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001473 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001474 if (!string) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00001475 PyObject_Del(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001476 return NULL;
1477 }
1478
1479 Py_INCREF(pattern);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001480 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001481
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001482 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001483}
1484
Guido van Rossumb700df92000-03-31 14:59:30 +00001485static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001486pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001487{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001488 Py_XDECREF(self->pattern);
1489 Py_XDECREF(self->groupindex);
1490 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001491}
1492
1493static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001494pattern_match(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001495{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001496 SRE_STATE state;
1497 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001498
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001499 PyObject* string;
1500 int start = 0;
1501 int end = INT_MAX;
1502 if (!PyArg_ParseTuple(args, "O|ii:match", &string, &start, &end))
1503 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001504
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001505 string = state_init(&state, self, string, start, end);
1506 if (!string)
1507 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001508
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001509 state.ptr = state.start;
1510
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001511 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1512
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001513 if (state.charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001514 status = sre_match(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001515 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001516#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001517 status = sre_umatch(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001518#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001519 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001520
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001521 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1522
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001523 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001524
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001525 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001526}
1527
1528static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001529pattern_search(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001530{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001531 SRE_STATE state;
1532 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001533
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001534 PyObject* string;
1535 int start = 0;
1536 int end = INT_MAX;
1537 if (!PyArg_ParseTuple(args, "O|ii:search", &string, &start, &end))
1538 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001539
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001540 string = state_init(&state, self, string, start, end);
1541 if (!string)
1542 return NULL;
1543
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001544 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1545
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001546 if (state.charsize == 1) {
1547 status = sre_search(&state, PatternObject_GetCode(self));
1548 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001549#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001550 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001551#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001552 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001553
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001554 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1555
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001556 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001557
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001558 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001559}
1560
1561static PyObject*
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001562call(char* function, PyObject* args)
1563{
1564 PyObject* name;
1565 PyObject* module;
1566 PyObject* func;
1567 PyObject* result;
1568
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001569 name = PyString_FromString(MODULE);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001570 if (!name)
1571 return NULL;
1572 module = PyImport_Import(name);
1573 Py_DECREF(name);
1574 if (!module)
1575 return NULL;
1576 func = PyObject_GetAttrString(module, function);
1577 Py_DECREF(module);
1578 if (!func)
1579 return NULL;
1580 result = PyObject_CallObject(func, args);
1581 Py_DECREF(func);
1582 Py_DECREF(args);
1583 return result;
1584}
1585
1586static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001587pattern_sub(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001588{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001589 PyObject* template;
1590 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001591 PyObject* count = Py_False; /* zero */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001592 if (!PyArg_ParseTuple(args, "OO|O:sub", &template, &string, &count))
1593 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001594
1595 /* delegate to Python code */
1596 return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
1597}
1598
1599static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001600pattern_subn(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001601{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001602 PyObject* template;
1603 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001604 PyObject* count = Py_False; /* zero */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001605 if (!PyArg_ParseTuple(args, "OO|O:subn", &template, &string, &count))
1606 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001607
1608 /* delegate to Python code */
1609 return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
1610}
1611
1612static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001613pattern_split(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001614{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001615 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001616 PyObject* maxsplit = Py_False; /* zero */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001617 if (!PyArg_ParseTuple(args, "O|O:split", &string, &maxsplit))
1618 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001619
1620 /* delegate to Python code */
1621 return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
1622}
1623
1624static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001625pattern_findall(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001626{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001627 SRE_STATE state;
1628 PyObject* list;
1629 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001630 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001631
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001632 PyObject* string;
1633 int start = 0;
1634 int end = INT_MAX;
1635 if (!PyArg_ParseTuple(args, "O|ii:findall", &string, &start, &end))
1636 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001637
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001638 string = state_init(&state, self, string, start, end);
1639 if (!string)
1640 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001641
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001642 list = PyList_New(0);
Guido van Rossumb700df92000-03-31 14:59:30 +00001643
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001644 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001645
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001646 PyObject* item;
1647
1648 state.ptr = state.start;
1649
1650 if (state.charsize == 1) {
1651 status = sre_search(&state, PatternObject_GetCode(self));
1652 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001653#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001654 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001655#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001656 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001657
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001658 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001659
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001660 /* don't bother to build a match object */
1661 switch (self->groups) {
1662 case 0:
1663 item = PySequence_GetSlice(
1664 string,
1665 ((char*) state.start - (char*) state.beginning) /
1666 state.charsize,
1667 ((char*) state.ptr - (char*) state.beginning) /
1668 state.charsize);
1669 if (!item)
1670 goto error;
1671 break;
1672 case 1:
1673 item = state_getslice(&state, 1, string);
1674 if (!item)
1675 goto error;
1676 break;
1677 default:
1678 item = PyTuple_New(self->groups);
1679 if (!item)
1680 goto error;
1681 for (i = 0; i < self->groups; i++) {
1682 PyObject* o = state_getslice(&state, i+1, string);
1683 if (!o) {
1684 Py_DECREF(item);
1685 goto error;
1686 }
1687 PyTuple_SET_ITEM(item, i, o);
1688 }
1689 break;
1690 }
1691
1692 if (PyList_Append(list, item) < 0) {
1693 Py_DECREF(item);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001694 goto error;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001695 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001696
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001697 if (state.ptr == state.start)
1698 state.start = (void*) ((char*) state.ptr + state.charsize);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001699 else
1700 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001701
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001702 } else {
Guido van Rossumb700df92000-03-31 14:59:30 +00001703
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001704 if (status == 0)
1705 break;
1706
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001707 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001708 goto error;
Guido van Rossumb700df92000-03-31 14:59:30 +00001709
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001710 }
1711 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001712
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001713 state_fini(&state);
1714 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00001715
1716error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001717 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001718 state_fini(&state);
1719 return NULL;
1720
Guido van Rossumb700df92000-03-31 14:59:30 +00001721}
1722
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001723static PyMethodDef pattern_methods[] = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001724 {"match", (PyCFunction) pattern_match, 1},
1725 {"search", (PyCFunction) pattern_search, 1},
1726 {"sub", (PyCFunction) pattern_sub, 1},
1727 {"subn", (PyCFunction) pattern_subn, 1},
1728 {"split", (PyCFunction) pattern_split, 1},
1729 {"findall", (PyCFunction) pattern_findall, 1},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001730 /* experimental */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001731 {"scanner", (PyCFunction) pattern_scanner, 1},
1732 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00001733};
1734
1735static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001736pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001737{
1738 PyObject* res;
1739
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001740 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001741
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001742 if (res)
1743 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00001744
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001745 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00001746
1747 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001748 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001749 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001750 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001751 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001752
1753 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001754 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001755
Fredrik Lundh01016fe2000-06-30 00:27:46 +00001756 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001757 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00001758
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001759 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001760 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001761 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001762 }
1763
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001764 PyErr_SetString(PyExc_AttributeError, name);
1765 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001766}
1767
1768statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001769 PyObject_HEAD_INIT(NULL)
1770 0, "SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00001771 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001772 (destructor)pattern_dealloc, /*tp_dealloc*/
1773 0, /*tp_print*/
1774 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001775};
1776
1777/* -------------------------------------------------------------------- */
1778/* match methods */
1779
1780static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001781match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001782{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001783 Py_XDECREF(self->regs);
1784 Py_XDECREF(self->string);
1785 Py_DECREF(self->pattern);
1786 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001787}
1788
1789static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001790match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001791{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001792 if (index < 0 || index >= self->groups) {
1793 /* raise IndexError if we were given a bad group number */
1794 PyErr_SetString(
1795 PyExc_IndexError,
1796 "no such group"
1797 );
1798 return NULL;
1799 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001800
Fredrik Lundh6f013982000-07-03 18:44:21 +00001801 index *= 2;
1802
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001803 if (self->string == Py_None || self->mark[index] < 0) {
1804 /* return default value if the string or group is undefined */
1805 Py_INCREF(def);
1806 return def;
1807 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001808
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001809 return PySequence_GetSlice(
1810 self->string, self->mark[index], self->mark[index+1]
1811 );
Guido van Rossumb700df92000-03-31 14:59:30 +00001812}
1813
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001814static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001815match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001816{
Fredrik Lundh6f013982000-07-03 18:44:21 +00001817 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001818
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001819 if (PyInt_Check(index))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001820 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001821
Fredrik Lundh6f013982000-07-03 18:44:21 +00001822 i = -1;
1823
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001824 if (self->pattern->groupindex) {
1825 index = PyObject_GetItem(self->pattern->groupindex, index);
1826 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00001827 if (PyInt_Check(index))
1828 i = (int) PyInt_AS_LONG(index);
1829 Py_DECREF(index);
1830 } else
1831 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001832 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001833
1834 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001835}
1836
1837static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001838match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001839{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001840 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001841}
1842
1843static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001844match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001845{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001846 PyObject* result;
1847 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001848
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001849 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001850
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001851 switch (size) {
1852 case 0:
1853 result = match_getslice(self, Py_False, Py_None);
1854 break;
1855 case 1:
1856 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
1857 break;
1858 default:
1859 /* fetch multiple items */
1860 result = PyTuple_New(size);
1861 if (!result)
1862 return NULL;
1863 for (i = 0; i < size; i++) {
1864 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001865 self, PyTuple_GET_ITEM(args, i), Py_None
1866 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001867 if (!item) {
1868 Py_DECREF(result);
1869 return NULL;
1870 }
1871 PyTuple_SET_ITEM(result, i, item);
1872 }
1873 break;
1874 }
1875 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00001876}
1877
1878static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001879match_groups(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001880{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001881 PyObject* result;
1882 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00001883
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001884 PyObject* def = Py_None;
1885 if (!PyArg_ParseTuple(args, "|O:groups", &def))
1886 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001887
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001888 result = PyTuple_New(self->groups-1);
1889 if (!result)
1890 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001891
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001892 for (index = 1; index < self->groups; index++) {
1893 PyObject* item;
1894 item = match_getslice_by_index(self, index, def);
1895 if (!item) {
1896 Py_DECREF(result);
1897 return NULL;
1898 }
1899 PyTuple_SET_ITEM(result, index-1, item);
1900 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001901
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001902 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00001903}
1904
1905static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001906match_groupdict(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001907{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001908 PyObject* result;
1909 PyObject* keys;
1910 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00001911
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001912 PyObject* def = Py_None;
1913 if (!PyArg_ParseTuple(args, "|O:groupdict", &def))
1914 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001915
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 result = PyDict_New();
1917 if (!result || !self->pattern->groupindex)
1918 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00001919
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001920 keys = PyMapping_Keys(self->pattern->groupindex);
1921 if (!keys) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001922 Py_DECREF(result);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001923 return NULL;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001924 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001925
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001926 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
1927 PyObject* key;
1928 PyObject* item;
1929 key = PyList_GET_ITEM(keys, index);
1930 if (!key) {
1931 Py_DECREF(keys);
1932 Py_DECREF(result);
1933 return NULL;
1934 }
1935 item = match_getslice(self, key, def);
1936 if (!item) {
1937 Py_DECREF(key);
1938 Py_DECREF(keys);
1939 Py_DECREF(result);
1940 return NULL;
1941 }
1942 /* FIXME: <fl> this can fail, right? */
1943 PyDict_SetItem(result, key, item);
1944 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001945
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001946 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00001947
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001948 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00001949}
1950
1951static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001952match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001953{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001954 int index;
1955
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001956 PyObject* index_ = Py_False; /* zero */
1957 if (!PyArg_ParseTuple(args, "|O:start", &index_))
1958 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001959
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001960 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001961
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001962 if (index < 0 || index >= self->groups) {
1963 PyErr_SetString(
1964 PyExc_IndexError,
1965 "no such group"
1966 );
1967 return NULL;
1968 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001969
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001970 if (self->mark[index*2] < 0) {
1971 Py_INCREF(Py_None);
1972 return Py_None;
1973 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001974
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001975 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00001976}
1977
1978static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001979match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001980{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001981 int index;
1982
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001983 PyObject* index_ = Py_False; /* zero */
1984 if (!PyArg_ParseTuple(args, "|O:end", &index_))
1985 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001986
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001987 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001988
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001989 if (index < 0 || index >= self->groups) {
1990 PyErr_SetString(
1991 PyExc_IndexError,
1992 "no such group"
1993 );
1994 return NULL;
1995 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001996
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001997 if (self->mark[index*2] < 0) {
1998 Py_INCREF(Py_None);
1999 return Py_None;
2000 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002001
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002002 return Py_BuildValue("i", self->mark[index*2+1]);
2003}
2004
2005LOCAL(PyObject*)
2006_pair(int i1, int i2)
2007{
2008 PyObject* pair;
2009 PyObject* item;
2010
2011 pair = PyTuple_New(2);
2012 if (!pair)
2013 return NULL;
2014
2015 item = PyInt_FromLong(i1);
2016 if (!item)
2017 goto error;
2018 PyTuple_SET_ITEM(pair, 0, item);
2019
2020 item = PyInt_FromLong(i2);
2021 if (!item)
2022 goto error;
2023 PyTuple_SET_ITEM(pair, 1, item);
2024
2025 return pair;
2026
2027 error:
2028 Py_DECREF(pair);
2029 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002030}
2031
2032static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002033match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002034{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002035 int index;
2036
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002037 PyObject* index_ = Py_False; /* zero */
2038 if (!PyArg_ParseTuple(args, "|O:span", &index_))
2039 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002040
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002041 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002042
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002043 if (index < 0 || index >= self->groups) {
2044 PyErr_SetString(
2045 PyExc_IndexError,
2046 "no such group"
2047 );
2048 return NULL;
2049 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002050
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002051 if (self->mark[index*2] < 0) {
2052 Py_INCREF(Py_None);
2053 Py_INCREF(Py_None);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002054 return Py_BuildValue("OO", Py_None, Py_None);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002056
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002057 return _pair(self->mark[index*2], self->mark[index*2+1]);
2058}
2059
2060static PyObject*
2061match_regs(MatchObject* self)
2062{
2063 PyObject* regs;
2064 PyObject* item;
2065 int index;
2066
2067 regs = PyTuple_New(self->groups);
2068 if (!regs)
2069 return NULL;
2070
2071 for (index = 0; index < self->groups; index++) {
2072 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2073 if (!item) {
2074 Py_DECREF(regs);
2075 return NULL;
2076 }
2077 PyTuple_SET_ITEM(regs, index, item);
2078 }
2079
2080 Py_INCREF(regs);
2081 self->regs = regs;
2082
2083 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002084}
2085
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002086static PyMethodDef match_methods[] = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002087 {"group", (PyCFunction) match_group, 1},
2088 {"start", (PyCFunction) match_start, 1},
2089 {"end", (PyCFunction) match_end, 1},
2090 {"span", (PyCFunction) match_span, 1},
2091 {"groups", (PyCFunction) match_groups, 1},
2092 {"groupdict", (PyCFunction) match_groupdict, 1},
2093 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002094};
2095
2096static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002097match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002098{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002099 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002100
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002101 res = Py_FindMethod(match_methods, (PyObject*) self, name);
2102 if (res)
2103 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002104
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002105 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002106
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002107 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002108 if (self->lastindex >= 0)
2109 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00002110 Py_INCREF(Py_None);
2111 return Py_None;
2112 }
2113
2114 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002115 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00002116 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00002117 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00002118 );
2119 if (result)
2120 return result;
2121 PyErr_Clear();
2122 }
2123 Py_INCREF(Py_None);
2124 return Py_None;
2125 }
2126
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002127 if (!strcmp(name, "string")) {
2128 if (self->string) {
2129 Py_INCREF(self->string);
2130 return self->string;
2131 } else {
2132 Py_INCREF(Py_None);
2133 return Py_None;
2134 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002135 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002136
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002137 if (!strcmp(name, "regs")) {
2138 if (self->regs) {
2139 Py_INCREF(self->regs);
2140 return self->regs;
2141 } else
2142 return match_regs(self);
2143 }
2144
2145 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002146 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002147 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002148 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002149
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002150 if (!strcmp(name, "pos"))
2151 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002152
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002153 if (!strcmp(name, "endpos"))
2154 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00002155
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002156 PyErr_SetString(PyExc_AttributeError, name);
2157 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002158}
2159
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2162
2163statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002164 PyObject_HEAD_INIT(NULL)
2165 0, "SRE_Match",
2166 sizeof(MatchObject), sizeof(int),
2167 (destructor)match_dealloc, /*tp_dealloc*/
2168 0, /*tp_print*/
2169 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002170};
2171
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002172/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002173/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002174
2175static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002176scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002177{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002178 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002179 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002180 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002181}
2182
2183static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002184scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002185{
2186 SRE_STATE* state = &self->state;
2187 PyObject* match;
2188 int status;
2189
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002190 state_reset(state);
2191
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002192 state->ptr = state->start;
2193
2194 if (state->charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00002195 status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002196 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002197#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00002198 status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002199#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002200 }
2201
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002202 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002203 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002204
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002205 if (status == 0 || state->ptr == state->start)
2206 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002207 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002208 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002209
2210 return match;
2211}
2212
2213
2214static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002215scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002216{
2217 SRE_STATE* state = &self->state;
2218 PyObject* match;
2219 int status;
2220
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002221 state_reset(state);
2222
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002223 state->ptr = state->start;
2224
2225 if (state->charsize == 1) {
2226 status = sre_search(state, PatternObject_GetCode(self->pattern));
2227 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002228#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002229 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002230#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002231 }
2232
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002233 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002234 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002235
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002236 if (status == 0 || state->ptr == state->start)
2237 state->start = (void*) ((char*) state->ptr + state->charsize);
2238 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002239 state->start = state->ptr;
2240
2241 return match;
2242}
2243
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002244static PyMethodDef scanner_methods[] = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002245 {"match", (PyCFunction) scanner_match, 0},
2246 {"search", (PyCFunction) scanner_search, 0},
2247 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002248};
2249
2250static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002251scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002252{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002253 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002254
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002255 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
2256 if (res)
2257 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002258
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002259 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002260
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002261 /* attributes */
2262 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002263 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002264 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002265 }
2266
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002267 PyErr_SetString(PyExc_AttributeError, name);
2268 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002269}
2270
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002271statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002272 PyObject_HEAD_INIT(NULL)
2273 0, "SRE_Scanner",
2274 sizeof(ScannerObject), 0,
2275 (destructor)scanner_dealloc, /*tp_dealloc*/
2276 0, /*tp_print*/
2277 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002278};
2279
Guido van Rossumb700df92000-03-31 14:59:30 +00002280static PyMethodDef _functions[] = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002281 {"compile", _compile, 1},
2282 {"getcodesize", sre_codesize, 1},
2283 {"getlower", sre_getlower, 1},
2284 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002285};
2286
2287void
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002288#if defined(WIN32)
Guido van Rossumb700df92000-03-31 14:59:30 +00002289__declspec(dllexport)
2290#endif
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00002291init_sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002292{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002293 /* Patch object types */
2294 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002295 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002296
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002297 Py_InitModule("_" MODULE, _functions);
Guido van Rossumb700df92000-03-31 14:59:30 +00002298}
2299
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002300#endif /* !defined(SRE_RECURSIVE) */