blob: 2ab1703636cdde2f3180ad403cd3ec9842476fc1 [file] [log] [blame]
Guido van Rossumb700df92000-03-31 14:59:30 +00001/* -*- Mode: C; tab-width: 4 -*-
2 *
3 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00004 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00005 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00006 *
7 * partial history:
Fredrik Lundh436c3d52000-06-29 08:58:44 +00008 * 99-10-24 fl created (based on existing template matcher code)
Guido van Rossumb700df92000-03-31 14:59:30 +00009 * 99-11-13 fl added categories, branching, and more (0.2)
10 * 99-11-16 fl some tweaks to compile on non-Windows platforms
11 * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
Fredrik Lundh436c3d52000-06-29 08:58:44 +000012 * 00-02-28 fl tons of changes (not all to the better ;-) (0.4)
13 * 00-03-06 fl first alpha, sort of (0.5)
14 * 00-03-14 fl removed most compatibility stuff (0.6)
15 * 00-05-10 fl towards third alpha (0.8.2)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000016 * 00-05-13 fl added experimental scanner stuff (0.8.3)
Fredrik Lundh436c3d52000-06-29 08:58:44 +000017 * 00-05-27 fl final bug hunt (0.8.4)
18 * 00-06-21 fl less bugs, more taste (0.8.5)
19 * 00-06-25 fl major changes to better deal with nested repeats (0.9)
20 * 00-06-28 fl fixed findall (0.9.1)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000021 * 00-06-29 fl fixed split, added more scanner features (0.9.2)
Fredrik Lundhc13222c2000-07-01 23:49:14 +000022 * 00-06-30 fl added fast search optimization (0.9.3)
Fredrik Lundh0640e112000-06-30 13:55:15 +000023 * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +000024 * 00-07-02 fl added charset optimizations, etc (0.9.5)
Fredrik Lundh6f013982000-07-03 18:44:21 +000025 * 00-07-03 fl store code in pattern object, lookbehind, etc
Guido van Rossumb700df92000-03-31 14:59:30 +000026 *
27 * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
28 *
Guido van Rossumb700df92000-03-31 14:59:30 +000029 * Portions of this engine have been developed in cooperation with
Fredrik Lundh22d25462000-07-01 17:50:59 +000030 * CNRI. Hewlett-Packard provided funding for 2.0 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000031 * other compatibility work.
32 */
33
34#ifndef SRE_RECURSIVE
35
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +000036char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000037
38#include "Python.h"
39
40#include "sre.h"
41
Guido van Rossumb700df92000-03-31 14:59:30 +000042#if defined(HAVE_LIMITS_H)
43#include <limits.h>
44#else
45#define INT_MAX 2147483647
46#endif
47
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000048#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000049
Fredrik Lundh436c3d52000-06-29 08:58:44 +000050/* name of this module, minus the leading underscore */
51#define MODULE "sre"
52
Guido van Rossumb700df92000-03-31 14:59:30 +000053/* defining this one enables tracing */
54#undef DEBUG
55
Fredrik Lundh436c3d52000-06-29 08:58:44 +000056#if PY_VERSION_HEX >= 0x01060000
Fredrik Lundh22d25462000-07-01 17:50:59 +000057/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000058#define HAVE_UNICODE
59#endif
60
Fredrik Lundh29c08be2000-06-29 23:33:12 +000061/* optional features */
62#define USE_FAST_SEARCH
63
Fredrik Lundh80946112000-06-29 18:03:25 +000064#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000065#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000066#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000067/* fastest possible local call under MSVC */
68#define LOCAL(type) static __inline type __fastcall
69#else
Fredrik Lundh29c08be2000-06-29 23:33:12 +000070#define LOCAL(type) static inline type
Guido van Rossumb700df92000-03-31 14:59:30 +000071#endif
72
73/* error codes */
74#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
75#define SRE_ERROR_MEMORY -9 /* out of memory */
76
Fredrik Lundh436c3d52000-06-29 08:58:44 +000077#if defined(DEBUG)
Guido van Rossumb700df92000-03-31 14:59:30 +000078#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000079#else
80#define TRACE(v)
81#endif
82
Fredrik Lundh436c3d52000-06-29 08:58:44 +000083#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
Guido van Rossumb700df92000-03-31 14:59:30 +000084
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000085/* -------------------------------------------------------------------- */
86/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000087
Fredrik Lundh436c3d52000-06-29 08:58:44 +000088/* default character predicates (run sre_chars.py to regenerate tables) */
89
90#define SRE_DIGIT_MASK 1
91#define SRE_SPACE_MASK 2
92#define SRE_LINEBREAK_MASK 4
93#define SRE_ALNUM_MASK 8
94#define SRE_WORD_MASK 16
95
96static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
972, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
980, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
9925, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
10024, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1010, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
10224, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
103
Fredrik Lundhb389df32000-06-29 12:48:37 +0000104static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d52000-06-29 08:58:44 +000010510, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
10627, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
10744, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
10861, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
109108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
110122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
111106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
112120, 121, 122, 123, 124, 125, 126, 127 };
113
Fredrik Lundhb389df32000-06-29 12:48:37 +0000114static unsigned int sre_lower(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000115{
Fredrik Lundhb389df32000-06-29 12:48:37 +0000116 return ((ch) < 128 ? sre_char_lower[ch] : ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000117}
118
119#define SRE_IS_DIGIT(ch)\
120 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
121#define SRE_IS_SPACE(ch)\
122 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
123#define SRE_IS_LINEBREAK(ch)\
124 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
125#define SRE_IS_ALNUM(ch)\
126 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
127#define SRE_IS_WORD(ch)\
128 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000129
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000130/* locale-specific character predicates */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000131
Fredrik Lundhb389df32000-06-29 12:48:37 +0000132static unsigned int sre_lower_locale(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000133{
134 return ((ch) < 256 ? tolower((ch)) : ch);
135}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000136#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
137#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
138#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
139#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
140#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
141
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000142/* unicode-specific character predicates */
143
144#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000145static unsigned int sre_lower_unicode(unsigned int ch)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000146{
147 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
148}
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000149#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
150#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
151#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000152#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000153#define SRE_UNI_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
154#endif
155
Guido van Rossumb700df92000-03-31 14:59:30 +0000156LOCAL(int)
157sre_category(SRE_CODE category, unsigned int ch)
158{
159 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000160
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000161 case SRE_CATEGORY_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000162 return SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000163 case SRE_CATEGORY_NOT_DIGIT:
Guido van Rossumb700df92000-03-31 14:59:30 +0000164 return !SRE_IS_DIGIT(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000165 case SRE_CATEGORY_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000166 return SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000167 case SRE_CATEGORY_NOT_SPACE:
Guido van Rossumb700df92000-03-31 14:59:30 +0000168 return !SRE_IS_SPACE(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000169 case SRE_CATEGORY_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000170 return SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000171 case SRE_CATEGORY_NOT_WORD:
Guido van Rossumb700df92000-03-31 14:59:30 +0000172 return !SRE_IS_WORD(ch);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000173 case SRE_CATEGORY_LINEBREAK:
174 return SRE_IS_LINEBREAK(ch);
175 case SRE_CATEGORY_NOT_LINEBREAK:
176 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000177
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000178 case SRE_CATEGORY_LOC_WORD:
179 return SRE_LOC_IS_WORD(ch);
180 case SRE_CATEGORY_LOC_NOT_WORD:
181 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000182
183#if defined(HAVE_UNICODE)
184 case SRE_CATEGORY_UNI_DIGIT:
185 return SRE_UNI_IS_DIGIT(ch);
186 case SRE_CATEGORY_UNI_NOT_DIGIT:
187 return !SRE_UNI_IS_DIGIT(ch);
188 case SRE_CATEGORY_UNI_SPACE:
189 return SRE_UNI_IS_SPACE(ch);
190 case SRE_CATEGORY_UNI_NOT_SPACE:
191 return !SRE_UNI_IS_SPACE(ch);
192 case SRE_CATEGORY_UNI_WORD:
193 return SRE_UNI_IS_WORD(ch);
194 case SRE_CATEGORY_UNI_NOT_WORD:
195 return !SRE_UNI_IS_WORD(ch);
196 case SRE_CATEGORY_UNI_LINEBREAK:
197 return SRE_UNI_IS_LINEBREAK(ch);
198 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
199 return !SRE_UNI_IS_LINEBREAK(ch);
200#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000201 }
202 return 0;
203}
204
205/* helpers */
206
207LOCAL(int)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000208stack_free(SRE_STATE* state)
Guido van Rossumb700df92000-03-31 14:59:30 +0000209{
210 if (state->stack) {
211 TRACE(("release stack\n"));
212 free(state->stack);
213 state->stack = NULL;
214 }
215 state->stacksize = 0;
216 return 0;
217}
218
219static int /* shouldn't be LOCAL */
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000220stack_extend(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000221{
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000222 SRE_STACK* stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000223 int stacksize;
224
225 /* grow the stack to a suitable size; we need at least lo entries,
226 at most hi entries. if for some reason hi is lower than lo, lo
227 wins */
228
229 stacksize = state->stacksize;
230
231 if (stacksize == 0) {
232 /* create new stack */
233 stacksize = 512;
234 if (stacksize < lo)
235 stacksize = lo;
236 else if (stacksize > hi)
237 stacksize = hi;
238 TRACE(("allocate stack %d\n", stacksize));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000239 stack = malloc(sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000240 } else {
241 /* grow the stack (typically by a factor of two) */
242 while (stacksize < lo)
243 stacksize = 2 * stacksize;
Fredrik Lundh28552902000-07-05 21:14:16 +0000244 /* FIXME: <fl> could trim size if it's much larger than hi,
245 as long it's larger than lo */
Guido van Rossumb700df92000-03-31 14:59:30 +0000246 TRACE(("grow stack to %d\n", stacksize));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000247 stack = realloc(state->stack, sizeof(SRE_STACK) * stacksize);
Guido van Rossumb700df92000-03-31 14:59:30 +0000248 }
249
250 if (!stack) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000251 stack_free(state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000252 return SRE_ERROR_MEMORY;
253 }
254
255 state->stack = stack;
256 state->stacksize = stacksize;
257
258 return 0;
259}
260
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000261/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000262
263#define SRE_CHAR unsigned char
264#define SRE_AT sre_at
265#define SRE_MEMBER sre_member
266#define SRE_MATCH sre_match
267#define SRE_SEARCH sre_search
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000268
269#if defined(HAVE_UNICODE)
270
Guido van Rossumb700df92000-03-31 14:59:30 +0000271#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000272#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000273#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000274
Guido van Rossumb700df92000-03-31 14:59:30 +0000275#undef SRE_SEARCH
276#undef SRE_MATCH
277#undef SRE_MEMBER
278#undef SRE_AT
279#undef SRE_CHAR
280
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000281/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000282
283#define SRE_CHAR Py_UNICODE
284#define SRE_AT sre_uat
285#define SRE_MEMBER sre_umember
286#define SRE_MATCH sre_umatch
287#define SRE_SEARCH sre_usearch
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000288#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000289
290#endif /* SRE_RECURSIVE */
291
292/* -------------------------------------------------------------------- */
293/* String matching engine */
294
295/* the following section is compiled twice, with different character
296 settings */
297
298LOCAL(int)
299SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
300{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000301 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000302
303 int this, that;
304
305 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000306
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000307 case SRE_AT_BEGINNING:
Guido van Rossum29530882000-04-10 17:06:55 +0000308 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000309
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000310 case SRE_AT_BEGINNING_LINE:
311 return ((void*) ptr == state->beginning ||
312 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000313
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000314 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000315 return (((void*) (ptr+1) == state->end &&
316 SRE_IS_LINEBREAK((int) ptr[0])) ||
317 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000318
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000319 case SRE_AT_END_LINE:
320 return ((void*) ptr == state->end ||
321 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000322
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000323 case SRE_AT_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000324 if (state->beginning == state->end)
325 return 0;
326 that = ((void*) ptr > state->beginning) ?
327 SRE_IS_WORD((int) ptr[-1]) : 0;
328 this = ((void*) ptr < state->end) ?
329 SRE_IS_WORD((int) ptr[0]) : 0;
330 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000331
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000332 case SRE_AT_NON_BOUNDARY:
Guido van Rossumb700df92000-03-31 14:59:30 +0000333 if (state->beginning == state->end)
334 return 0;
335 that = ((void*) ptr > state->beginning) ?
336 SRE_IS_WORD((int) ptr[-1]) : 0;
337 this = ((void*) ptr < state->end) ?
338 SRE_IS_WORD((int) ptr[0]) : 0;
339 return this == that;
340 }
341
342 return 0;
343}
344
345LOCAL(int)
Fredrik Lundh0640e112000-06-30 13:55:15 +0000346SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000347{
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000348 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000349
350 int ok = 1;
351
352 for (;;) {
353 switch (*set++) {
354
355 case SRE_OP_NEGATE:
356 ok = !ok;
357 break;
358
359 case SRE_OP_FAILURE:
360 return !ok;
361
362 case SRE_OP_LITERAL:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000363 /* args: <literal> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000364 if (ch == set[0])
Guido van Rossumb700df92000-03-31 14:59:30 +0000365 return ok;
366 set++;
367 break;
368
369 case SRE_OP_RANGE:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000370 /* args: <lower> <upper> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000371 if (set[0] <= ch && ch <= set[1])
Guido van Rossumb700df92000-03-31 14:59:30 +0000372 return ok;
373 set += 2;
374 break;
375
Fredrik Lundh3562f112000-07-02 12:00:07 +0000376 case SRE_OP_CHARSET:
377 /* args: <bitmap> (16 bits per code word) */
378 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
379 return ok;
380 set += 16;
381 break;
382
Guido van Rossumb700df92000-03-31 14:59:30 +0000383 case SRE_OP_CATEGORY:
Fredrik Lundhc13222c2000-07-01 23:49:14 +0000384 /* args: <category> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000385 if (sre_category(set[0], (int) ch))
386 return ok;
387 set += 1;
388 break;
389
390 default:
Fredrik Lundh80946112000-06-29 18:03:25 +0000391 /* internal error -- there's not much we can do about it
392 here, so let's just pretend it didn't match... */
Guido van Rossumb700df92000-03-31 14:59:30 +0000393 return 0;
394 }
395 }
396}
397
398LOCAL(int)
399SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
400{
401 /* check if string matches the given pattern. returns -1 for
402 error, 0 for failure, and 1 for success */
403
404 SRE_CHAR* end = state->end;
405 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000406 int stack;
Guido van Rossumb700df92000-03-31 14:59:30 +0000407 int stackbase;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000408 int lastmark;
Guido van Rossumb700df92000-03-31 14:59:30 +0000409 int i, count;
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000410 SRE_STACK* sp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000411
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000412 /* FIXME: this is a hack! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000413 void* mark_copy[SRE_MARK_SIZE];
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000414 void* mark = NULL;
415
416 TRACE(("%8d: enter\n", PTR(ptr)));
417
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000418 if (pattern[0] == SRE_OP_INFO) {
419 /* optimization info block */
420 /* args: <1=skip> <2=flags> <3=min> ... */
421 if (pattern[3] && (end - ptr) < pattern[3]) {
422 TRACE(("reject (got %d chars, need %d)\n",
423 (end - ptr), pattern[3]));
424 return 0;
425 }
426 pattern += pattern[1] + 1;
427 }
428
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000429 stackbase = stack = state->stackbase;
430 lastmark = state->lastmark;
431
432 retry:
Guido van Rossumb700df92000-03-31 14:59:30 +0000433
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000434 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000435
436 switch (*pattern++) {
437
438 case SRE_OP_FAILURE:
439 /* immediate failure */
440 TRACE(("%8d: failure\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000441 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000442
443 case SRE_OP_SUCCESS:
444 /* end of pattern */
445 TRACE(("%8d: success\n", PTR(ptr)));
446 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000447 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000448
449 case SRE_OP_AT:
450 /* match at given position */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000451 /* args: <at> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000452 TRACE(("%8d: position %d\n", PTR(ptr), *pattern));
Guido van Rossumb700df92000-03-31 14:59:30 +0000453 if (!SRE_AT(state, ptr, *pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000454 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000455 pattern++;
456 break;
457
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000458 case SRE_OP_CATEGORY:
459 /* match at given category */
460 /* args: <category> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000461 TRACE(("%8d: category %d [category %d]\n", PTR(ptr),
462 *ptr, *pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000463 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
464 goto failure;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000465 TRACE(("%8d: category ok\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000466 pattern++;
467 ptr++;
468 break;
469
Guido van Rossumb700df92000-03-31 14:59:30 +0000470 case SRE_OP_LITERAL:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000471 /* match literal string */
Guido van Rossumb700df92000-03-31 14:59:30 +0000472 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000473 TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));
474 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000475 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000476 pattern++;
477 ptr++;
478 break;
479
480 case SRE_OP_NOT_LITERAL:
481 /* match anything that is not literal character */
482 /* args: <code> */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000483 TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));
484 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000485 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000486 pattern++;
487 ptr++;
488 break;
489
490 case SRE_OP_ANY:
491 /* match anything */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000492 TRACE(("%8d: anything\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +0000493 if (ptr >= end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000494 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000495 ptr++;
496 break;
497
498 case SRE_OP_IN:
499 /* match set member (or non_member) */
500 /* args: <skip> <set> */
501 TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
502 if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000503 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000504 pattern += pattern[0];
505 ptr++;
506 break;
507
508 case SRE_OP_GROUP:
509 /* match backreference */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000510 TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000511 i = pattern[0];
512 {
Guido van Rossumb700df92000-03-31 14:59:30 +0000513 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
514 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
515 if (!p || !e || e < p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000516 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000517 while (p < e) {
518 if (ptr >= end || *ptr != *p)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000519 goto failure;
520 p++; ptr++;
521 }
522 }
523 pattern++;
524 break;
525
526 case SRE_OP_GROUP_IGNORE:
527 /* match backreference */
528 TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
529 i = pattern[0];
530 {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000531 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
532 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000533 if (!p || !e || e < p)
534 goto failure;
535 while (p < e) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000536 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000537 state->lower(*ptr) != state->lower(*p))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000538 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000539 p++; ptr++;
540 }
541 }
542 pattern++;
543 break;
544
545 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000546 TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000547 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000548 state->lower(*ptr) != state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000549 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000550 pattern++;
551 ptr++;
552 break;
553
554 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000555 TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000556 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000557 state->lower(*ptr) == state->lower(*pattern))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000558 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000559 pattern++;
560 ptr++;
561 break;
562
563 case SRE_OP_IN_IGNORE:
564 TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
565 if (ptr >= end
Fredrik Lundh0640e112000-06-30 13:55:15 +0000566 || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr)))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000567 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000568 pattern += pattern[0];
569 ptr++;
570 break;
571
572 case SRE_OP_MARK:
573 /* set mark */
574 /* args: <mark> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000575 TRACE(("%8d: set mark %d\n", PTR(ptr), pattern[0]));
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000576 if (state->lastmark < pattern[0]+1)
577 state->lastmark = pattern[0]+1;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000578 if (!mark) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000579 mark = mark_copy;
580 memcpy(mark, state->mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000581 }
582 state->mark[pattern[0]] = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000583 pattern++;
584 break;
585
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000586 case SRE_OP_INDEX:
587 /* set index */
588 /* args: <index> */
589 TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
Fredrik Lundh6f013982000-07-03 18:44:21 +0000590 state->lastindex = pattern[0];
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000591 pattern++;
592 break;
593
Guido van Rossumb700df92000-03-31 14:59:30 +0000594 case SRE_OP_JUMP:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000595 case SRE_OP_INFO:
Guido van Rossumb700df92000-03-31 14:59:30 +0000596 /* jump forward */
597 /* args: <skip> */
598 TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
599 pattern += pattern[0];
600 break;
601
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000602 case SRE_OP_ASSERT:
603 /* assert subpattern */
Fredrik Lundh6f013982000-07-03 18:44:21 +0000604 /* args: <skip> <back> <pattern> */
605 TRACE(("%8d: assert subpattern %d\n", PTR(ptr), pattern[1]));
606 state->ptr = ptr - pattern[1];
607 if (state->ptr < state->beginning)
608 goto failure;
609 i = SRE_MATCH(state, pattern + 2);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000610 if (i < 0)
611 return i;
612 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000613 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000614 pattern += pattern[0];
Guido van Rossumb700df92000-03-31 14:59:30 +0000615 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000616
617 case SRE_OP_ASSERT_NOT:
618 /* assert not subpattern */
619 /* args: <skip> <pattern> */
Fredrik Lundh6f013982000-07-03 18:44:21 +0000620 TRACE(("%8d: assert not subpattern %d\n", PTR(ptr), pattern[1]));
621 state->ptr = ptr - pattern[1];
622 if (state->ptr < state->beginning)
623 goto failure;
624 i = SRE_MATCH(state, pattern + 2);
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000625 if (i < 0)
626 return i;
627 if (i)
628 goto failure;
629 pattern += pattern[0];
630 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000631
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000632#if 0
Guido van Rossumb700df92000-03-31 14:59:30 +0000633 case SRE_OP_MAX_REPEAT_ONE:
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000634 /* match repeated sequence (maximizing regexp) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000635
636 /* this operator only works if the repeated item is
637 exactly one character wide, and we're not already
638 collecting backtracking points. for other cases,
639 use the MAX_REPEAT operator instead */
640
Guido van Rossumb700df92000-03-31 14:59:30 +0000641 /* args: <skip> <min> <max> <step> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000642 TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
643 pattern[1], pattern[2]));
644
645 count = 0;
646
647 if (pattern[3] == SRE_OP_ANY) {
648 /* repeated wildcard. skip to the end of the target
649 string, and backtrack from there */
650 /* FIXME: must look for line endings */
651 if (ptr + pattern[1] > end)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000652 goto failure; /* cannot match */
Guido van Rossumb700df92000-03-31 14:59:30 +0000653 count = pattern[2];
654 if (count > end - ptr)
655 count = end - ptr;
656 ptr += count;
657
658 } else if (pattern[3] == SRE_OP_LITERAL) {
659 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000660 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000661 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000662 if (ptr >= end || (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000663 break;
664 ptr++;
665 count++;
666 }
667
668 } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
669 /* repeated literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000670 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000671 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000672 if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000673 break;
674 ptr++;
675 count++;
676 }
677
678 } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
679 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000680 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000681 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000682 if (ptr >= end || (SRE_CODE) ptr[0] == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000683 break;
684 ptr++;
685 count++;
686 }
687
688 } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
689 /* repeated non-literal */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000690 SRE_CODE chr = pattern[4];
Guido van Rossumb700df92000-03-31 14:59:30 +0000691 while (count < (int) pattern[2]) {
Fredrik Lundh0640e112000-06-30 13:55:15 +0000692 if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
Guido van Rossumb700df92000-03-31 14:59:30 +0000693 break;
694 ptr++;
695 count++;
696 }
697
698 } else if (pattern[3] == SRE_OP_IN) {
699 /* repeated set */
700 while (count < (int) pattern[2]) {
701 if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
702 break;
703 ptr++;
704 count++;
705 }
706
707 } else {
708 /* repeated single character pattern */
709 state->ptr = ptr;
710 while (count < (int) pattern[2]) {
711 i = SRE_MATCH(state, pattern + 3);
712 if (i < 0)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000713 return i;
714 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000715 break;
716 count++;
717 }
718 state->ptr = ptr;
719 ptr += count;
720 }
721
722 /* when we arrive here, count contains the number of
723 matches, and ptr points to the tail of the target
724 string. check if the rest of the pattern matches, and
725 backtrack if not. */
726
Guido van Rossumb700df92000-03-31 14:59:30 +0000727 TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
728
729 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000730 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000731
732 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
733 /* tail is empty. we're finished */
734 TRACE(("%8d: tail is empty\n", PTR(ptr)));
735 state->ptr = ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000736 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000737
738 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000739 /* tail starts with a literal. skip positions where
740 the rest of the pattern cannot possibly match */
Fredrik Lundh0640e112000-06-30 13:55:15 +0000741 SRE_CODE chr = pattern[pattern[0]+1];
Guido van Rossumb700df92000-03-31 14:59:30 +0000742 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
743 for (;;) {
744 TRACE(("%8d: scan for tail match\n", PTR(ptr)));
745 while (count >= (int) pattern[1] &&
746 (ptr >= end || *ptr != chr)) {
747 ptr--;
748 count--;
749 }
750 TRACE(("%8d: check tail\n", PTR(ptr)));
751 if (count < (int) pattern[1])
752 break;
753 state->ptr = ptr;
754 i = SRE_MATCH(state, pattern + pattern[0]);
755 if (i > 0) {
756 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000757 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000758 }
759 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
760 ptr--;
761 count--;
762 }
763
764 } else {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000765 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +0000766 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
767 while (count >= (int) pattern[1]) {
768 state->ptr = ptr;
769 i = SRE_MATCH(state, pattern + pattern[0]);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000770 if (i < 0)
771 return i;
772 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000773 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000774 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000775 }
776 TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
777 ptr--;
778 count--;
779 }
780 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000781 goto failure;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000782#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000783
784 case SRE_OP_MAX_REPEAT:
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000785 /* match repeated sequence (maximizing regexp) */
786 /* args: <skip> <1=min> <2=max> <3=save> <4=item> */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000787
788 TRACE(("%8d: max repeat (%d %d)\n", PTR(ptr),
Guido van Rossumb700df92000-03-31 14:59:30 +0000789 pattern[1], pattern[2]));
790
791 count = 0;
792 state->ptr = ptr;
793
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000794 /* match minimum number of items */
795 while (count < (int) pattern[1]) {
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000796 i = SRE_MATCH(state, pattern + 4);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000797 if (i < 0)
798 return i;
799 if (!i)
800 goto failure;
801 if (state->ptr == ptr) {
802 /* if the match was successful but empty, set the
803 count to max and terminate the scanning loop */
804 count = (int) pattern[2];
805 break;
806 }
807 count++;
808 ptr = state->ptr;
809 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000810
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000811 TRACE(("%8d: found %d leading items\n", PTR(ptr), count));
Guido van Rossumb700df92000-03-31 14:59:30 +0000812
813 if (count < (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000814 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000815
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000816 /* match maximum number of items, pushing alternate end
817 points to the stack */
Guido van Rossumb700df92000-03-31 14:59:30 +0000818
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000819 while (pattern[2] == 65535 || count < (int) pattern[2]) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000820 /* this position was valid; add it to the retry
821 stack */
822 if (stack >= state->stacksize) {
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000823 i = stack_extend(state, stack + 1,
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000824 stackbase + pattern[2]);
825 if (i < 0)
826 return i; /* out of memory */
827 }
Fredrik Lundh28552902000-07-05 21:14:16 +0000828 TRACE(("%8d: stack[%d]\n", PTR(ptr), stack));
829 TRACE((" ptr %d mark %d %d %d\n",
830 PTR(ptr), pattern[3], PTR(mark0), PTR(mark1)));
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000831 sp = state->stack + stack;
832 sp->ptr = ptr;
833 sp->pattern = pattern + pattern[0];
834 sp->mark = pattern[3];
835 if (pattern[3] != 65535) {
Fredrik Lundh28552902000-07-05 21:14:16 +0000836 sp->mark0 = state->mark[pattern[3]];
837 sp->mark1 = state->mark[pattern[3]+1];
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000838 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000839 stack++;
Fredrik Lundh28552902000-07-05 21:14:16 +0000840 state->stackbase = stack;
841 i = SRE_MATCH(state, pattern + 4);
842 state->stackbase = stackbase;
843 if (i < 0)
844 return i;
845 if (!i)
846 break;
847 if (state->ptr == ptr) {
848 count = (int) pattern[2];
849 break;
850 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000851 /* move forward */
852 ptr = state->ptr;
853 count++;
Guido van Rossumb700df92000-03-31 14:59:30 +0000854 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000855
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000856 /* when we get here, count is the number of successful
857 matches, and ptr points to the tail. */
Guido van Rossumb700df92000-03-31 14:59:30 +0000858
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000859 TRACE(("%8d: skip +%d\n", PTR(ptr), pattern[0]));
860
861 pattern += pattern[0];
862 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000863
864 case SRE_OP_MIN_REPEAT:
865 /* match repeated sequence (minimizing regexp) */
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000866 /* args: <skip> <1=min> <2=max> <3=save> <4=item> */
867
Guido van Rossumb700df92000-03-31 14:59:30 +0000868 TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
869 pattern[1], pattern[2]));
870 count = 0;
871 state->ptr = ptr;
872 /* match minimum number of items */
873 while (count < (int) pattern[1]) {
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000874 i = SRE_MATCH(state, pattern + 4);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000875 if (i < 0)
876 return i;
877 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000878 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000879 count++;
880 }
881 /* move forward until the tail matches. */
882 while (count <= (int) pattern[2]) {
883 ptr = state->ptr;
884 i = SRE_MATCH(state, pattern + pattern[0]);
885 if (i > 0) {
886 TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000887 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000888 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000889 state->ptr = ptr; /* backtrack */
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000890 i = SRE_MATCH(state, pattern + 4);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000891 if (i < 0)
892 return i;
893 if (!i)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000894 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000895 count++;
896 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000897 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000898
Guido van Rossumb700df92000-03-31 14:59:30 +0000899 case SRE_OP_BRANCH:
900 /* match one of several subpatterns */
901 /* format: <branch> <size> <head> ... <null> <tail> */
902 TRACE(("%8d: branch\n", PTR(ptr)));
903 while (*pattern) {
904 if (pattern[1] != SRE_OP_LITERAL ||
Fredrik Lundh0640e112000-06-30 13:55:15 +0000905 (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000906 TRACE(("%8d: branch check\n", PTR(ptr)));
907 state->ptr = ptr;
908 i = SRE_MATCH(state, pattern + 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000909 if (i < 0)
910 return i;
911 if (i) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000912 TRACE(("%8d: branch succeeded\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000913 goto success;
Guido van Rossumb700df92000-03-31 14:59:30 +0000914 }
915 }
916 pattern += *pattern;
917 }
918 TRACE(("%8d: branch failed\n", PTR(ptr)));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000919 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000920
921 case SRE_OP_REPEAT:
922 /* TEMPLATE: match repeated sequence (no backtracking) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000923 /* args: <skip> <min> <max> */
Guido van Rossumb700df92000-03-31 14:59:30 +0000924 TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
925 count = 0;
926 state->ptr = ptr;
927 while (count < (int) pattern[2]) {
928 i = SRE_MATCH(state, pattern + 3);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000929 if (i < 0)
930 return i;
931 if (!i)
Guido van Rossumb700df92000-03-31 14:59:30 +0000932 break;
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000933 if (state->ptr == ptr) {
934 count = (int) pattern[2];
935 break;
936 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000937 count++;
938 }
939 if (count <= (int) pattern[1])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000940 goto failure;
Guido van Rossumb700df92000-03-31 14:59:30 +0000941 TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
942 pattern += pattern[0];
943 ptr = state->ptr;
944 break;
945
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000946 default:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000947 TRACE(("%8d: unknown opcode %d\n", PTR(ptr), pattern[-1]));
Guido van Rossumb700df92000-03-31 14:59:30 +0000948 return SRE_ERROR_ILLEGAL;
949 }
950 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000951
952 failure:
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000953 TRACE(("%8d: leave (failure)\n", PTR(ptr)));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000954 if (stack-- > stackbase) {
Fredrik Lundh28552902000-07-05 21:14:16 +0000955 TRACE(("%8d: pop stack[%d]\n", stack));
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000956 sp = state->stack + stack;
957 ptr = sp->ptr;
958 pattern = sp->pattern;
959 if (sp->mark != 65535) {
960 state->mark[sp->mark] = sp->mark0;
961 state->mark[sp->mark+1] = sp->mark1;
962 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000963 TRACE(("%8d: retry (%d)\n", PTR(ptr), stack));
964 goto retry;
965 }
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000966 state->lastmark = lastmark;
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000967 state->stackbase = stackbase;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000968 if (mark)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000969 memcpy(state->mark, mark, state->lastmark*sizeof(void*));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000970 return 0;
971
972 success:
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000973 TRACE(("%8d: leave (success)\n", PTR(ptr)));
974 state->stackbase = stackbase;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000975 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000976}
977
978LOCAL(int)
979SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
980{
981 SRE_CHAR* ptr = state->start;
982 SRE_CHAR* end = state->end;
983 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +0000984 int prefix_len = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +0000985 SRE_CODE* prefix = NULL;
986 SRE_CODE* charset = NULL;
987 SRE_CODE* overlap = NULL;
988 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000989
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000990 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000991 /* optimization info block */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000992 /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
993
994 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000995
996 if (pattern[3] > 0) {
997 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +0000998 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000999 end -= pattern[3]-1;
1000 if (end <= ptr)
1001 end = ptr+1;
1002 }
1003
Fredrik Lundh3562f112000-07-02 12:00:07 +00001004 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001005 /* pattern starts with a known prefix */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001006 prefix_len = pattern[5];
1007 prefix = pattern + 6;
1008 overlap = prefix + prefix_len - 1;
1009 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001010 /* pattern starts with a character from a known set */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001011 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001012
1013 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001014 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001015
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001016#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001017 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001018 /* pattern starts with a known prefix. use the overlap
1019 table to skip forward as fast as we possibly can */
1020 int i = 0;
1021 end = state->end;
1022 while (ptr < end) {
1023 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001024 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001025 if (!i)
1026 break;
1027 else
1028 i = overlap[i];
1029 } else {
1030 if (++i == prefix_len) {
1031 /* found a potential match */
1032 TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
1033 state->start = ptr - prefix_len + 1;
1034 state->ptr = ptr + 1;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001035 if (flags & SRE_INFO_LITERAL)
1036 return 1; /* we got all of it */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001037 status = SRE_MATCH(state, pattern + 2*prefix_len);
1038 if (status != 0)
1039 return status;
1040 /* close but no cigar -- try again */
1041 i = overlap[i];
1042 }
1043 break;
1044 }
1045
1046 }
1047 ptr++;
1048 }
1049 return 0;
1050 }
1051#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001052
Fredrik Lundh3562f112000-07-02 12:00:07 +00001053 if (pattern[0] == SRE_OP_LITERAL) {
1054 /* pattern starts with a literal character. this is used
1055 for short prefixes, and if fast search is disabled */
Fredrik Lundh0640e112000-06-30 13:55:15 +00001056 SRE_CODE chr = pattern[1];
Guido van Rossumb700df92000-03-31 14:59:30 +00001057 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001058 while (ptr < end && (SRE_CODE) ptr[0] != chr)
Guido van Rossumb700df92000-03-31 14:59:30 +00001059 ptr++;
1060 if (ptr == end)
1061 return 0;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001062 TRACE(("%8d: === SEARCH === literal\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001063 state->start = ptr;
1064 state->ptr = ++ptr;
1065 status = SRE_MATCH(state, pattern + 2);
1066 if (status != 0)
1067 break;
1068 }
Fredrik Lundh3562f112000-07-02 12:00:07 +00001069 } else if (charset) {
1070 /* pattern starts with a character from a known set */
1071 for (;;) {
1072 while (ptr < end && !SRE_MEMBER(charset, ptr[0]))
1073 ptr++;
1074 if (ptr == end)
1075 return 0;
1076 TRACE(("%8d: === SEARCH === charset\n", PTR(ptr)));
1077 state->start = ptr;
1078 state->ptr = ptr;
1079 status = SRE_MATCH(state, pattern);
1080 if (status != 0)
1081 break;
1082 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001083 } else
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001084 /* general case */
Guido van Rossumb700df92000-03-31 14:59:30 +00001085 while (ptr <= end) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001086 TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
Guido van Rossumb700df92000-03-31 14:59:30 +00001087 state->start = state->ptr = ptr++;
1088 status = SRE_MATCH(state, pattern);
1089 if (status != 0)
1090 break;
1091 }
1092
1093 return status;
1094}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001095
Guido van Rossumb700df92000-03-31 14:59:30 +00001096
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001097#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001098
1099/* -------------------------------------------------------------------- */
1100/* factories and destructors */
1101
1102/* see sre.h for object declarations */
1103
1104staticforward PyTypeObject Pattern_Type;
1105staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001106staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001107
1108static PyObject *
1109_compile(PyObject* self_, PyObject* args)
1110{
1111 /* "compile" pattern descriptor to pattern object */
1112
1113 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001114 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001115
1116 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001117 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001118 PyObject* code;
1119 int groups = 0;
1120 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001121 PyObject* indexgroup = NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001122 if (!PyArg_ParseTuple(args, "OiO|iOO", &pattern, &flags, &code,
Fredrik Lundhc2301732000-07-02 22:25:39 +00001123 &groups, &groupindex, &indexgroup))
Guido van Rossumb700df92000-03-31 14:59:30 +00001124 return NULL;
1125
Fredrik Lundh6f013982000-07-03 18:44:21 +00001126 code = PySequence_Fast(code, "code argument must be a sequence");
1127 if (!code)
Guido van Rossumb700df92000-03-31 14:59:30 +00001128 return NULL;
1129
Fredrik Lundh6f013982000-07-03 18:44:21 +00001130 n = PySequence_Length(code);
1131
1132 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n);
1133 if (!self) {
1134 Py_DECREF(code);
1135 return NULL;
1136 }
1137
1138 for (i = 0; i < n; i++) {
1139 PyObject *o = PySequence_Fast_GET_ITEM(code, i);
1140 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1141 }
1142
1143 Py_DECREF(code);
1144
1145 if (PyErr_Occurred())
1146 return NULL;
1147
Guido van Rossumb700df92000-03-31 14:59:30 +00001148 Py_INCREF(pattern);
1149 self->pattern = pattern;
1150
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001151 self->flags = flags;
1152
Guido van Rossumb700df92000-03-31 14:59:30 +00001153 self->groups = groups;
1154
1155 Py_XINCREF(groupindex);
1156 self->groupindex = groupindex;
1157
Fredrik Lundhc2301732000-07-02 22:25:39 +00001158 Py_XINCREF(indexgroup);
1159 self->indexgroup = indexgroup;
1160
Guido van Rossumb700df92000-03-31 14:59:30 +00001161 return (PyObject*) self;
1162}
1163
1164static PyObject *
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001165sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001166{
1167 return Py_BuildValue("i", sizeof(SRE_CODE));
1168}
1169
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001170static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001171sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001172{
1173 int character, flags;
1174 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1175 return NULL;
1176 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001177 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001178#if defined(HAVE_UNICODE)
1179 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001180 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001181#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001182 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001183}
1184
Guido van Rossumb700df92000-03-31 14:59:30 +00001185LOCAL(PyObject*)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001186state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001187{
1188 /* prepare state object */
1189
1190 PyBufferProcs *buffer;
1191 int i, count;
1192 void* ptr;
1193
1194 PyObject* string;
1195 int start = 0;
1196 int end = INT_MAX;
1197 if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
1198 return NULL;
1199
1200 /* get pointer to string buffer */
1201 buffer = string->ob_type->tp_as_buffer;
1202 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1203 buffer->bf_getsegcount(string, NULL) != 1) {
1204 PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
1205 return NULL;
1206 }
1207
1208 /* determine buffer size */
1209 count = buffer->bf_getreadbuffer(string, 0, &ptr);
1210 if (count < 0) {
1211 /* sanity check */
1212 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1213 return NULL;
1214 }
1215
1216 /* determine character size */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001217#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001218 state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001219#else
1220 state->charsize = 1;
1221#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001222
1223 count /= state->charsize;
1224
1225 /* adjust boundaries */
1226 if (start < 0)
1227 start = 0;
1228 else if (start > count)
1229 start = count;
1230
1231 if (end < 0)
1232 end = 0;
1233 else if (end > count)
1234 end = count;
1235
1236 state->beginning = ptr;
1237
1238 state->start = (void*) ((char*) ptr + start * state->charsize);
1239 state->end = (void*) ((char*) ptr + end * state->charsize);
1240
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001241 state->lastmark = 0;
1242
Guido van Rossumb700df92000-03-31 14:59:30 +00001243 /* FIXME: dynamic! */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001244 for (i = 0; i < SRE_MARK_SIZE; i++)
Guido van Rossumb700df92000-03-31 14:59:30 +00001245 state->mark[i] = NULL;
1246
Fredrik Lundh6f013982000-07-03 18:44:21 +00001247 state->lastindex = -1;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001248
Guido van Rossumb700df92000-03-31 14:59:30 +00001249 state->stack = NULL;
1250 state->stackbase = 0;
1251 state->stacksize = 0;
1252
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001253 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001254 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001255#if defined(HAVE_UNICODE)
1256 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001257 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001258#endif
1259 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001260 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001261
Guido van Rossumb700df92000-03-31 14:59:30 +00001262 return string;
1263}
1264
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001265LOCAL(void)
1266state_fini(SRE_STATE* state)
1267{
1268 stack_free(state);
1269}
1270
1271LOCAL(PyObject*)
1272state_getslice(SRE_STATE* state, int index, PyObject* string)
1273{
1274 index = (index - 1) * 2;
1275
1276 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
1277 Py_INCREF(Py_None);
1278 return Py_None;
1279 }
1280
1281 return PySequence_GetSlice(
1282 string,
1283 ((char*)state->mark[index] - (char*)state->beginning) /
1284 state->charsize,
1285 ((char*)state->mark[index+1] - (char*)state->beginning) /
1286 state->charsize
1287 );
1288}
1289
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001290static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001291pattern_new_match(PatternObject* pattern, SRE_STATE* state,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001292 PyObject* string, int status)
1293{
1294 /* create match object (from state object) */
1295
1296 MatchObject* match;
1297 int i, j;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001298 char* base;
1299 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001300
1301 if (status > 0) {
1302
1303 /* create match object (with room for extra group marks) */
Fredrik Lundh6f013982000-07-03 18:44:21 +00001304 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
1305 2*(pattern->groups+1));
1306 if (!match)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001307 return NULL;
1308
1309 Py_INCREF(pattern);
1310 match->pattern = pattern;
1311
1312 Py_INCREF(string);
1313 match->string = string;
1314
1315 match->groups = pattern->groups+1;
1316
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001317 base = (char*) state->beginning;
1318 n = state->charsize;
1319
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001320 /* group zero */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001321 match->mark[0] = ((char*) state->start - base) / n;
1322 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001323
1324 /* fill in the rest of the groups */
1325 for (i = j = 0; i < pattern->groups; i++, j+=2)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001326 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1327 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1328 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001329 } else
1330 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1331
Fredrik Lundh6f013982000-07-03 18:44:21 +00001332 match->lastindex = state->lastindex;
1333
1334 match->pos = ((char*) state->start - base) / n;
1335 match->endpos = ((char*) state->end - base) / n;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001336
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001337 return (PyObject*) match;
1338
1339 } else if (status < 0) {
1340
1341 /* internal error */
1342 PyErr_SetString(
1343 PyExc_RuntimeError, "internal error in regular expression engine"
1344 );
1345 return NULL;
1346
1347 }
1348
1349 Py_INCREF(Py_None);
1350 return Py_None;
1351}
1352
1353static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001354pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001355{
1356 /* create search state object */
1357
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001358 ScannerObject* self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001359 PyObject* string;
1360
1361 /* create match object (with room for extra group marks) */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001362 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001363 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001364 return NULL;
1365
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001366 string = state_init(&self->state, pattern, args);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001367 if (!string) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00001368 PyObject_Del(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001369 return NULL;
1370 }
1371
1372 Py_INCREF(pattern);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001373 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001374
1375 Py_INCREF(string);
1376 self->string = string;
1377
1378 return (PyObject*) self;
1379}
1380
Guido van Rossumb700df92000-03-31 14:59:30 +00001381static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001382pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001383{
Guido van Rossumb700df92000-03-31 14:59:30 +00001384 Py_XDECREF(self->pattern);
1385 Py_XDECREF(self->groupindex);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001386 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001387}
1388
1389static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001390pattern_match(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001391{
1392 SRE_STATE state;
1393 PyObject* string;
1394 int status;
1395
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001396 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001397 if (!string)
1398 return NULL;
1399
1400 state.ptr = state.start;
1401
1402 if (state.charsize == 1) {
1403 status = sre_match(&state, PatternObject_GetCode(self));
1404 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001405#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001406 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001407#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001408 }
1409
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001410 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001411
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001412 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001413}
1414
1415static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001416pattern_search(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001417{
1418 SRE_STATE state;
1419 PyObject* string;
1420 int status;
1421
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001422 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001423 if (!string)
1424 return NULL;
1425
1426 if (state.charsize == 1) {
1427 status = sre_search(&state, PatternObject_GetCode(self));
1428 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001429#if defined(HAVE_UNICODE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001430 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001431#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001432 }
1433
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001434 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001435
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001436 return pattern_new_match(self, &state, string, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001437}
1438
1439static PyObject*
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001440call(char* function, PyObject* args)
1441{
1442 PyObject* name;
1443 PyObject* module;
1444 PyObject* func;
1445 PyObject* result;
1446
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001447 name = PyString_FromString(MODULE);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001448 if (!name)
1449 return NULL;
1450 module = PyImport_Import(name);
1451 Py_DECREF(name);
1452 if (!module)
1453 return NULL;
1454 func = PyObject_GetAttrString(module, function);
1455 Py_DECREF(module);
1456 if (!func)
1457 return NULL;
1458 result = PyObject_CallObject(func, args);
1459 Py_DECREF(func);
1460 Py_DECREF(args);
1461 return result;
1462}
1463
1464static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001465pattern_sub(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001466{
1467 PyObject* template;
1468 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001469 PyObject* count = Py_False; /* zero */
1470 if (!PyArg_ParseTuple(args, "OO|O", &template, &string, &count))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001471 return NULL;
1472
1473 /* delegate to Python code */
1474 return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
1475}
1476
1477static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001478pattern_subn(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001479{
1480 PyObject* template;
1481 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001482 PyObject* count = Py_False; /* zero */
1483 if (!PyArg_ParseTuple(args, "OO|O", &template, &string, &count))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001484 return NULL;
1485
1486 /* delegate to Python code */
1487 return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
1488}
1489
1490static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001491pattern_split(PatternObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001492{
1493 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001494 PyObject* maxsplit = Py_False; /* zero */
1495 if (!PyArg_ParseTuple(args, "O|O", &string, &maxsplit))
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001496 return NULL;
1497
1498 /* delegate to Python code */
1499 return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
1500}
1501
1502static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001503pattern_findall(PatternObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001504{
Guido van Rossumb700df92000-03-31 14:59:30 +00001505 SRE_STATE state;
1506 PyObject* string;
1507 PyObject* list;
1508 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001509 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001510
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001511 string = state_init(&state, self, args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001512 if (!string)
1513 return NULL;
1514
1515 list = PyList_New(0);
1516
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001517 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001518
1519 PyObject* item;
1520
1521 state.ptr = state.start;
1522
1523 if (state.charsize == 1) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001524 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +00001525 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001526#if defined(HAVE_UNICODE)
1527 status = sre_usearch(&state, PatternObject_GetCode(self));
1528#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001529 }
1530
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001531 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001532
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001533 /* don't bother to build a match object */
1534 switch (self->groups) {
1535 case 0:
1536 item = PySequence_GetSlice(
1537 string,
1538 ((char*) state.start - (char*) state.beginning) /
1539 state.charsize,
1540 ((char*) state.ptr - (char*) state.beginning) /
1541 state.charsize);
1542 if (!item)
1543 goto error;
1544 break;
1545 case 1:
1546 item = state_getslice(&state, 1, string);
1547 if (!item)
1548 goto error;
1549 break;
1550 default:
1551 item = PyTuple_New(self->groups);
1552 if (!item)
1553 goto error;
1554 for (i = 0; i < self->groups; i++) {
1555 PyObject* o = state_getslice(&state, i+1, string);
1556 if (!o) {
1557 Py_DECREF(item);
1558 goto error;
1559 }
1560 PyTuple_SET_ITEM(item, i, o);
1561 }
1562 break;
1563 }
1564
1565 if (PyList_Append(list, item) < 0) {
1566 Py_DECREF(item);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001567 goto error;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001568 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001569
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001570 if (state.ptr == state.start)
1571 state.start = (void*) ((char*) state.ptr + state.charsize);
1572 else
1573 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001574
1575 } else {
1576
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001577 if (status == 0)
1578 break;
1579
Guido van Rossumb700df92000-03-31 14:59:30 +00001580 /* internal error */
1581 PyErr_SetString(
1582 PyExc_RuntimeError,
1583 "internal error in regular expression engine"
1584 );
1585 goto error;
1586
1587 }
1588 }
1589
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001590 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001591 return list;
1592
1593error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001594 Py_DECREF(list);
1595 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001596 return NULL;
1597
1598}
1599
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001600static PyMethodDef pattern_methods[] = {
1601 {"match", (PyCFunction) pattern_match, 1},
1602 {"search", (PyCFunction) pattern_search, 1},
1603 {"sub", (PyCFunction) pattern_sub, 1},
1604 {"subn", (PyCFunction) pattern_subn, 1},
1605 {"split", (PyCFunction) pattern_split, 1},
1606 {"findall", (PyCFunction) pattern_findall, 1},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001607 /* experimental */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001608 {"scanner", (PyCFunction) pattern_scanner, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001609 {NULL, NULL}
1610};
1611
1612static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001613pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001614{
1615 PyObject* res;
1616
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001617 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001618
1619 if (res)
1620 return res;
1621
1622 PyErr_Clear();
1623
1624 /* attributes */
1625 if (!strcmp(name, "pattern")) {
1626 Py_INCREF(self->pattern);
1627 return self->pattern;
1628 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001629
1630 if (!strcmp(name, "flags"))
1631 return Py_BuildValue("i", self->flags);
1632
Fredrik Lundh01016fe2000-06-30 00:27:46 +00001633 if (!strcmp(name, "groups"))
1634 return Py_BuildValue("i", self->groups);
1635
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001636 if (!strcmp(name, "groupindex") && self->groupindex) {
1637 Py_INCREF(self->groupindex);
1638 return self->groupindex;
1639 }
1640
Guido van Rossumb700df92000-03-31 14:59:30 +00001641 PyErr_SetString(PyExc_AttributeError, name);
1642 return NULL;
1643}
1644
1645statichere PyTypeObject Pattern_Type = {
1646 PyObject_HEAD_INIT(NULL)
Fredrik Lundh6f013982000-07-03 18:44:21 +00001647 0, "SRE_Pattern",
1648 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001649 (destructor)pattern_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001650 0, /*tp_print*/
Fredrik Lundh6f013982000-07-03 18:44:21 +00001651 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001652};
1653
1654/* -------------------------------------------------------------------- */
1655/* match methods */
1656
1657static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001658match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001659{
1660 Py_XDECREF(self->string);
1661 Py_DECREF(self->pattern);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001662 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001663}
1664
1665static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001666match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001667{
1668 if (index < 0 || index >= self->groups) {
1669 /* raise IndexError if we were given a bad group number */
1670 PyErr_SetString(
1671 PyExc_IndexError,
1672 "no such group"
1673 );
1674 return NULL;
1675 }
1676
Fredrik Lundh6f013982000-07-03 18:44:21 +00001677 index *= 2;
1678
1679 if (self->string == Py_None || self->mark[index] < 0) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001680 /* return default value if the string or group is undefined */
1681 Py_INCREF(def);
1682 return def;
Guido van Rossumb700df92000-03-31 14:59:30 +00001683 }
1684
1685 return PySequence_GetSlice(
Fredrik Lundh6f013982000-07-03 18:44:21 +00001686 self->string, self->mark[index], self->mark[index+1]
Guido van Rossumb700df92000-03-31 14:59:30 +00001687 );
1688}
1689
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001690static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001691match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001692{
Fredrik Lundh6f013982000-07-03 18:44:21 +00001693 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001694
1695 if (PyInt_Check(index))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001696 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001697
Fredrik Lundh6f013982000-07-03 18:44:21 +00001698 i = -1;
1699
1700 if (self->pattern->groupindex) {
1701 index = PyObject_GetItem(self->pattern->groupindex, index);
1702 if (index) {
1703 if (PyInt_Check(index))
1704 i = (int) PyInt_AS_LONG(index);
1705 Py_DECREF(index);
1706 } else
1707 PyErr_Clear();
1708 }
1709
1710 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001711}
1712
1713static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001714match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001715{
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001716 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001717}
1718
1719static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001720match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001721{
1722 PyObject* result;
1723 int i, size;
1724
1725 size = PyTuple_GET_SIZE(args);
1726
1727 switch (size) {
1728 case 0:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001729 result = match_getslice(self, Py_False, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001730 break;
1731 case 1:
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001732 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001733 break;
1734 default:
1735 /* fetch multiple items */
1736 result = PyTuple_New(size);
1737 if (!result)
1738 return NULL;
1739 for (i = 0; i < size; i++) {
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001740 PyObject* item = match_getslice(
1741 self, PyTuple_GET_ITEM(args, i), Py_None
1742 );
Guido van Rossumb700df92000-03-31 14:59:30 +00001743 if (!item) {
1744 Py_DECREF(result);
1745 return NULL;
1746 }
1747 PyTuple_SET_ITEM(result, i, item);
1748 }
1749 break;
1750 }
1751 return result;
1752}
1753
1754static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001755match_groups(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001756{
1757 PyObject* result;
1758 int index;
1759
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001760 PyObject* def = Py_None;
1761 if (!PyArg_ParseTuple(args, "|O", &def))
1762 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001763
Guido van Rossumb700df92000-03-31 14:59:30 +00001764 result = PyTuple_New(self->groups-1);
1765 if (!result)
1766 return NULL;
1767
1768 for (index = 1; index < self->groups; index++) {
1769 PyObject* item;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001770 item = match_getslice_by_index(self, index, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001771 if (!item) {
1772 Py_DECREF(result);
1773 return NULL;
1774 }
1775 PyTuple_SET_ITEM(result, index-1, item);
1776 }
1777
1778 return result;
1779}
1780
1781static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001782match_groupdict(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001783{
1784 PyObject* result;
1785 PyObject* keys;
1786 int index;
1787
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001788 PyObject* def = Py_None;
1789 if (!PyArg_ParseTuple(args, "|O", &def))
1790 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001791
Guido van Rossumb700df92000-03-31 14:59:30 +00001792 result = PyDict_New();
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001793 if (!result || !self->pattern->groupindex)
Guido van Rossumb700df92000-03-31 14:59:30 +00001794 return result;
1795
1796 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001797 if (!keys) {
1798 Py_DECREF(result);
Guido van Rossumb700df92000-03-31 14:59:30 +00001799 return NULL;
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001800 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001801
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001802 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001803 PyObject* key;
1804 PyObject* item;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001805 key = PyList_GET_ITEM(keys, index);
Guido van Rossumb700df92000-03-31 14:59:30 +00001806 if (!key) {
1807 Py_DECREF(keys);
1808 Py_DECREF(result);
1809 return NULL;
1810 }
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001811 item = match_getslice(self, key, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001812 if (!item) {
1813 Py_DECREF(key);
1814 Py_DECREF(keys);
1815 Py_DECREF(result);
1816 return NULL;
1817 }
1818 /* FIXME: <fl> this can fail, right? */
1819 PyDict_SetItem(result, key, item);
1820 }
1821
1822 Py_DECREF(keys);
1823
1824 return result;
1825}
1826
1827static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001828match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001829{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001830 int index;
1831
Fredrik Lundh28552902000-07-05 21:14:16 +00001832 PyObject* index_ = Py_False; /* zero */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001833 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001834 return NULL;
1835
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001836 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001837
Guido van Rossumb700df92000-03-31 14:59:30 +00001838 if (index < 0 || index >= self->groups) {
1839 PyErr_SetString(
1840 PyExc_IndexError,
1841 "no such group"
1842 );
1843 return NULL;
1844 }
1845
1846 if (self->mark[index*2] < 0) {
1847 Py_INCREF(Py_None);
1848 return Py_None;
1849 }
1850
1851 return Py_BuildValue("i", self->mark[index*2]);
1852}
1853
1854static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001855match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001856{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001857 int index;
1858
Fredrik Lundh28552902000-07-05 21:14:16 +00001859 PyObject* index_ = Py_False; /* zero */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001860 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001861 return NULL;
1862
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001863 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001864
Guido van Rossumb700df92000-03-31 14:59:30 +00001865 if (index < 0 || index >= self->groups) {
1866 PyErr_SetString(
1867 PyExc_IndexError,
1868 "no such group"
1869 );
1870 return NULL;
1871 }
1872
1873 if (self->mark[index*2] < 0) {
1874 Py_INCREF(Py_None);
1875 return Py_None;
1876 }
1877
1878 return Py_BuildValue("i", self->mark[index*2+1]);
1879}
1880
1881static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001882match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001883{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001884 int index;
1885
Fredrik Lundh28552902000-07-05 21:14:16 +00001886 PyObject* index_ = Py_False; /* zero */
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001887 if (!PyArg_ParseTuple(args, "|O", &index_))
Guido van Rossumb700df92000-03-31 14:59:30 +00001888 return NULL;
1889
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001890 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001891
Guido van Rossumb700df92000-03-31 14:59:30 +00001892 if (index < 0 || index >= self->groups) {
1893 PyErr_SetString(
1894 PyExc_IndexError,
1895 "no such group"
1896 );
1897 return NULL;
1898 }
1899
1900 if (self->mark[index*2] < 0) {
1901 Py_INCREF(Py_None);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001902 Py_INCREF(Py_None);
1903 return Py_BuildValue("OO", Py_None, Py_None);
Guido van Rossumb700df92000-03-31 14:59:30 +00001904 }
1905
1906 return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
1907}
1908
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001909static PyMethodDef match_methods[] = {
1910 {"group", (PyCFunction) match_group, 1},
1911 {"start", (PyCFunction) match_start, 1},
1912 {"end", (PyCFunction) match_end, 1},
1913 {"span", (PyCFunction) match_span, 1},
1914 {"groups", (PyCFunction) match_groups, 1},
1915 {"groupdict", (PyCFunction) match_groupdict, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00001916 {NULL, NULL}
1917};
1918
1919static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001920match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001921{
1922 PyObject* res;
1923
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001924 res = Py_FindMethod(match_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00001925 if (res)
1926 return res;
1927
1928 PyErr_Clear();
1929
Fredrik Lundhc2301732000-07-02 22:25:39 +00001930 if (!strcmp(name, "lastindex")) {
1931 /* experimental */
Fredrik Lundh6f013982000-07-03 18:44:21 +00001932 if (self->lastindex >= 0)
1933 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00001934 Py_INCREF(Py_None);
1935 return Py_None;
1936 }
1937
1938 if (!strcmp(name, "lastgroup")) {
1939 /* experimental */
Fredrik Lundh6f013982000-07-03 18:44:21 +00001940 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00001941 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00001942 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00001943 );
1944 if (result)
1945 return result;
1946 PyErr_Clear();
1947 }
1948 Py_INCREF(Py_None);
1949 return Py_None;
1950 }
1951
Guido van Rossumb700df92000-03-31 14:59:30 +00001952 if (!strcmp(name, "string")) {
1953 Py_INCREF(self->string);
1954 return self->string;
1955 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001956
Guido van Rossumb700df92000-03-31 14:59:30 +00001957 if (!strcmp(name, "re")) {
1958 Py_INCREF(self->pattern);
1959 return (PyObject*) self->pattern;
1960 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001961
Guido van Rossumb700df92000-03-31 14:59:30 +00001962 if (!strcmp(name, "pos"))
Fredrik Lundh6f013982000-07-03 18:44:21 +00001963 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001964
Guido van Rossumb700df92000-03-31 14:59:30 +00001965 if (!strcmp(name, "endpos"))
Fredrik Lundh6f013982000-07-03 18:44:21 +00001966 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00001967
1968 PyErr_SetString(PyExc_AttributeError, name);
1969 return NULL;
1970}
1971
1972/* FIXME: implement setattr("string", None) as a special case (to
1973 detach the associated string, if any */
1974
1975statichere PyTypeObject Match_Type = {
1976 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001977 0, "SRE_Match",
Fredrik Lundh6f013982000-07-03 18:44:21 +00001978 sizeof(MatchObject), sizeof(int),
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001979 (destructor)match_dealloc, /*tp_dealloc*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001980 0, /*tp_print*/
Fredrik Lundh6f013982000-07-03 18:44:21 +00001981 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00001982};
1983
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001984/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001985/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001986
1987static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001988scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001989{
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001990 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001991 Py_DECREF(self->string);
1992 Py_DECREF(self->pattern);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001993 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001994}
1995
1996static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001997scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001998{
1999 SRE_STATE* state = &self->state;
2000 PyObject* match;
2001 int status;
2002
2003 state->ptr = state->start;
2004
2005 if (state->charsize == 1) {
2006 status = sre_match(state, PatternObject_GetCode(self->pattern));
2007 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002008#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002009 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002010#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002011 }
2012
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002013 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002014 state, self->string, status);
2015
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002016 if (status == 0 || state->ptr == state->start)
2017 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002018 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002019 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002020
2021 return match;
2022}
2023
2024
2025static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002026scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002027{
2028 SRE_STATE* state = &self->state;
2029 PyObject* match;
2030 int status;
2031
2032 state->ptr = state->start;
2033
2034 if (state->charsize == 1) {
2035 status = sre_search(state, PatternObject_GetCode(self->pattern));
2036 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002037#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002038 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002039#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002040 }
2041
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002042 match = pattern_new_match((PatternObject*) self->pattern,
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002043 state, self->string, status);
2044
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002045 if (status == 0 || state->ptr == state->start)
2046 state->start = (void*) ((char*) state->ptr + state->charsize);
2047 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002048 state->start = state->ptr;
2049
2050 return match;
2051}
2052
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002053static PyMethodDef scanner_methods[] = {
2054 {"match", (PyCFunction) scanner_match, 0},
2055 {"search", (PyCFunction) scanner_search, 0},
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002056 {NULL, NULL}
2057};
2058
2059static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002060scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002061{
2062 PyObject* res;
2063
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002064 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002065 if (res)
2066 return res;
2067
2068 PyErr_Clear();
2069
2070 /* attributes */
2071 if (!strcmp(name, "pattern")) {
2072 Py_INCREF(self->pattern);
2073 return self->pattern;
2074 }
2075
2076 PyErr_SetString(PyExc_AttributeError, name);
2077 return NULL;
2078}
2079
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002080statichere PyTypeObject Scanner_Type = {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002081 PyObject_HEAD_INIT(NULL)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002082 0, "SRE_Scanner",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002083 sizeof(ScannerObject), 0,
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002084 (destructor)scanner_dealloc, /*tp_dealloc*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002085 0, /*tp_print*/
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002086 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002087};
2088
Guido van Rossumb700df92000-03-31 14:59:30 +00002089static PyMethodDef _functions[] = {
2090 {"compile", _compile, 1},
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002091 {"getcodesize", sre_codesize, 1},
Fredrik Lundhb389df32000-06-29 12:48:37 +00002092 {"getlower", sre_getlower, 1},
Guido van Rossumb700df92000-03-31 14:59:30 +00002093 {NULL, NULL}
2094};
2095
2096void
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002097#if defined(WIN32)
Guido van Rossumb700df92000-03-31 14:59:30 +00002098__declspec(dllexport)
2099#endif
2100init_sre()
2101{
2102 /* Patch object types */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002103 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002104 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002105
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002106 Py_InitModule("_" MODULE, _functions);
Guido van Rossumb700df92000-03-31 14:59:30 +00002107}
2108
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002109#endif /* !defined(SRE_RECURSIVE) */