blob: 9943c3016e2c38335124d6dfa193d6304f4da9ea [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl created (based on existing template matcher code)
 * 2000-03-06 fl first alpha, sort of
 * 2000-06-30 fl added fast search optimization
 * 2000-06-30 fl added assert (lookahead) primitives, etc
 * 2000-07-02 fl added charset optimizations, etc
 * 2000-07-03 fl store code in pattern object, lookbehind, etc
 * 2000-07-08 fl added regs attribute
 * 2000-07-21 fl reset lastindex in scanner methods
 * 2000-08-01 fl fixes for 1.6b1
 * 2000-08-03 fl added recursion limit
 * 2000-08-07 fl use PyOS_CheckStack() if available
 * 2000-08-08 fl changed findall to return empty strings instead of None
 * 2000-08-27 fl properly propagate memory errors
 * 2000-09-02 fl return -1 instead of None for start/end/span
 * 2000-09-20 fl added expand method
 * 2000-09-21 fl don't use the buffer interface for unicode strings
 * 2000-10-03 fl fixed assert_not primitive; support keyword arguments
 * 2000-10-24 fl really fixed assert_not; reset groups in findall
 * 2000-12-21 fl fixed memory leak in groupdict
 * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
 * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
 * 2001-01-16 fl fixed memory leak in pattern destructor
 * 2001-03-20 fl lots of fixes for 2.1b2
 * 2001-04-15 fl export copyright as Python attribute, not global
 * 2001-04-28 fl added __copy__ methods (work in progress)
 * 2001-05-14 fl fixes for 1.5.2
 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
 *
 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license. For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
45
46#ifndef SRE_RECURSIVE
47
/* version/copyright banner for the engine (the string is kept verbatim) */
static char copyright[] =
    " SRE 2.1.1 Copyright (c) 1997-2001 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000050
51#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000052#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000053
54#include "sre.h"
55
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000056#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000057
/* name of this module, minus the leading underscore */
#if !defined(SRE_MODULE)
#define SRE_MODULE "sre"
#endif

/* defining this one enables tracing */
#undef VERBOSE

#if PY_VERSION_HEX >= 0x01060000 && defined(Py_USING_UNICODE)
/* defining this enables unicode support (default under 1.6a1 and later) */
#define HAVE_UNICODE
#endif
70
/* -------------------------------------------------------------------- */
/* optional features */

/* prevent run-away recursion (bad patterns on long strings) */

/* the fixed limit is only configured when USE_STACKCHECK is not defined */
#if !defined(USE_STACKCHECK)
#if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
/* require smaller recursion limit for a number of 64-bit platforms:
   Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
/* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
#define USE_RECURSION_LIMIT 7500
#else
#define USE_RECURSION_LIMIT 10000
#endif
#endif

/* enables fast searching */
#define USE_FAST_SEARCH

/* enables aggressive inlining (always on for Visual C) */
#undef USE_INLINE

/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY

/* older interpreters: map PyObject_DEL onto PyMem_DEL */
#if PY_VERSION_HEX < 0x01060000
#define PyObject_DEL(op) PyMem_DEL((op))
#endif
99
/* -------------------------------------------------------------------- */

/* LOCAL(type) declares a file-local function returning `type`, using the
   cheapest calling convention / inlining the compiler setup allows */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif

/* error codes */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */

/* TRACE takes a parenthesized printf argument list, e.g.
   TRACE(("x=%d\n", x)); it expands to nothing unless VERBOSE is defined */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
124
/* -------------------------------------------------------------------- */
/* search engine state */

/* default character predicates (run sre_chars.py to regenerate tables) */

/* bit flags stored per character in sre_char_info */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* classification flags for character codes 0..127 */
static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };

/* lowercase mapping for character codes 0..127 (65..90 map to 97..122) */
static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127 };

/* each predicate yields the (non-zero) mask bit when set, else 0; codes
   >= 128 never match.  note: `ch` is evaluated twice and is used as an
   array index, so callers must pass a non-negative, side-effect free
   expression */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

/* table-driven lowercase conversion; codes >= 128 are returned unchanged */
static unsigned int sre_lower(unsigned int ch)
{
    return ((ch) < 128 ? sre_char_lower[ch] : ch);
}
169
/* locale-specific character predicates */

/* these defer to <ctype.h> for codes below 256 and report "no match" for
   anything larger; `ch` is evaluated more than once and must be
   non-negative */
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* lowercase conversion via tolower(); codes >= 256 are returned unchanged */
static unsigned int sre_lower_locale(unsigned int ch)
{
    return ((ch) < 256 ? tolower((ch)) : ch);
}
182
/* unicode-specific character predicates */

#if defined(HAVE_UNICODE)

/* thin wrappers over the Py_UNICODE_* classification macros; the argument
   is cast to Py_UNICODE before classification */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

/* lowercase conversion via Py_UNICODE_TOLOWER */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}

#endif
199
Guido van Rossumb700df92000-03-31 14:59:30 +0000200LOCAL(int)
201sre_category(SRE_CODE category, unsigned int ch)
202{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000203 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000204
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000205 case SRE_CATEGORY_DIGIT:
206 return SRE_IS_DIGIT(ch);
207 case SRE_CATEGORY_NOT_DIGIT:
208 return !SRE_IS_DIGIT(ch);
209 case SRE_CATEGORY_SPACE:
210 return SRE_IS_SPACE(ch);
211 case SRE_CATEGORY_NOT_SPACE:
212 return !SRE_IS_SPACE(ch);
213 case SRE_CATEGORY_WORD:
214 return SRE_IS_WORD(ch);
215 case SRE_CATEGORY_NOT_WORD:
216 return !SRE_IS_WORD(ch);
217 case SRE_CATEGORY_LINEBREAK:
218 return SRE_IS_LINEBREAK(ch);
219 case SRE_CATEGORY_NOT_LINEBREAK:
220 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000221
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000222 case SRE_CATEGORY_LOC_WORD:
223 return SRE_LOC_IS_WORD(ch);
224 case SRE_CATEGORY_LOC_NOT_WORD:
225 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000226
227#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000228 case SRE_CATEGORY_UNI_DIGIT:
229 return SRE_UNI_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_NOT_DIGIT:
231 return !SRE_UNI_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_SPACE:
233 return SRE_UNI_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_NOT_SPACE:
235 return !SRE_UNI_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_WORD:
237 return SRE_UNI_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_NOT_WORD:
239 return !SRE_UNI_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_LINEBREAK:
241 return SRE_UNI_IS_LINEBREAK(ch);
242 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
243 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000244#else
245 case SRE_CATEGORY_UNI_DIGIT:
246 return SRE_IS_DIGIT(ch);
247 case SRE_CATEGORY_UNI_NOT_DIGIT:
248 return !SRE_IS_DIGIT(ch);
249 case SRE_CATEGORY_UNI_SPACE:
250 return SRE_IS_SPACE(ch);
251 case SRE_CATEGORY_UNI_NOT_SPACE:
252 return !SRE_IS_SPACE(ch);
253 case SRE_CATEGORY_UNI_WORD:
254 return SRE_LOC_IS_WORD(ch);
255 case SRE_CATEGORY_UNI_NOT_WORD:
256 return !SRE_LOC_IS_WORD(ch);
257 case SRE_CATEGORY_UNI_LINEBREAK:
258 return SRE_IS_LINEBREAK(ch);
259 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
260 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000261#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000262 }
263 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000264}
265
266/* helpers */
267
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000268static void
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000269mark_fini(SRE_STATE* state)
270{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000271 if (state->mark_stack) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272 free(state->mark_stack);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000273 state->mark_stack = NULL;
274 }
275 state->mark_stack_size = state->mark_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000276}
277
278static int
279mark_save(SRE_STATE* state, int lo, int hi)
280{
281 void* stack;
282 int size;
283 int minsize, newsize;
284
285 if (hi <= lo)
286 return 0;
287
288 size = (hi - lo) + 1;
289
290 newsize = state->mark_stack_size;
291 minsize = state->mark_stack_base + size;
292
293 if (newsize < minsize) {
294 /* create new stack */
295 if (!newsize) {
296 newsize = 512;
297 if (newsize < minsize)
298 newsize = minsize;
299 TRACE(("allocate stack %d\n", newsize));
300 stack = malloc(sizeof(void*) * newsize);
301 } else {
302 /* grow the stack */
303 while (newsize < minsize)
304 newsize += newsize;
305 TRACE(("grow stack to %d\n", newsize));
306 stack = realloc(state->mark_stack, sizeof(void*) * newsize);
307 }
308 if (!stack) {
309 mark_fini(state);
310 return SRE_ERROR_MEMORY;
311 }
312 state->mark_stack = stack;
313 state->mark_stack_size = newsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000314 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000315
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000316 TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000317
318 memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
319 size * sizeof(void*));
320
321 state->mark_stack_base += size;
322
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000323 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000324}
325
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000326static int
327mark_restore(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000328{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000329 int size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000330
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000331 if (hi <= lo)
332 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000333
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000334 size = (hi - lo) + 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000335
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000336 state->mark_stack_base -= size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000337
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000338 TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
Guido van Rossumb700df92000-03-31 14:59:30 +0000339
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000340 memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
341 size * sizeof(void*));
Guido van Rossumb700df92000-03-31 14:59:30 +0000342
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000343 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000344}
345
/* generate 8-bit version */

/* the engine section further down is written against these names; they
   select the character type and the names of the generated functions */
#define SRE_CHAR unsigned char
#define SRE_AT sre_at
#define SRE_COUNT sre_count
#define SRE_CHARSET sre_charset
#define SRE_INFO sre_info
#define SRE_MATCH sre_match
#define SRE_SEARCH sre_search

#if defined(HAVE_UNICODE)

/* include this file into itself with SRE_RECURSIVE set, so the 8-bit
   definitions above are in effect for the included copy */
#define SRE_RECURSIVE
#include "_sre.c"
#undef SRE_RECURSIVE

#undef SRE_SEARCH
#undef SRE_MATCH
#undef SRE_INFO
#undef SRE_CHARSET
#undef SRE_COUNT
#undef SRE_AT
#undef SRE_CHAR

/* generate 16-bit unicode version */

#define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat
#define SRE_COUNT sre_ucount
#define SRE_CHARSET sre_ucharset
#define SRE_INFO sre_uinfo
#define SRE_MATCH sre_umatch
#define SRE_SEARCH sre_usearch
#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000380
381#endif /* SRE_RECURSIVE */
382
383/* -------------------------------------------------------------------- */
384/* String matching engine */
385
386/* the following section is compiled twice, with different character
387 settings */
388
389LOCAL(int)
390SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
391{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000392 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000393
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000394 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000395
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000396 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000397
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000398 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000399 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000400 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000401
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000402 case SRE_AT_BEGINNING_LINE:
403 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000404 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000405
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000406 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000407 return (((void*) (ptr+1) == state->end &&
408 SRE_IS_LINEBREAK((int) ptr[0])) ||
409 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000410
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000411 case SRE_AT_END_LINE:
412 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000413 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000414
Fredrik Lundh770617b2001-01-14 15:06:11 +0000415 case SRE_AT_END_STRING:
416 return ((void*) ptr == state->end);
417
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000418 case SRE_AT_BOUNDARY:
419 if (state->beginning == state->end)
420 return 0;
421 that = ((void*) ptr > state->beginning) ?
422 SRE_IS_WORD((int) ptr[-1]) : 0;
423 this = ((void*) ptr < state->end) ?
424 SRE_IS_WORD((int) ptr[0]) : 0;
425 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000426
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000427 case SRE_AT_NON_BOUNDARY:
428 if (state->beginning == state->end)
429 return 0;
430 that = ((void*) ptr > state->beginning) ?
431 SRE_IS_WORD((int) ptr[-1]) : 0;
432 this = ((void*) ptr < state->end) ?
433 SRE_IS_WORD((int) ptr[0]) : 0;
434 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000435
436 case SRE_AT_LOC_BOUNDARY:
437 if (state->beginning == state->end)
438 return 0;
439 that = ((void*) ptr > state->beginning) ?
440 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
441 this = ((void*) ptr < state->end) ?
442 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
443 return this != that;
444
445 case SRE_AT_LOC_NON_BOUNDARY:
446 if (state->beginning == state->end)
447 return 0;
448 that = ((void*) ptr > state->beginning) ?
449 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
450 this = ((void*) ptr < state->end) ?
451 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
452 return this == that;
453
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000454#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000455 case SRE_AT_UNI_BOUNDARY:
456 if (state->beginning == state->end)
457 return 0;
458 that = ((void*) ptr > state->beginning) ?
459 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
460 this = ((void*) ptr < state->end) ?
461 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
462 return this != that;
463
464 case SRE_AT_UNI_NON_BOUNDARY:
465 if (state->beginning == state->end)
466 return 0;
467 that = ((void*) ptr > state->beginning) ?
468 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
469 this = ((void*) ptr < state->end) ?
470 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
471 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000472#endif
473
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000474 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000475
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000476 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000477}
478
479LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000480SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000481{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000482 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000483
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000484 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000485
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000486 for (;;) {
487 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000488
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000489 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000490 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000491 if (ch == set[0])
492 return ok;
493 set++;
494 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000495
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000496 case SRE_OP_RANGE:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000497 /* <RANGE> <lower> <upper> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000498 if (set[0] <= ch && ch <= set[1])
499 return ok;
500 set += 2;
501 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000502
Fredrik Lundh3562f112000-07-02 12:00:07 +0000503 case SRE_OP_CHARSET:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000504 /* <CHARSET> <bitmap> (16 bits per code word) */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000505 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
506 return ok;
507 set += 16;
508 break;
509
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000510 case SRE_OP_BIGCHARSET:
511 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
512 {
513 int count, block;
514 count = *(set++);
515 block = ((unsigned char*)set)[ch >> 8];
516 set += 128;
517 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
518 return ok;
519 set += count*16;
520 break;
521 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000522
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000523 case SRE_OP_CATEGORY:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000524 /* <CATEGORY> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000525 if (sre_category(set[0], (int) ch))
526 return ok;
527 set += 1;
528 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000529
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000530 case SRE_OP_NEGATE:
531 ok = !ok;
532 break;
533
534 case SRE_OP_FAILURE:
535 return !ok;
536
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000537 default:
538 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000539 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000540 return 0;
541 }
542 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000543}
544
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000545LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level);
546
547LOCAL(int)
548SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
549{
550 SRE_CODE chr;
551 SRE_CHAR* ptr = state->ptr;
552 SRE_CHAR* end = state->end;
553 int i;
554
555 /* adjust end */
556 if (maxcount < end - ptr && maxcount != 65535)
557 end = ptr + maxcount;
558
559 switch (pattern[0]) {
560
561 case SRE_OP_ANY:
562 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000563 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000564 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
565 ptr++;
566 break;
567
568 case SRE_OP_ANY_ALL:
569 /* repeated dot wildcare. skip to the end of the target
570 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000571 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000572 ptr = end;
573 break;
574
575 case SRE_OP_LITERAL:
576 /* repeated literal */
577 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000578 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000579 while (ptr < end && (SRE_CODE) *ptr == chr)
580 ptr++;
581 break;
582
583 case SRE_OP_LITERAL_IGNORE:
584 /* repeated literal */
585 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000586 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000587 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
588 ptr++;
589 break;
590
591 case SRE_OP_NOT_LITERAL:
592 /* repeated non-literal */
593 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000594 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000595 while (ptr < end && (SRE_CODE) *ptr != chr)
596 ptr++;
597 break;
598
599 case SRE_OP_NOT_LITERAL_IGNORE:
600 /* repeated non-literal */
601 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000602 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000603 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
604 ptr++;
605 break;
606
607 case SRE_OP_IN:
608 /* repeated set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000609 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
610 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000611 ptr++;
612 break;
613
614 default:
615 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000616 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000617 while ((SRE_CHAR*) state->ptr < end) {
618 i = SRE_MATCH(state, pattern, level);
619 if (i < 0)
620 return i;
621 if (!i)
622 break;
623 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000624 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
625 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000626 return (SRE_CHAR*) state->ptr - ptr;
627 }
628
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000629 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000630 return ptr - (SRE_CHAR*) state->ptr;
631}
632
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    SRE_CHAR* end = state->end;
    SRE_CHAR* ptr = state->ptr;
    int i;

    /* check minimal length */
    if (pattern[3] && (end - ptr) < pattern[3])
        return 0;

    /* check known prefix */
    if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
        /* <length> <skip> <prefix data> <overlap data> */
        for (i = 0; i < pattern[5]; i++)
            if ((SRE_CODE) ptr[i] != pattern[7 + i])
                return 0;
        return pattern[0] + 2 * pattern[6];
    }
    return pattern[0];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000660
661LOCAL(int)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000662SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
Guido van Rossumb700df92000-03-31 14:59:30 +0000663{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000664 /* check if string matches the given pattern. returns <0 for
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000665 error, 0 for failure, and 1 for success */
Guido van Rossumb700df92000-03-31 14:59:30 +0000666
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000667 SRE_CHAR* end = state->end;
668 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000669 int i, count;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000670 SRE_REPEAT* rp;
671 int lastmark;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000672 SRE_CODE chr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000673
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000674 SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000675
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000676 TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000677
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000678#if defined(USE_STACKCHECK)
Fredrik Lundh58100642000-08-09 09:14:35 +0000679 if (level % 10 == 0 && PyOS_CheckStack())
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000680 return SRE_ERROR_RECURSION_LIMIT;
681#endif
682
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000683#if defined(USE_RECURSION_LIMIT)
684 if (level > USE_RECURSION_LIMIT)
685 return SRE_ERROR_RECURSION_LIMIT;
686#endif
687
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000688 if (pattern[0] == SRE_OP_INFO) {
689 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000690 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000691 if (pattern[3] && (end - ptr) < pattern[3]) {
692 TRACE(("reject (got %d chars, need %d)\n",
693 (end - ptr), pattern[3]));
694 return 0;
695 }
696 pattern += pattern[1] + 1;
697 }
698
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000699 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000700
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000701 switch (*pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000702
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000703 case SRE_OP_FAILURE:
704 /* immediate failure */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000705 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000706 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000707
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000708 case SRE_OP_SUCCESS:
709 /* end of pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000710 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000711 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000712 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000713
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000714 case SRE_OP_AT:
715 /* match at given position */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000716 /* <AT> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000717 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000718 if (!SRE_AT(state, ptr, *pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000719 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000720 pattern++;
721 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000722
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000723 case SRE_OP_CATEGORY:
724 /* match at given category */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000725 /* <CATEGORY> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000726 TRACE(("|%p|%p|CATEGORY %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000727 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000728 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000729 pattern++;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000730 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000731 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000732
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000733 case SRE_OP_LITERAL:
734 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000735 /* <LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000736 TRACE(("|%p|%p|LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000737 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000738 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000739 pattern++;
740 ptr++;
741 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000742
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000743 case SRE_OP_NOT_LITERAL:
744 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000745 /* <NOT_LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000746 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000747 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000748 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000749 pattern++;
750 ptr++;
751 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000752
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000753 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000754 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000755 /* <ANY> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000756 TRACE(("|%p|%p|ANY\n", pattern, ptr));
Fredrik Lundhe1869832000-08-01 22:47:49 +0000757 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
758 return 0;
759 ptr++;
760 break;
761
762 case SRE_OP_ANY_ALL:
763 /* match anything */
764 /* <ANY_ALL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000765 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000766 if (ptr >= end)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000767 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000768 ptr++;
769 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000770
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000771 case SRE_OP_IN:
772 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000773 /* <IN> <skip> <set> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000774 TRACE(("|%p|%p|IN\n", pattern, ptr));
775 if (ptr >= end || !SRE_CHARSET(pattern + 1, *ptr))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000776 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000777 pattern += pattern[0];
778 ptr++;
779 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000780
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000781 case SRE_OP_GROUPREF:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000782 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000783 TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000784 i = pattern[0];
785 {
786 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
787 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
788 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000789 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000790 while (p < e) {
791 if (ptr >= end || *ptr != *p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000792 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000793 p++; ptr++;
794 }
795 }
796 pattern++;
797 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000798
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000799 case SRE_OP_GROUPREF_IGNORE:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000800 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000801 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000802 i = pattern[0];
803 {
804 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
805 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
806 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000807 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000808 while (p < e) {
809 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000810 state->lower(*ptr) != state->lower(*p))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000811 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000812 p++; ptr++;
813 }
814 }
815 pattern++;
816 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000817
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000818 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000819 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000820 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000821 state->lower(*ptr) != state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000822 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000823 pattern++;
824 ptr++;
825 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000826
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000827 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000828 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000829 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000830 state->lower(*ptr) == state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000831 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000832 pattern++;
833 ptr++;
834 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000835
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000836 case SRE_OP_IN_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000837 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000838 if (ptr >= end
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000839 || !SRE_CHARSET(pattern + 1, (SRE_CODE) state->lower(*ptr)))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000840 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000841 pattern += pattern[0];
842 ptr++;
843 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000844
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000845 case SRE_OP_MARK:
846 /* set mark */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000847 /* <MARK> <gid> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000848 TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000849 i = pattern[0];
850 if (i & 1)
851 state->lastindex = i/2 + 1;
852 if (i > state->lastmark)
853 state->lastmark = i;
854 state->mark[i] = ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000855 pattern++;
856 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000857
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000858 case SRE_OP_JUMP:
859 case SRE_OP_INFO:
860 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000861 /* <JUMP> <offset> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000862 TRACE(("|%p|%p|JUMP %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000863 pattern += pattern[0];
864 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000865
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000866 case SRE_OP_ASSERT:
867 /* assert subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000868 /* <ASSERT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000869 TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 state->ptr = ptr - pattern[1];
Fredrik Lundh6f013982000-07-03 18:44:21 +0000871 if (state->ptr < state->beginning)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000872 return 0;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000873 i = SRE_MATCH(state, pattern + 2, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000874 if (i <= 0)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000875 return i;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000876 pattern += pattern[0];
877 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000878
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000879 case SRE_OP_ASSERT_NOT:
880 /* assert not subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000881 /* <ASSERT_NOT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000882 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000883 state->ptr = ptr - pattern[1];
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000884 if (state->ptr >= state->beginning) {
885 i = SRE_MATCH(state, pattern + 2, level + 1);
886 if (i < 0)
887 return i;
888 if (i)
889 return 0;
890 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000891 pattern += pattern[0];
892 break;
893
894 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000895 /* alternation */
896 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000897 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000898 lastmark = state->lastmark;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000899 for (; pattern[0]; pattern += pattern[0]) {
900 if (pattern[1] == SRE_OP_LITERAL &&
901 (ptr >= end || (SRE_CODE) *ptr != pattern[2]))
902 continue;
903 if (pattern[1] == SRE_OP_IN &&
904 (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr)))
905 continue;
906 state->ptr = ptr;
907 i = SRE_MATCH(state, pattern + 1, level + 1);
908 if (i)
909 return i;
910 if (state->lastmark > lastmark) {
911 memset(
912 state->mark + lastmark + 1, 0,
913 (state->lastmark - lastmark) * sizeof(void*)
914 );
915 state->lastmark = lastmark;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000916 }
917 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000918 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000919
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000920 case SRE_OP_REPEAT_ONE:
921 /* match repeated sequence (maximizing regexp) */
922
923 /* this operator only works if the repeated item is
924 exactly one character wide, and we're not already
925 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +0000926 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000927
928 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
929
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000930 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000931 pattern[1], pattern[2]));
932
Fredrik Lundhe1869832000-08-01 22:47:49 +0000933 if (ptr + pattern[1] > end)
934 return 0; /* cannot match */
935
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000936 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000937
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000938 count = SRE_COUNT(state, pattern + 3, pattern[2], level + 1);
939 if (count < 0)
940 return count;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000941
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000942 ptr += count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000943
944 /* when we arrive here, count contains the number of
945 matches, and ptr points to the tail of the target
946 string. check if the rest of the pattern matches,
947 and backtrack if not. */
948
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000949 if (count < (int) pattern[1])
950 return 0;
951
952 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
953 /* tail is empty. we're finished */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000954 state->ptr = ptr;
955 return 1;
956
957 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
958 /* tail starts with a literal. skip positions where
959 the rest of the pattern cannot possibly match */
Fredrik Lundhe1869832000-08-01 22:47:49 +0000960 chr = pattern[pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000961 for (;;) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000962 while (count >= (int) pattern[1] &&
963 (ptr >= end || *ptr != chr)) {
964 ptr--;
965 count--;
966 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000967 if (count < (int) pattern[1])
968 break;
969 state->ptr = ptr;
970 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000971 if (i)
Fredrik Lundh33accc12000-08-27 20:59:47 +0000972 return i;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000973 ptr--;
974 count--;
975 }
976
977 } else {
978 /* general case */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000979 lastmark = state->lastmark;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000980 while (count >= (int) pattern[1]) {
981 state->ptr = ptr;
982 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000983 if (i)
Fredrik Lundh33accc12000-08-27 20:59:47 +0000984 return i;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000985 ptr--;
986 count--;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000987 if (state->lastmark > lastmark) {
988 memset(
989 state->mark + lastmark + 1, 0,
990 (state->lastmark - lastmark) * sizeof(void*)
991 );
992 state->lastmark = lastmark;
993 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000994 }
995 }
996 return 0;
997
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000998 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000999 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001000 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001001 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001002 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001003 pattern[1], pattern[2]));
1004
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001005 rep.count = -1;
1006 rep.pattern = pattern;
1007
1008 /* install new repeat context */
1009 rep.prev = state->repeat;
1010 state->repeat = &rep;
1011
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001012 state->ptr = ptr;
1013 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001014
1015 state->repeat = rep.prev;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001016
1017 return i;
1018
1019 case SRE_OP_MAX_UNTIL:
1020 /* maximizing repeat */
1021 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1022
1023 /* FIXME: we probably need to deal with zero-width
1024 matches in here... */
1025
1026 rp = state->repeat;
1027 if (!rp)
1028 return SRE_ERROR_STATE;
1029
1030 state->ptr = ptr;
1031
1032 count = rp->count + 1;
1033
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001034 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001035
1036 if (count < rp->pattern[1]) {
1037 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001038 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001039 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001041 if (i)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001042 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001043 rp->count = count - 1;
1044 state->ptr = ptr;
1045 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001046 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001047
1048 if (count < rp->pattern[2] || rp->pattern[2] == 65535) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001049 /* we may have enough matches, but if we can
1050 match another item, do so */
1051 rp->count = count;
1052 lastmark = state->lastmark;
Fredrik Lundh33accc12000-08-27 20:59:47 +00001053 i = mark_save(state, 0, lastmark);
1054 if (i < 0)
1055 return i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001056 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001057 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001058 if (i)
1059 return i;
Fredrik Lundh33accc12000-08-27 20:59:47 +00001060 i = mark_restore(state, 0, lastmark);
1061 if (i < 0)
1062 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001063 rp->count = count - 1;
1064 state->ptr = ptr;
1065 }
1066
1067 /* cannot match more repeated items here. make sure the
1068 tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001069 state->repeat = rp->prev;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001070 i = SRE_MATCH(state, pattern, level + 1);
1071 if (i)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001072 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001073 state->repeat = rp;
Fredrik Lundh770617b2001-01-14 15:06:11 +00001074 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001075 return 0;
1076
1077 case SRE_OP_MIN_UNTIL:
1078 /* minimizing repeat */
1079 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1080
1081 rp = state->repeat;
1082 if (!rp)
1083 return SRE_ERROR_STATE;
1084
1085 count = rp->count + 1;
1086
Fredrik Lundh770617b2001-01-14 15:06:11 +00001087 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count,
1088 rp->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001089
1090 state->ptr = ptr;
1091
1092 if (count < rp->pattern[1]) {
1093 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001094 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001095 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001096 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001097 if (i)
1098 return i;
1099 rp->count = count-1;
1100 state->ptr = ptr;
1101 return 0;
1102 }
1103
1104 /* see if the tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001105 state->repeat = rp->prev;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +00001106 /* FIXME: the following fix doesn't always work (#133283) */
Fredrik Lundhdf781e62001-07-02 19:54:28 +00001107 if (rp->pattern[2] == 65535) {
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001108 /* unbounded repeat */
1109 for (;;) {
1110 i = SRE_MATCH(state, pattern, level + 1);
1111 if (i || ptr >= end)
1112 break;
1113 state->ptr = ++ptr;
1114 }
1115 } else
1116 i = SRE_MATCH(state, pattern, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001117 if (i) {
1118 /* free(rp); */
1119 return i;
1120 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001121
Fredrik Lundh770617b2001-01-14 15:06:11 +00001122 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001123 state->repeat = rp;
1124
1125 if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
1126 return 0;
1127
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001128 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001129 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001130 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001131 if (i)
1132 return i;
1133 rp->count = count - 1;
Fredrik Lundh770617b2001-01-14 15:06:11 +00001134 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001135 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001136
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001137 default:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001138 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001139 return SRE_ERROR_ILLEGAL;
1140 }
1141 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001142
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001143 /* shouldn't end up here */
1144 return SRE_ERROR_ILLEGAL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001145}
1146
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001147LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001148SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1149{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001150 SRE_CHAR* ptr = state->start;
1151 SRE_CHAR* end = state->end;
1152 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001153 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001154 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001155 SRE_CODE* prefix = NULL;
1156 SRE_CODE* charset = NULL;
1157 SRE_CODE* overlap = NULL;
1158 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001159
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001160 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001161 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001162 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001163
1164 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001165
1166 if (pattern[3] > 0) {
1167 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001168 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001169 end -= pattern[3]-1;
1170 if (end <= ptr)
1171 end = ptr+1;
1172 }
1173
Fredrik Lundh3562f112000-07-02 12:00:07 +00001174 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001175 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001176 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001177 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001178 prefix_skip = pattern[6];
1179 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001180 overlap = prefix + prefix_len - 1;
1181 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001182 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001183 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001184 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001185
1186 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001187 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001188
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001189 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1190 TRACE(("charset = %p\n", charset));
1191
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001192#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001193 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001194 /* pattern starts with a known prefix. use the overlap
1195 table to skip forward as fast as we possibly can */
1196 int i = 0;
1197 end = state->end;
1198 while (ptr < end) {
1199 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001200 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001201 if (!i)
1202 break;
1203 else
1204 i = overlap[i];
1205 } else {
1206 if (++i == prefix_len) {
1207 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001208 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1209 state->start = ptr + 1 - prefix_len;
1210 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001211 if (flags & SRE_INFO_LITERAL)
1212 return 1; /* we got all of it */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001213 status = SRE_MATCH(state, pattern + 2*prefix_skip, 1);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001214 if (status != 0)
1215 return status;
1216 /* close but no cigar -- try again */
1217 i = overlap[i];
1218 }
1219 break;
1220 }
1221
1222 }
1223 ptr++;
1224 }
1225 return 0;
1226 }
1227#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001228
Fredrik Lundh3562f112000-07-02 12:00:07 +00001229 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001230 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001231 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001232 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001233 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001234 for (;;) {
1235 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1236 ptr++;
1237 if (ptr == end)
1238 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001239 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001240 state->start = ptr;
1241 state->ptr = ++ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001242 status = SRE_MATCH(state, pattern + 2, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001243 if (status != 0)
1244 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001245 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001246 } else if (charset) {
1247 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001248 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001249 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001250 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001251 ptr++;
1252 if (ptr == end)
1253 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001254 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001255 state->start = ptr;
1256 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001257 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001258 if (status != 0)
1259 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001260 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001261 }
1262 } else
1263 /* general case */
1264 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001265 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001266 state->start = state->ptr = ptr++;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001267 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001268 if (status != 0)
1269 break;
1270 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001271
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001272 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001273}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001274
Guido van Rossumb700df92000-03-31 14:59:30 +00001275
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001276#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001277
1278/* -------------------------------------------------------------------- */
1279/* factories and destructors */
1280
1281/* see sre.h for object declarations */
1282
/* forward declarations of the module's type objects, so that code
   appearing before their definitions (e.g. _compile, which passes
   &Pattern_Type to PyObject_NEW_VAR) can refer to them */
1283staticforward PyTypeObject Pattern_Type;
1284staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001285staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001286
1287static PyObject *
1288_compile(PyObject* self_, PyObject* args)
1289{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001290 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001291
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001292 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001293 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001294
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001295 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001296 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001297 PyObject* code;
1298 int groups = 0;
1299 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001300 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001301 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1302 &PyList_Type, &code, &groups,
1303 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001304 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001305
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001306 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001307
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001308 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001309 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001310 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001311
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001312 self->codesize = n;
1313
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001314 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001315 PyObject *o = PyList_GET_ITEM(code, i);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001316 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001317 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001318
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001319 if (PyErr_Occurred()) {
1320 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001321 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001322 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001323
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001324 Py_INCREF(pattern);
1325 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001326
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001327 self->flags = flags;
1328
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001329 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001330
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001331 Py_XINCREF(groupindex);
1332 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001333
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001334 Py_XINCREF(indexgroup);
1335 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001336
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001337 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001338}
1339
1340static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001341sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001342{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001343 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001344}
1345
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001346static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001347sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001348{
1349 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001350 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001351 return NULL;
1352 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001353 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001354 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001355#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001356 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001357#else
1358 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001359#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001360 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001361}
1362
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001363LOCAL(void)
1364state_reset(SRE_STATE* state)
1365{
1366 int i;
1367
1368 state->lastmark = 0;
1369
1370 /* FIXME: dynamic! */
1371 for (i = 0; i < SRE_MARK_SIZE; i++)
1372 state->mark[i] = NULL;
1373
1374 state->lastindex = -1;
1375
1376 state->repeat = NULL;
1377
1378 mark_fini(state);
1379}
1380
Guido van Rossumb700df92000-03-31 14:59:30 +00001381LOCAL(PyObject*)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001382state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1383 int start, int end)
Guido van Rossumb700df92000-03-31 14:59:30 +00001384{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001385 /* prepare state object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001386
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001387 PyBufferProcs *buffer;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001388 int size, bytes;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001389 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001390
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001391 memset(state, 0, sizeof(SRE_STATE));
1392
1393 state->lastindex = -1;
1394
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001395#if defined(HAVE_UNICODE)
1396 if (PyUnicode_Check(string)) {
1397 /* unicode strings doesn't always support the buffer interface */
1398 ptr = (void*) PyUnicode_AS_DATA(string);
1399 bytes = PyUnicode_GET_DATA_SIZE(string);
1400 size = PyUnicode_GET_SIZE(string);
1401 state->charsize = sizeof(Py_UNICODE);
1402
1403 } else {
1404#endif
1405
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001406 /* get pointer to string buffer */
1407 buffer = string->ob_type->tp_as_buffer;
1408 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1409 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001410 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001411 return NULL;
1412 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001413
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001414 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001415 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1416 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001417 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1418 return NULL;
1419 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001420
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001421 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001422#if PY_VERSION_HEX >= 0x01060000
1423 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001424#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001425 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001426#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001427
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001428 if (PyString_Check(string) || bytes == size)
1429 state->charsize = 1;
1430#if defined(HAVE_UNICODE)
1431 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
1432 state->charsize = sizeof(Py_UNICODE);
1433#endif
1434 else {
1435 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1436 return NULL;
1437 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001438
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001439#if defined(HAVE_UNICODE)
1440 }
1441#endif
1442
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001443 /* adjust boundaries */
1444 if (start < 0)
1445 start = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001446 else if (start > size)
1447 start = size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001448
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001449 if (end < 0)
1450 end = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001451 else if (end > size)
1452 end = size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001453
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001454 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001455
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001456 state->start = (void*) ((char*) ptr + start * state->charsize);
1457 state->end = (void*) ((char*) ptr + end * state->charsize);
1458
1459 Py_INCREF(string);
1460 state->string = string;
1461 state->pos = start;
1462 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001463
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001464 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001465 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001466 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001467#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001468 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001469#else
1470 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001471#endif
1472 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001473 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001474
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001475 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001476}
1477
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001478LOCAL(void)
1479state_fini(SRE_STATE* state)
1480{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001481 Py_XDECREF(state->string);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001482 mark_fini(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001483}
1484
1485LOCAL(PyObject*)
1486state_getslice(SRE_STATE* state, int index, PyObject* string)
1487{
Fredrik Lundh58100642000-08-09 09:14:35 +00001488 int i, j;
1489
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001490 index = (index - 1) * 2;
1491
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001492 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh58100642000-08-09 09:14:35 +00001493 i = j = 0;
1494 } else {
1495 i = ((char*)state->mark[index] - (char*)state->beginning) /
1496 state->charsize;
1497 j = ((char*)state->mark[index+1] - (char*)state->beginning) /
1498 state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001499 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001500
Fredrik Lundh58100642000-08-09 09:14:35 +00001501 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001502}
1503
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001504static void
1505pattern_error(int status)
1506{
1507 switch (status) {
1508 case SRE_ERROR_RECURSION_LIMIT:
1509 PyErr_SetString(
1510 PyExc_RuntimeError,
1511 "maximum recursion limit exceeded"
1512 );
1513 break;
1514 case SRE_ERROR_MEMORY:
1515 PyErr_NoMemory();
1516 break;
1517 default:
1518 /* other error codes indicate compiler/engine bugs */
1519 PyErr_SetString(
1520 PyExc_RuntimeError,
1521 "internal error in regular expression engine"
1522 );
1523 }
1524}
1525
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001526static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001527pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001528{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001529 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001530
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001531 MatchObject* match;
1532 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001533 char* base;
1534 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001535
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001536 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001537
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001538 /* create match object (with room for extra group marks) */
1539 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001540 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001541 if (!match)
1542 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001543
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001544 Py_INCREF(pattern);
1545 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001546
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001547 Py_INCREF(state->string);
1548 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001549
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001550 match->regs = NULL;
1551 match->groups = pattern->groups+1;
1552
1553 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001554
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001555 base = (char*) state->beginning;
1556 n = state->charsize;
1557
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001558 match->mark[0] = ((char*) state->start - base) / n;
1559 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001560
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001561 for (i = j = 0; i < pattern->groups; i++, j+=2)
1562 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1563 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1564 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1565 } else
1566 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1567
1568 match->pos = state->pos;
1569 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001570
Fredrik Lundh6f013982000-07-03 18:44:21 +00001571 match->lastindex = state->lastindex;
1572
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001573 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001574
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001575 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001576
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001577 /* no match */
1578 Py_INCREF(Py_None);
1579 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001580
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001581 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001582
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001583 /* internal error */
1584 pattern_error(status);
1585 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001586}
1587
1588static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001589pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001590{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001592
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 ScannerObject* self;
1594
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001595 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 int start = 0;
1597 int end = INT_MAX;
1598 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1599 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001600
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001601 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001602 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001603 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001604 return NULL;
1605
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001606 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001607 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001608 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001609 return NULL;
1610 }
1611
1612 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001613 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001614
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001615 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001616}
1617
Guido van Rossumb700df92000-03-31 14:59:30 +00001618static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001619pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001620{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001621 Py_XDECREF(self->pattern);
1622 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001623 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001624 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001625}
1626
1627static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001628pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001629{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001630 SRE_STATE state;
1631 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001632
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001633 PyObject* string;
1634 int start = 0;
1635 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001636 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1637 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
1638 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001639 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001640
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001641 string = state_init(&state, self, string, start, end);
1642 if (!string)
1643 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001644
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001645 state.ptr = state.start;
1646
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001647 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1648
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001649 if (state.charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001650 status = sre_match(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001651 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001652#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001653 status = sre_umatch(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001654#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001655 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001656
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001657 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1658
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001659 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001660
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001661 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001662}
1663
1664static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001665pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001666{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001667 SRE_STATE state;
1668 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001669
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001670 PyObject* string;
1671 int start = 0;
1672 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001673 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1674 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
1675 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001677
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001678 string = state_init(&state, self, string, start, end);
1679 if (!string)
1680 return NULL;
1681
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001682 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1683
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001684 if (state.charsize == 1) {
1685 status = sre_search(&state, PatternObject_GetCode(self));
1686 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001687#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001688 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001689#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001690 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001691
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001692 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1693
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001694 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001695
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001696 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001697}
1698
1699static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001700call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001701{
1702 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001703 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001704 PyObject* func;
1705 PyObject* result;
1706
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001707 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001708 if (!name)
1709 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001710 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001711 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001712 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001713 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001714 func = PyObject_GetAttrString(mod, function);
1715 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001716 if (!func)
1717 return NULL;
1718 result = PyObject_CallObject(func, args);
1719 Py_DECREF(func);
1720 Py_DECREF(args);
1721 return result;
1722}
1723
#ifdef USE_BUILTIN_COPY
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* replace *object with the result of copy.deepcopy(*object, memo).

       on success the old reference is released and 1 is returned; on
       failure *object is left untouched and 0 is returned.  a NULL
       *object (an optional member that was never set) has nothing to
       copy and counts as success */

    PyObject* args;
    PyObject* copy;

    if (!*object)
        return 1;

    /* build the argument tuple here so that a failure is detected
       before anything is handed on to call() */
    args = Py_BuildValue("OO", *object, memo);
    if (!args)
        return 0;

    copy = call("copy", "deepcopy", args);
    if (!copy)
        return 0;

    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
1743
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001744static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001745pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001746{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001747 PyObject* template;
1748 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001749 PyObject* count = Py_False; /* zero */
Fredrik Lundh562586e2000-10-03 20:43:34 +00001750 static char* kwlist[] = { "repl", "string", "count", NULL };
1751 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:sub", kwlist,
1752 &template, &string, &count))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001753 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001754
1755 /* delegate to Python code */
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001756 return call(
1757 SRE_MODULE, "_sub",
1758 Py_BuildValue("OOOO", self, template, string, count)
1759 );
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001760}
1761
1762static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001763pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001764{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001765 PyObject* template;
1766 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001767 PyObject* count = Py_False; /* zero */
Fredrik Lundh562586e2000-10-03 20:43:34 +00001768 static char* kwlist[] = { "repl", "string", "count", NULL };
1769 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:subn", kwlist,
1770 &template, &string, &count))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001771 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001772
1773 /* delegate to Python code */
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001774 return call(
1775 SRE_MODULE, "_subn",
1776 Py_BuildValue("OOOO", self, template, string, count)
1777 );
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001778}
1779
1780static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001781pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001782{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001783 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001784 PyObject* maxsplit = Py_False; /* zero */
Fredrik Lundh562586e2000-10-03 20:43:34 +00001785 static char* kwlist[] = { "source", "maxsplit", NULL };
1786 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|O:split", kwlist,
1787 &string, &maxsplit))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001788 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001789
1790 /* delegate to Python code */
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001791 return call(
1792 SRE_MODULE, "_split",
1793 Py_BuildValue("OOO", self, string, maxsplit)
1794 );
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001795}
1796
1797static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001798pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001799{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001800 SRE_STATE state;
1801 PyObject* list;
1802 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001803 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001804
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001805 PyObject* string;
1806 int start = 0;
1807 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001808 static char* kwlist[] = { "source", "pos", "endpos", NULL };
1809 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
1810 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001811 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001812
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001813 string = state_init(&state, self, string, start, end);
1814 if (!string)
1815 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001816
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001817 list = PyList_New(0);
Guido van Rossumb700df92000-03-31 14:59:30 +00001818
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001819 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001820
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001821 PyObject* item;
1822
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001823 state_reset(&state);
1824
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001825 state.ptr = state.start;
1826
1827 if (state.charsize == 1) {
1828 status = sre_search(&state, PatternObject_GetCode(self));
1829 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001830#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001831 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001832#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001833 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001834
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001835 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001836
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001837 /* don't bother to build a match object */
1838 switch (self->groups) {
1839 case 0:
1840 item = PySequence_GetSlice(
1841 string,
1842 ((char*) state.start - (char*) state.beginning) /
1843 state.charsize,
1844 ((char*) state.ptr - (char*) state.beginning) /
1845 state.charsize);
1846 if (!item)
1847 goto error;
1848 break;
1849 case 1:
1850 item = state_getslice(&state, 1, string);
1851 if (!item)
1852 goto error;
1853 break;
1854 default:
1855 item = PyTuple_New(self->groups);
1856 if (!item)
1857 goto error;
1858 for (i = 0; i < self->groups; i++) {
1859 PyObject* o = state_getslice(&state, i+1, string);
1860 if (!o) {
1861 Py_DECREF(item);
1862 goto error;
1863 }
1864 PyTuple_SET_ITEM(item, i, o);
1865 }
1866 break;
1867 }
1868
Fredrik Lundhe67d8e52000-08-27 21:32:46 +00001869 status = PyList_Append(list, item);
1870 Py_DECREF(item);
1871
1872 if (status < 0)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001873 goto error;
Guido van Rossumb700df92000-03-31 14:59:30 +00001874
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001875 if (state.ptr == state.start)
1876 state.start = (void*) ((char*) state.ptr + state.charsize);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001877 else
1878 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001879
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001880 } else {
Guido van Rossumb700df92000-03-31 14:59:30 +00001881
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001882 if (status == 0)
1883 break;
1884
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001885 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001886 goto error;
Guido van Rossumb700df92000-03-31 14:59:30 +00001887
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001888 }
1889 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001890
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001891 state_fini(&state);
1892 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00001893
1894error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001895 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001896 state_fini(&state);
1897 return NULL;
1898
Guido van Rossumb700df92000-03-31 14:59:30 +00001899}
1900
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001901static PyObject*
1902pattern_copy(PatternObject* self, PyObject* args)
1903{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001904#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001905 PatternObject* copy;
1906 int offset;
1907
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001908 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
1909 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001910
1911 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
1912 if (!copy)
1913 return NULL;
1914
1915 offset = offsetof(PatternObject, groups);
1916
1917 Py_XINCREF(self->groupindex);
1918 Py_XINCREF(self->indexgroup);
1919 Py_XINCREF(self->pattern);
1920
1921 memcpy((char*) copy + offset, (char*) self + offset,
1922 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
1923
1924 return (PyObject*) copy;
1925#else
1926 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
1927 return NULL;
1928#endif
1929}
1930
1931static PyObject*
1932pattern_deepcopy(PatternObject* self, PyObject* args)
1933{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001934#ifdef USE_BUILTIN_COPY
1935 PatternObject* copy;
1936
1937 PyObject* memo;
1938 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
1939 return NULL;
1940
1941 copy = (PatternObject*) pattern_copy(self, Py_None);
1942 if (!copy)
1943 return NULL;
1944
1945 if (!deepcopy(&copy->groupindex, memo) ||
1946 !deepcopy(&copy->indexgroup, memo) ||
1947 !deepcopy(&copy->pattern, memo)) {
1948 Py_DECREF(copy);
1949 return NULL;
1950 }
1951
1952#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001953 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
1954 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001955#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001956}
1957
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001958static PyObject*
1959pattern_isliteral(PatternObject* self, PyObject* args)
1960{
1961 /* internal: return true if pattern consists of literal text only */
1962
1963 SRE_CODE* code;
1964 PyObject* isliteral;
1965
1966 if (!PyArg_ParseTuple(args, ":_isliteral"))
1967 return NULL;
1968
1969 code = PatternObject_GetCode(self);
1970
1971 if (code[0] == SRE_OP_INFO && code[2] & SRE_INFO_LITERAL)
1972 isliteral = Py_True;
1973 else
1974 isliteral = Py_False;
1975
1976 Py_INCREF(isliteral);
1977 return isliteral;
1978}
1979
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001980static PyMethodDef pattern_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00001981 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
1982 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
1983 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
1984 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
1985 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
1986 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
Fredrik Lundh562586e2000-10-03 20:43:34 +00001987 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001988 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
1989 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001990 {"_isliteral", (PyCFunction) pattern_isliteral, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001991 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00001992};
1993
1994static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001995pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001996{
1997 PyObject* res;
1998
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001999 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002000
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002001 if (res)
2002 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002003
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002004 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002005
2006 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002007 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002008 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002009 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002010 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002011
2012 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002013 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002014
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002015 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002016 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002017
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002018 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002019 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002020 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002021 }
2022
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002023 PyErr_SetString(PyExc_AttributeError, name);
2024 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002025}
2026
2027statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002028 PyObject_HEAD_INIT(NULL)
2029 0, "SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002030 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002031 (destructor)pattern_dealloc, /*tp_dealloc*/
2032 0, /*tp_print*/
2033 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002034};
2035
2036/* -------------------------------------------------------------------- */
2037/* match methods */
2038
2039static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002040match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002041{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002042 Py_XDECREF(self->regs);
2043 Py_XDECREF(self->string);
2044 Py_DECREF(self->pattern);
2045 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002046}
2047
2048static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002049match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002050{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002051 if (index < 0 || index >= self->groups) {
2052 /* raise IndexError if we were given a bad group number */
2053 PyErr_SetString(
2054 PyExc_IndexError,
2055 "no such group"
2056 );
2057 return NULL;
2058 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002059
Fredrik Lundh6f013982000-07-03 18:44:21 +00002060 index *= 2;
2061
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002062 if (self->string == Py_None || self->mark[index] < 0) {
2063 /* return default value if the string or group is undefined */
2064 Py_INCREF(def);
2065 return def;
2066 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 return PySequence_GetSlice(
2069 self->string, self->mark[index], self->mark[index+1]
2070 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002071}
2072
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002073static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002074match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002075{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002076 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002077
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002078 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002079 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002080
Fredrik Lundh6f013982000-07-03 18:44:21 +00002081 i = -1;
2082
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002083 if (self->pattern->groupindex) {
2084 index = PyObject_GetItem(self->pattern->groupindex, index);
2085 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002086 if (PyInt_Check(index))
2087 i = (int) PyInt_AS_LONG(index);
2088 Py_DECREF(index);
2089 } else
2090 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002091 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002092
2093 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002094}
2095
2096static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002097match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002098{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002099 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002100}
2101
2102static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002103match_expand(MatchObject* self, PyObject* args)
2104{
2105 PyObject* template;
2106 if (!PyArg_ParseTuple(args, "O:expand", &template))
2107 return NULL;
2108
2109 /* delegate to Python code */
2110 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002111 SRE_MODULE, "_expand",
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002112 Py_BuildValue("OOO", self->pattern, self, template)
2113 );
2114}
2115
2116static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002117match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002118{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002119 PyObject* result;
2120 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002121
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002122 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002123
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002124 switch (size) {
2125 case 0:
2126 result = match_getslice(self, Py_False, Py_None);
2127 break;
2128 case 1:
2129 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2130 break;
2131 default:
2132 /* fetch multiple items */
2133 result = PyTuple_New(size);
2134 if (!result)
2135 return NULL;
2136 for (i = 0; i < size; i++) {
2137 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002138 self, PyTuple_GET_ITEM(args, i), Py_None
2139 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002140 if (!item) {
2141 Py_DECREF(result);
2142 return NULL;
2143 }
2144 PyTuple_SET_ITEM(result, i, item);
2145 }
2146 break;
2147 }
2148 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002149}
2150
2151static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002152match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002153{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002154 PyObject* result;
2155 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002156
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002157 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002158 static char* kwlist[] = { "default", NULL };
2159 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002160 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002161
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002162 result = PyTuple_New(self->groups-1);
2163 if (!result)
2164 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002165
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002166 for (index = 1; index < self->groups; index++) {
2167 PyObject* item;
2168 item = match_getslice_by_index(self, index, def);
2169 if (!item) {
2170 Py_DECREF(result);
2171 return NULL;
2172 }
2173 PyTuple_SET_ITEM(result, index-1, item);
2174 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002175
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002176 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002177}
2178
2179static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002180match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002181{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002182 PyObject* result;
2183 PyObject* keys;
2184 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002185
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002186 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002187 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002188 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002189 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002190
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002191 result = PyDict_New();
2192 if (!result || !self->pattern->groupindex)
2193 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002195 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002196 if (!keys)
2197 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002198
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002199 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002200 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002201 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002202 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002203 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002204 if (!key)
2205 goto failed;
2206 value = match_getslice(self, key, def);
2207 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002208 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002209 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002210 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002211 status = PyDict_SetItem(result, key, value);
2212 Py_DECREF(value);
2213 if (status < 0)
2214 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002215 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002216
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002217 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002218
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002219 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002220
2221failed:
2222 Py_DECREF(keys);
2223 Py_DECREF(result);
2224 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002225}
2226
2227static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002228match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002229{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002230 int index;
2231
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002232 PyObject* index_ = Py_False; /* zero */
2233 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2234 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002235
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002236 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002237
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002238 if (index < 0 || index >= self->groups) {
2239 PyErr_SetString(
2240 PyExc_IndexError,
2241 "no such group"
2242 );
2243 return NULL;
2244 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002245
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002246 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002247 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002248}
2249
2250static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002251match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002252{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002253 int index;
2254
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002255 PyObject* index_ = Py_False; /* zero */
2256 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2257 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002258
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002259 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002260
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002261 if (index < 0 || index >= self->groups) {
2262 PyErr_SetString(
2263 PyExc_IndexError,
2264 "no such group"
2265 );
2266 return NULL;
2267 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002268
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002269 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002270 return Py_BuildValue("i", self->mark[index*2+1]);
2271}
2272
2273LOCAL(PyObject*)
2274_pair(int i1, int i2)
2275{
2276 PyObject* pair;
2277 PyObject* item;
2278
2279 pair = PyTuple_New(2);
2280 if (!pair)
2281 return NULL;
2282
2283 item = PyInt_FromLong(i1);
2284 if (!item)
2285 goto error;
2286 PyTuple_SET_ITEM(pair, 0, item);
2287
2288 item = PyInt_FromLong(i2);
2289 if (!item)
2290 goto error;
2291 PyTuple_SET_ITEM(pair, 1, item);
2292
2293 return pair;
2294
2295 error:
2296 Py_DECREF(pair);
2297 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002298}
2299
2300static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002301match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002302{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002303 int index;
2304
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002305 PyObject* index_ = Py_False; /* zero */
2306 if (!PyArg_ParseTuple(args, "|O:span", &index_))
2307 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002308
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002309 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002310
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002311 if (index < 0 || index >= self->groups) {
2312 PyErr_SetString(
2313 PyExc_IndexError,
2314 "no such group"
2315 );
2316 return NULL;
2317 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002318
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002319 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002320 return _pair(self->mark[index*2], self->mark[index*2+1]);
2321}
2322
2323static PyObject*
2324match_regs(MatchObject* self)
2325{
2326 PyObject* regs;
2327 PyObject* item;
2328 int index;
2329
2330 regs = PyTuple_New(self->groups);
2331 if (!regs)
2332 return NULL;
2333
2334 for (index = 0; index < self->groups; index++) {
2335 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2336 if (!item) {
2337 Py_DECREF(regs);
2338 return NULL;
2339 }
2340 PyTuple_SET_ITEM(regs, index, item);
2341 }
2342
2343 Py_INCREF(regs);
2344 self->regs = regs;
2345
2346 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002347}
2348
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002349static PyObject*
2350match_copy(MatchObject* self, PyObject* args)
2351{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002352#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002353 MatchObject* copy;
2354 int slots, offset;
2355
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002356 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2357 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002358
2359 slots = 2 * (self->pattern->groups+1);
2360
2361 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2362 if (!copy)
2363 return NULL;
2364
2365 /* this value a constant, but any compiler should be able to
2366 figure that out all by itself */
2367 offset = offsetof(MatchObject, string);
2368
2369 Py_XINCREF(self->pattern);
2370 Py_XINCREF(self->string);
2371 Py_XINCREF(self->regs);
2372
2373 memcpy((char*) copy + offset, (char*) self + offset,
2374 sizeof(MatchObject) + slots * sizeof(int) - offset);
2375
2376 return (PyObject*) copy;
2377#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002378 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002379 return NULL;
2380#endif
2381}
2382
2383static PyObject*
2384match_deepcopy(MatchObject* self, PyObject* args)
2385{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002386#ifdef USE_BUILTIN_COPY
2387 MatchObject* copy;
2388
2389 PyObject* memo;
2390 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2391 return NULL;
2392
2393 copy = (MatchObject*) match_copy(self, Py_None);
2394 if (!copy)
2395 return NULL;
2396
2397 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2398 !deepcopy(&copy->string, memo) ||
2399 !deepcopy(&copy->regs, memo)) {
2400 Py_DECREF(copy);
2401 return NULL;
2402 }
2403
2404#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002405 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2406 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002407#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002408}
2409
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002410static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002411 {"group", (PyCFunction) match_group, METH_VARARGS},
2412 {"start", (PyCFunction) match_start, METH_VARARGS},
2413 {"end", (PyCFunction) match_end, METH_VARARGS},
2414 {"span", (PyCFunction) match_span, METH_VARARGS},
2415 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
2416 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
2417 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002418 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
2419 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002420 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002421};
2422
2423static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002424match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002425{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002426 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002427
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002428 res = Py_FindMethod(match_methods, (PyObject*) self, name);
2429 if (res)
2430 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002431
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002432 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002433
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002434 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002435 if (self->lastindex >= 0)
2436 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00002437 Py_INCREF(Py_None);
2438 return Py_None;
2439 }
2440
2441 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002442 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00002443 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00002444 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00002445 );
2446 if (result)
2447 return result;
2448 PyErr_Clear();
2449 }
2450 Py_INCREF(Py_None);
2451 return Py_None;
2452 }
2453
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002454 if (!strcmp(name, "string")) {
2455 if (self->string) {
2456 Py_INCREF(self->string);
2457 return self->string;
2458 } else {
2459 Py_INCREF(Py_None);
2460 return Py_None;
2461 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002462 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002463
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002464 if (!strcmp(name, "regs")) {
2465 if (self->regs) {
2466 Py_INCREF(self->regs);
2467 return self->regs;
2468 } else
2469 return match_regs(self);
2470 }
2471
2472 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002473 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002474 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002475 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002476
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002477 if (!strcmp(name, "pos"))
2478 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002479
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002480 if (!strcmp(name, "endpos"))
2481 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00002482
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002483 PyErr_SetString(PyExc_AttributeError, name);
2484 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002485}
2486
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2489
2490statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002491 PyObject_HEAD_INIT(NULL)
2492 0, "SRE_Match",
2493 sizeof(MatchObject), sizeof(int),
2494 (destructor)match_dealloc, /*tp_dealloc*/
2495 0, /*tp_print*/
2496 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002497};
2498
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002499/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002500/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002501
2502static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002503scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002504{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002505 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002506 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002507 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002508}
2509
2510static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002511scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002512{
2513 SRE_STATE* state = &self->state;
2514 PyObject* match;
2515 int status;
2516
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002517 state_reset(state);
2518
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002519 state->ptr = state->start;
2520
2521 if (state->charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00002522 status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002523 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002524#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00002525 status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002526#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002527 }
2528
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002529 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002530 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002531
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002532 if (status == 0 || state->ptr == state->start)
2533 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002534 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002535 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002536
2537 return match;
2538}
2539
2540
2541static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002542scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002543{
2544 SRE_STATE* state = &self->state;
2545 PyObject* match;
2546 int status;
2547
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002548 state_reset(state);
2549
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002550 state->ptr = state->start;
2551
2552 if (state->charsize == 1) {
2553 status = sre_search(state, PatternObject_GetCode(self->pattern));
2554 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002555#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002556 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002557#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002558 }
2559
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002560 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002561 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002562
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002563 if (status == 0 || state->ptr == state->start)
2564 state->start = (void*) ((char*) state->ptr + state->charsize);
2565 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002566 state->start = state->ptr;
2567
2568 return match;
2569}
2570
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002571static PyMethodDef scanner_methods[] = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002572 {"match", (PyCFunction) scanner_match, 0},
2573 {"search", (PyCFunction) scanner_search, 0},
2574 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002575};
2576
2577static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002578scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002579{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002580 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002581
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002582 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
2583 if (res)
2584 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002585
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002586 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002587
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002588 /* attributes */
2589 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002590 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002591 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002592 }
2593
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002594 PyErr_SetString(PyExc_AttributeError, name);
2595 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002596}
2597
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002598statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002599 PyObject_HEAD_INIT(NULL)
2600 0, "SRE_Scanner",
2601 sizeof(ScannerObject), 0,
2602 (destructor)scanner_dealloc, /*tp_dealloc*/
2603 0, /*tp_print*/
2604 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002605};
2606
Guido van Rossumb700df92000-03-31 14:59:30 +00002607static PyMethodDef _functions[] = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002608 {"compile", _compile, 1},
2609 {"getcodesize", sre_codesize, 1},
2610 {"getlower", sre_getlower, 1},
2611 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002612};
2613
Tim Peters5687ffe2001-02-28 16:44:18 +00002614DL_EXPORT(void)
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00002615init_sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002616{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002617 PyObject* m;
2618 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00002619 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002620
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002621 /* Patch object types */
2622 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002623 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002624
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00002625 m = Py_InitModule("_" SRE_MODULE, _functions);
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002626 d = PyModule_GetDict(m);
2627
2628 PyDict_SetItemString(
Barry Warsaw214a0b132001-08-16 20:33:48 +00002629 d, "MAGIC", (x = (PyObject*) PyInt_FromLong(SRE_MAGIC))
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002630 );
Barry Warsaw214a0b132001-08-16 20:33:48 +00002631 Py_XDECREF(x);
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002632
2633 PyDict_SetItemString(
Barry Warsaw214a0b132001-08-16 20:33:48 +00002634 d, "copyright", (x = (PyObject*)PyString_FromString(copyright))
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002635 );
Barry Warsaw214a0b132001-08-16 20:33:48 +00002636 Py_XDECREF(x);
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002637
Guido van Rossumb700df92000-03-31 14:59:30 +00002638}
2639
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002640#endif /* !defined(SRE_RECURSIVE) */