blob: 537dc298f72771ed217d3c9c043c6f1d5fc7a0bc [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
9 * 2000-06-30 fl added fast search optimization
10 * 2000-06-30 fl added assert (lookahead) primitives, etc
11 * 2000-07-02 fl added charset optimizations, etc
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000012 * 2000-07-03 fl store code in pattern object, lookbehind, etc
13 * 2000-07-08 fl added regs attribute
Fredrik Lundhebc37b22000-10-28 19:30:41 +000014 * 2000-07-21 fl reset lastindex in scanner methods
15 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000016 * 2000-08-03 fl added recursion limit
17 * 2000-08-07 fl use PyOS_CheckStack() if available
18 * 2000-08-08 fl changed findall to return empty strings instead of None
19 * 2000-08-27 fl properly propagate memory errors
20 * 2000-09-02 fl return -1 instead of None for start/end/span
21 * 2000-09-20 fl added expand method
22 * 2000-09-21 fl don't use the buffer interface for unicode strings
Fredrik Lundh562586e2000-10-03 20:43:34 +000023 * 2000-10-03 fl fixed assert_not primitive; support keyword arguments
Fredrik Lundhebc37b22000-10-28 19:30:41 +000024 * 2000-10-24 fl really fixed assert_not; reset groups in findall
Fredrik Lundh770617b2001-01-14 15:06:11 +000025 * 2000-12-21 fl fixed memory leak in groupdict
26 * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000027 * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
Fredrik Lundh6f5cba62001-01-16 07:05:29 +000028 * 2001-01-16 fl fixed memory leak in pattern destructor
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000029 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000030 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000031 * 2001-04-28 fl added __copy__ methods (work in progress)
32 * 2001-05-14 fl fixes for 1.5.2
Fredrik Lundhf71ae462001-07-02 17:04:48 +000033 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh59b68652001-09-18 20:55:24 +000034 * 2001-09-18 fl added _getliteral helper
Fredrik Lundh397a6542001-10-18 19:30:16 +000035 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Guido van Rossumb700df92000-03-31 14:59:30 +000036 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000037 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000038 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000039 * This version of the SRE library can be redistributed under CNRI's
40 * Python 1.6 license. For any other use, please contact Secret Labs
41 * AB (info@pythonware.com).
42 *
Guido van Rossumb700df92000-03-31 14:59:30 +000043 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000044 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000045 * other compatibility work.
46 */
47
48#ifndef SRE_RECURSIVE
49
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000050static char copyright[] =
Fredrik Lundh397a6542001-10-18 19:30:16 +000051 " SRE 2.2.0 Copyright (c) 1997-2001 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000052
53#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000054#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000055
56#include "sre.h"
57
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000058#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000059
Fredrik Lundh436c3d582000-06-29 08:58:44 +000060/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000061#if !defined(SRE_MODULE)
62#define SRE_MODULE "sre"
63#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000064
Guido van Rossumb700df92000-03-31 14:59:30 +000065/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000067
Martin v. Löwis339d0f72001-08-17 18:39:25 +000068#if PY_VERSION_HEX >= 0x01060000 && defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000069/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000070#define HAVE_UNICODE
71#endif
72
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000073/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000074/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000075
Fredrik Lundh33accc12000-08-27 20:59:47 +000076/* prevent run-away recursion (bad patterns on long strings) */
77
Fredrik Lundh18c2aa22000-08-07 17:33:38 +000078#if !defined(USE_STACKCHECK)
Fredrik Lundh33accc12000-08-27 20:59:47 +000079#if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
80/* require smaller recursion limit for a number of 64-bit platforms:
81 Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
82/* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
83#define USE_RECURSION_LIMIT 7500
84#else
85#define USE_RECURSION_LIMIT 10000
86#endif
Fredrik Lundh18c2aa22000-08-07 17:33:38 +000087#endif
Fredrik Lundh96ab4652000-08-03 16:29:50 +000088
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000090#define USE_FAST_SEARCH
91
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000092/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000093#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000094
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000095/* enables copy/deepcopy handling (work in progress) */
96#undef USE_BUILTIN_COPY
97
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000098#if PY_VERSION_HEX < 0x01060000
99#define PyObject_DEL(op) PyMem_DEL((op))
100#endif
101
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000102/* -------------------------------------------------------------------- */
103
Fredrik Lundh80946112000-06-29 18:03:25 +0000104#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +0000105#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +0000106#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +0000107/* fastest possible local call under MSVC */
108#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000109#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000110#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000111#else
112#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +0000113#endif
114
115/* error codes */
116#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000117#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000118#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +0000119#define SRE_ERROR_MEMORY -9 /* out of memory */
120
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000121#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000122#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000123#else
124#define TRACE(v)
125#endif
126
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000127/* -------------------------------------------------------------------- */
128/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000129
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000130/* default character predicates (run sre_chars.py to regenerate tables) */
131
132#define SRE_DIGIT_MASK 1
133#define SRE_SPACE_MASK 2
134#define SRE_LINEBREAK_MASK 4
135#define SRE_ALNUM_MASK 8
136#define SRE_WORD_MASK 16
137
Fredrik Lundh21009b92001-09-18 18:47:09 +0000138/* FIXME: this assumes ASCII. create tables in init_sre() instead */
139
/* Per-character flag bytes for codes 0..127, one row per 16 codes.  Each
   entry is an OR of the SRE_*_MASK bits defined above (DIGIT=1, SPACE=2,
   LINEBREAK=4, ALNUM=8, WORD=16). */
static char sre_char_info[128] = {
    /*   0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /*  16 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  32 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  48 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /*  64 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /*  96 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};
147
/* Lowercase mapping for codes 0..127, one row per 16 codes.  Entries
   65..90 hold 97..122; every other entry holds its own index. */
static char sre_char_lower[128] = {
    /*   0 */   0,   1,   2,   3,   4,   5,   6,   7,
                8,   9,  10,  11,  12,  13,  14,  15,
    /*  16 */  16,  17,  18,  19,  20,  21,  22,  23,
               24,  25,  26,  27,  28,  29,  30,  31,
    /*  32 */  32,  33,  34,  35,  36,  37,  38,  39,
               40,  41,  42,  43,  44,  45,  46,  47,
    /*  48 */  48,  49,  50,  51,  52,  53,  54,  55,
               56,  57,  58,  59,  60,  61,  62,  63,
    /*  64 */  64,  97,  98,  99, 100, 101, 102, 103,
              104, 105, 106, 107, 108, 109, 110, 111,
    /*  80 */ 112, 113, 114, 115, 116, 117, 118, 119,
              120, 121, 122,  91,  92,  93,  94,  95,
    /*  96 */  96,  97,  98,  99, 100, 101, 102, 103,
              104, 105, 106, 107, 108, 109, 110, 111,
    /* 112 */ 112, 113, 114, 115, 116, 117, 118, 119,
              120, 121, 122, 123, 124, 125, 126, 127
};
157
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000158#define SRE_IS_DIGIT(ch)\
159 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
160#define SRE_IS_SPACE(ch)\
161 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
162#define SRE_IS_LINEBREAK(ch)\
163 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
164#define SRE_IS_ALNUM(ch)\
165 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
166#define SRE_IS_WORD(ch)\
167 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000168
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000169static unsigned int sre_lower(unsigned int ch)
170{
171 return ((ch) < 128 ? sre_char_lower[ch] : ch);
172}
173
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000174/* locale-specific character predicates */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000175
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000176#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
177#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
178#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
179#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
180#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
181
/* Lowercase a character code with the C library's tolower().  Only codes
   below 256 are handed to tolower(); anything larger is returned
   unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int) tolower((int) ch);
}
186
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000187/* unicode-specific character predicates */
188
189#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000190
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000191#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
192#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
193#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000194#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000195#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000196
197static unsigned int sre_lower_unicode(unsigned int ch)
198{
199 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
200}
201
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000202#endif
203
Guido van Rossumb700df92000-03-31 14:59:30 +0000204LOCAL(int)
205sre_category(SRE_CODE category, unsigned int ch)
206{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000207 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000208
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000209 case SRE_CATEGORY_DIGIT:
210 return SRE_IS_DIGIT(ch);
211 case SRE_CATEGORY_NOT_DIGIT:
212 return !SRE_IS_DIGIT(ch);
213 case SRE_CATEGORY_SPACE:
214 return SRE_IS_SPACE(ch);
215 case SRE_CATEGORY_NOT_SPACE:
216 return !SRE_IS_SPACE(ch);
217 case SRE_CATEGORY_WORD:
218 return SRE_IS_WORD(ch);
219 case SRE_CATEGORY_NOT_WORD:
220 return !SRE_IS_WORD(ch);
221 case SRE_CATEGORY_LINEBREAK:
222 return SRE_IS_LINEBREAK(ch);
223 case SRE_CATEGORY_NOT_LINEBREAK:
224 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000225
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000226 case SRE_CATEGORY_LOC_WORD:
227 return SRE_LOC_IS_WORD(ch);
228 case SRE_CATEGORY_LOC_NOT_WORD:
229 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000230
231#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000232 case SRE_CATEGORY_UNI_DIGIT:
233 return SRE_UNI_IS_DIGIT(ch);
234 case SRE_CATEGORY_UNI_NOT_DIGIT:
235 return !SRE_UNI_IS_DIGIT(ch);
236 case SRE_CATEGORY_UNI_SPACE:
237 return SRE_UNI_IS_SPACE(ch);
238 case SRE_CATEGORY_UNI_NOT_SPACE:
239 return !SRE_UNI_IS_SPACE(ch);
240 case SRE_CATEGORY_UNI_WORD:
241 return SRE_UNI_IS_WORD(ch);
242 case SRE_CATEGORY_UNI_NOT_WORD:
243 return !SRE_UNI_IS_WORD(ch);
244 case SRE_CATEGORY_UNI_LINEBREAK:
245 return SRE_UNI_IS_LINEBREAK(ch);
246 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
247 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000248#else
249 case SRE_CATEGORY_UNI_DIGIT:
250 return SRE_IS_DIGIT(ch);
251 case SRE_CATEGORY_UNI_NOT_DIGIT:
252 return !SRE_IS_DIGIT(ch);
253 case SRE_CATEGORY_UNI_SPACE:
254 return SRE_IS_SPACE(ch);
255 case SRE_CATEGORY_UNI_NOT_SPACE:
256 return !SRE_IS_SPACE(ch);
257 case SRE_CATEGORY_UNI_WORD:
258 return SRE_LOC_IS_WORD(ch);
259 case SRE_CATEGORY_UNI_NOT_WORD:
260 return !SRE_LOC_IS_WORD(ch);
261 case SRE_CATEGORY_UNI_LINEBREAK:
262 return SRE_IS_LINEBREAK(ch);
263 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
264 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000265#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000266 }
267 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000268}
269
270/* helpers */
271
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272static void
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000273mark_fini(SRE_STATE* state)
274{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000275 if (state->mark_stack) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000276 free(state->mark_stack);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000277 state->mark_stack = NULL;
278 }
279 state->mark_stack_size = state->mark_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000280}
281
282static int
283mark_save(SRE_STATE* state, int lo, int hi)
284{
285 void* stack;
286 int size;
287 int minsize, newsize;
288
289 if (hi <= lo)
290 return 0;
291
292 size = (hi - lo) + 1;
293
294 newsize = state->mark_stack_size;
295 minsize = state->mark_stack_base + size;
296
297 if (newsize < minsize) {
298 /* create new stack */
299 if (!newsize) {
300 newsize = 512;
301 if (newsize < minsize)
302 newsize = minsize;
303 TRACE(("allocate stack %d\n", newsize));
304 stack = malloc(sizeof(void*) * newsize);
305 } else {
306 /* grow the stack */
307 while (newsize < minsize)
308 newsize += newsize;
309 TRACE(("grow stack to %d\n", newsize));
310 stack = realloc(state->mark_stack, sizeof(void*) * newsize);
311 }
312 if (!stack) {
313 mark_fini(state);
314 return SRE_ERROR_MEMORY;
315 }
316 state->mark_stack = stack;
317 state->mark_stack_size = newsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000318 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000319
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000320 TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000321
322 memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
323 size * sizeof(void*));
324
325 state->mark_stack_base += size;
326
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000327 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000328}
329
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000330static int
331mark_restore(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000332{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000333 int size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000334
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000335 if (hi <= lo)
336 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000337
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000338 size = (hi - lo) + 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000339
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000340 state->mark_stack_base -= size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000341
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000342 TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
Guido van Rossumb700df92000-03-31 14:59:30 +0000343
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000344 memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
345 size * sizeof(void*));
Guido van Rossumb700df92000-03-31 14:59:30 +0000346
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000347 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000348}
349
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000350/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000351
352#define SRE_CHAR unsigned char
353#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000354#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000355#define SRE_CHARSET sre_charset
356#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000357#define SRE_MATCH sre_match
358#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000359
360#if defined(HAVE_UNICODE)
361
Guido van Rossumb700df92000-03-31 14:59:30 +0000362#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000363#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000364#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000365
Guido van Rossumb700df92000-03-31 14:59:30 +0000366#undef SRE_SEARCH
367#undef SRE_MATCH
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000368#undef SRE_INFO
369#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000370#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000371#undef SRE_AT
372#undef SRE_CHAR
373
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000374/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000375
376#define SRE_CHAR Py_UNICODE
377#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000378#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000379#define SRE_CHARSET sre_ucharset
380#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000381#define SRE_MATCH sre_umatch
382#define SRE_SEARCH sre_usearch
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000383#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000384
385#endif /* SRE_RECURSIVE */
386
387/* -------------------------------------------------------------------- */
388/* String matching engine */
389
390/* the following section is compiled twice, with different character
391 settings */
392
393LOCAL(int)
394SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
395{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000396 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000397
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000398 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000399
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000400 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000401
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000402 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000403 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000404 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000405
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000406 case SRE_AT_BEGINNING_LINE:
407 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000408 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000409
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000410 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000411 return (((void*) (ptr+1) == state->end &&
412 SRE_IS_LINEBREAK((int) ptr[0])) ||
413 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000414
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000415 case SRE_AT_END_LINE:
416 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000417 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000418
Fredrik Lundh770617b2001-01-14 15:06:11 +0000419 case SRE_AT_END_STRING:
420 return ((void*) ptr == state->end);
421
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000422 case SRE_AT_BOUNDARY:
423 if (state->beginning == state->end)
424 return 0;
425 that = ((void*) ptr > state->beginning) ?
426 SRE_IS_WORD((int) ptr[-1]) : 0;
427 this = ((void*) ptr < state->end) ?
428 SRE_IS_WORD((int) ptr[0]) : 0;
429 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000430
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000431 case SRE_AT_NON_BOUNDARY:
432 if (state->beginning == state->end)
433 return 0;
434 that = ((void*) ptr > state->beginning) ?
435 SRE_IS_WORD((int) ptr[-1]) : 0;
436 this = ((void*) ptr < state->end) ?
437 SRE_IS_WORD((int) ptr[0]) : 0;
438 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000439
440 case SRE_AT_LOC_BOUNDARY:
441 if (state->beginning == state->end)
442 return 0;
443 that = ((void*) ptr > state->beginning) ?
444 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
445 this = ((void*) ptr < state->end) ?
446 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
447 return this != that;
448
449 case SRE_AT_LOC_NON_BOUNDARY:
450 if (state->beginning == state->end)
451 return 0;
452 that = ((void*) ptr > state->beginning) ?
453 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
454 this = ((void*) ptr < state->end) ?
455 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
456 return this == that;
457
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000458#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000459 case SRE_AT_UNI_BOUNDARY:
460 if (state->beginning == state->end)
461 return 0;
462 that = ((void*) ptr > state->beginning) ?
463 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
464 this = ((void*) ptr < state->end) ?
465 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
466 return this != that;
467
468 case SRE_AT_UNI_NON_BOUNDARY:
469 if (state->beginning == state->end)
470 return 0;
471 that = ((void*) ptr > state->beginning) ?
472 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
473 this = ((void*) ptr < state->end) ?
474 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
475 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000476#endif
477
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000478 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000479
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000480 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000481}
482
483LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000484SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000485{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000486 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000487
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000488 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000489
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000490 for (;;) {
491 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000492
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000493 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000494 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000495 if (ch == set[0])
496 return ok;
497 set++;
498 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000499
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000500 case SRE_OP_RANGE:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000501 /* <RANGE> <lower> <upper> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000502 if (set[0] <= ch && ch <= set[1])
503 return ok;
504 set += 2;
505 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000506
Fredrik Lundh3562f112000-07-02 12:00:07 +0000507 case SRE_OP_CHARSET:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000508 /* <CHARSET> <bitmap> (16 bits per code word) */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000509 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
510 return ok;
511 set += 16;
512 break;
513
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000514 case SRE_OP_BIGCHARSET:
515 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
516 {
517 int count, block;
518 count = *(set++);
519 block = ((unsigned char*)set)[ch >> 8];
520 set += 128;
521 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
522 return ok;
523 set += count*16;
524 break;
525 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000526
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000527 case SRE_OP_CATEGORY:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000528 /* <CATEGORY> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000529 if (sre_category(set[0], (int) ch))
530 return ok;
531 set += 1;
532 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000533
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000534 case SRE_OP_NEGATE:
535 ok = !ok;
536 break;
537
538 case SRE_OP_FAILURE:
539 return !ok;
540
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000541 default:
542 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000543 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000544 return 0;
545 }
546 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000547}
548
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000549LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level);
550
551LOCAL(int)
552SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
553{
554 SRE_CODE chr;
555 SRE_CHAR* ptr = state->ptr;
556 SRE_CHAR* end = state->end;
557 int i;
558
559 /* adjust end */
560 if (maxcount < end - ptr && maxcount != 65535)
561 end = ptr + maxcount;
562
563 switch (pattern[0]) {
564
565 case SRE_OP_ANY:
566 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000567 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000568 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
569 ptr++;
570 break;
571
572 case SRE_OP_ANY_ALL:
573 /* repeated dot wildcare. skip to the end of the target
574 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000575 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000576 ptr = end;
577 break;
578
579 case SRE_OP_LITERAL:
580 /* repeated literal */
581 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000582 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000583 while (ptr < end && (SRE_CODE) *ptr == chr)
584 ptr++;
585 break;
586
587 case SRE_OP_LITERAL_IGNORE:
588 /* repeated literal */
589 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000590 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000591 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
592 ptr++;
593 break;
594
595 case SRE_OP_NOT_LITERAL:
596 /* repeated non-literal */
597 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000598 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000599 while (ptr < end && (SRE_CODE) *ptr != chr)
600 ptr++;
601 break;
602
603 case SRE_OP_NOT_LITERAL_IGNORE:
604 /* repeated non-literal */
605 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000606 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000607 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
608 ptr++;
609 break;
610
611 case SRE_OP_IN:
612 /* repeated set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000613 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
614 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000615 ptr++;
616 break;
617
618 default:
619 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000620 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000621 while ((SRE_CHAR*) state->ptr < end) {
622 i = SRE_MATCH(state, pattern, level);
623 if (i < 0)
624 return i;
625 if (!i)
626 break;
627 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000628 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
629 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000630 return (SRE_CHAR*) state->ptr - ptr;
631 }
632
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000633 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000634 return ptr - (SRE_CHAR*) state->ptr;
635}
636
Fredrik Lundh33accc12000-08-27 20:59:47 +0000637#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000638LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000639SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
640{
641 /* check if an SRE_OP_INFO block matches at the current position.
642 returns the number of SRE_CODE objects to skip if successful, 0
643 if no match */
644
645 SRE_CHAR* end = state->end;
646 SRE_CHAR* ptr = state->ptr;
647 int i;
648
649 /* check minimal length */
650 if (pattern[3] && (end - ptr) < pattern[3])
651 return 0;
652
653 /* check known prefix */
654 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
655 /* <length> <skip> <prefix data> <overlap data> */
656 for (i = 0; i < pattern[5]; i++)
657 if ((SRE_CODE) ptr[i] != pattern[7 + i])
658 return 0;
659 return pattern[0] + 2 * pattern[6];
660 }
661 return pattern[0];
662}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000663#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000664
665LOCAL(int)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000666SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
Guido van Rossumb700df92000-03-31 14:59:30 +0000667{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000668 /* check if string matches the given pattern. returns <0 for
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000669 error, 0 for failure, and 1 for success */
Guido van Rossumb700df92000-03-31 14:59:30 +0000670
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000671 SRE_CHAR* end = state->end;
672 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000673 int i, count;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000674 SRE_REPEAT* rp;
675 int lastmark;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000676 SRE_CODE chr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000677
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000678 SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000679
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000680 TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000681
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000682#if defined(USE_STACKCHECK)
Fredrik Lundh58100642000-08-09 09:14:35 +0000683 if (level % 10 == 0 && PyOS_CheckStack())
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000684 return SRE_ERROR_RECURSION_LIMIT;
685#endif
686
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000687#if defined(USE_RECURSION_LIMIT)
688 if (level > USE_RECURSION_LIMIT)
689 return SRE_ERROR_RECURSION_LIMIT;
690#endif
691
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000692 if (pattern[0] == SRE_OP_INFO) {
693 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000694 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000695 if (pattern[3] && (end - ptr) < pattern[3]) {
696 TRACE(("reject (got %d chars, need %d)\n",
697 (end - ptr), pattern[3]));
698 return 0;
699 }
700 pattern += pattern[1] + 1;
701 }
702
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000703 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000704
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000705 switch (*pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000706
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000707 case SRE_OP_FAILURE:
708 /* immediate failure */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000709 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000710 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000711
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000712 case SRE_OP_SUCCESS:
713 /* end of pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000714 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000715 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000716 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000717
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000718 case SRE_OP_AT:
719 /* match at given position */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000720 /* <AT> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000721 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000722 if (!SRE_AT(state, ptr, *pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000723 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000724 pattern++;
725 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000726
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000727 case SRE_OP_CATEGORY:
728 /* match at given category */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000729 /* <CATEGORY> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000730 TRACE(("|%p|%p|CATEGORY %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000731 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000732 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000733 pattern++;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000734 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000735 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000736
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000737 case SRE_OP_LITERAL:
738 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000739 /* <LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000740 TRACE(("|%p|%p|LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000741 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000742 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000743 pattern++;
744 ptr++;
745 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000746
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000747 case SRE_OP_NOT_LITERAL:
748 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000749 /* <NOT_LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000750 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000751 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000752 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000753 pattern++;
754 ptr++;
755 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000756
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000757 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000758 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000759 /* <ANY> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000760 TRACE(("|%p|%p|ANY\n", pattern, ptr));
Fredrik Lundhe1869832000-08-01 22:47:49 +0000761 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
762 return 0;
763 ptr++;
764 break;
765
766 case SRE_OP_ANY_ALL:
767 /* match anything */
768 /* <ANY_ALL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000769 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000770 if (ptr >= end)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000771 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000772 ptr++;
773 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000774
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000775 case SRE_OP_IN:
776 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000777 /* <IN> <skip> <set> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000778 TRACE(("|%p|%p|IN\n", pattern, ptr));
779 if (ptr >= end || !SRE_CHARSET(pattern + 1, *ptr))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000780 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000781 pattern += pattern[0];
782 ptr++;
783 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000784
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000785 case SRE_OP_GROUPREF:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000786 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000787 TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000788 i = pattern[0];
789 {
790 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
791 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
792 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000793 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000794 while (p < e) {
795 if (ptr >= end || *ptr != *p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000796 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000797 p++; ptr++;
798 }
799 }
800 pattern++;
801 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000802
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000803 case SRE_OP_GROUPREF_IGNORE:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000804 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000805 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000806 i = pattern[0];
807 {
808 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
809 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
810 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000811 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000812 while (p < e) {
813 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000814 state->lower(*ptr) != state->lower(*p))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000815 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000816 p++; ptr++;
817 }
818 }
819 pattern++;
820 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000821
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000822 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000823 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000824 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000825 state->lower(*ptr) != state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000826 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000827 pattern++;
828 ptr++;
829 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000830
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000831 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000832 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000833 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000834 state->lower(*ptr) == state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000835 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000836 pattern++;
837 ptr++;
838 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000839
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000840 case SRE_OP_IN_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000841 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000842 if (ptr >= end
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000843 || !SRE_CHARSET(pattern + 1, (SRE_CODE) state->lower(*ptr)))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000844 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000845 pattern += pattern[0];
846 ptr++;
847 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000848
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000849 case SRE_OP_MARK:
850 /* set mark */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000851 /* <MARK> <gid> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000852 TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000853 i = pattern[0];
854 if (i & 1)
855 state->lastindex = i/2 + 1;
856 if (i > state->lastmark)
857 state->lastmark = i;
858 state->mark[i] = ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000859 pattern++;
860 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000861
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000862 case SRE_OP_JUMP:
863 case SRE_OP_INFO:
864 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000865 /* <JUMP> <offset> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000866 TRACE(("|%p|%p|JUMP %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000867 pattern += pattern[0];
868 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000869
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 case SRE_OP_ASSERT:
871 /* assert subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000872 /* <ASSERT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000873 TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000874 state->ptr = ptr - pattern[1];
Fredrik Lundh6f013982000-07-03 18:44:21 +0000875 if (state->ptr < state->beginning)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000876 return 0;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000877 i = SRE_MATCH(state, pattern + 2, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000878 if (i <= 0)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000879 return i;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000880 pattern += pattern[0];
881 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000882
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000883 case SRE_OP_ASSERT_NOT:
884 /* assert not subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000885 /* <ASSERT_NOT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000886 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000887 state->ptr = ptr - pattern[1];
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000888 if (state->ptr >= state->beginning) {
889 i = SRE_MATCH(state, pattern + 2, level + 1);
890 if (i < 0)
891 return i;
892 if (i)
893 return 0;
894 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000895 pattern += pattern[0];
896 break;
897
898 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000899 /* alternation */
900 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000901 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000902 lastmark = state->lastmark;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000903 for (; pattern[0]; pattern += pattern[0]) {
904 if (pattern[1] == SRE_OP_LITERAL &&
905 (ptr >= end || (SRE_CODE) *ptr != pattern[2]))
906 continue;
907 if (pattern[1] == SRE_OP_IN &&
908 (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr)))
909 continue;
910 state->ptr = ptr;
911 i = SRE_MATCH(state, pattern + 1, level + 1);
912 if (i)
913 return i;
914 if (state->lastmark > lastmark) {
915 memset(
916 state->mark + lastmark + 1, 0,
917 (state->lastmark - lastmark) * sizeof(void*)
918 );
919 state->lastmark = lastmark;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000920 }
921 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000922 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000923
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000924 case SRE_OP_REPEAT_ONE:
925 /* match repeated sequence (maximizing regexp) */
926
927 /* this operator only works if the repeated item is
928 exactly one character wide, and we're not already
929 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +0000930 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000931
932 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
933
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000934 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000935 pattern[1], pattern[2]));
936
Fredrik Lundhe1869832000-08-01 22:47:49 +0000937 if (ptr + pattern[1] > end)
938 return 0; /* cannot match */
939
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000940 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000941
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000942 count = SRE_COUNT(state, pattern + 3, pattern[2], level + 1);
943 if (count < 0)
944 return count;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000945
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000946 ptr += count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000947
948 /* when we arrive here, count contains the number of
949 matches, and ptr points to the tail of the target
950 string. check if the rest of the pattern matches,
951 and backtrack if not. */
952
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000953 if (count < (int) pattern[1])
954 return 0;
955
956 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
957 /* tail is empty. we're finished */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000958 state->ptr = ptr;
959 return 1;
960
961 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
962 /* tail starts with a literal. skip positions where
963 the rest of the pattern cannot possibly match */
Fredrik Lundhe1869832000-08-01 22:47:49 +0000964 chr = pattern[pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000965 for (;;) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000966 while (count >= (int) pattern[1] &&
967 (ptr >= end || *ptr != chr)) {
968 ptr--;
969 count--;
970 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000971 if (count < (int) pattern[1])
972 break;
973 state->ptr = ptr;
974 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000975 if (i)
Fredrik Lundh33accc12000-08-27 20:59:47 +0000976 return i;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000977 ptr--;
978 count--;
979 }
980
981 } else {
982 /* general case */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000983 lastmark = state->lastmark;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000984 while (count >= (int) pattern[1]) {
985 state->ptr = ptr;
986 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000987 if (i)
Fredrik Lundh33accc12000-08-27 20:59:47 +0000988 return i;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000989 ptr--;
990 count--;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000991 if (state->lastmark > lastmark) {
992 memset(
993 state->mark + lastmark + 1, 0,
994 (state->lastmark - lastmark) * sizeof(void*)
995 );
996 state->lastmark = lastmark;
997 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000998 }
999 }
1000 return 0;
1001
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001002 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001003 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001004 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001005 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001006 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001007 pattern[1], pattern[2]));
1008
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001009 rep.count = -1;
1010 rep.pattern = pattern;
1011
1012 /* install new repeat context */
1013 rep.prev = state->repeat;
1014 state->repeat = &rep;
1015
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001016 state->ptr = ptr;
1017 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001018
1019 state->repeat = rep.prev;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001020
1021 return i;
1022
1023 case SRE_OP_MAX_UNTIL:
1024 /* maximizing repeat */
1025 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1026
1027 /* FIXME: we probably need to deal with zero-width
1028 matches in here... */
1029
1030 rp = state->repeat;
1031 if (!rp)
1032 return SRE_ERROR_STATE;
1033
1034 state->ptr = ptr;
1035
1036 count = rp->count + 1;
1037
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001038 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001039
1040 if (count < rp->pattern[1]) {
1041 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001042 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001043 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001045 if (i)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001046 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001047 rp->count = count - 1;
1048 state->ptr = ptr;
1049 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001050 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001051
1052 if (count < rp->pattern[2] || rp->pattern[2] == 65535) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001053 /* we may have enough matches, but if we can
1054 match another item, do so */
1055 rp->count = count;
1056 lastmark = state->lastmark;
Fredrik Lundh33accc12000-08-27 20:59:47 +00001057 i = mark_save(state, 0, lastmark);
1058 if (i < 0)
1059 return i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001060 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001061 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001062 if (i)
1063 return i;
Fredrik Lundh33accc12000-08-27 20:59:47 +00001064 i = mark_restore(state, 0, lastmark);
Fredrik Lundh397a6542001-10-18 19:30:16 +00001065 state->lastmark = lastmark;
Fredrik Lundh33accc12000-08-27 20:59:47 +00001066 if (i < 0)
1067 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001068 rp->count = count - 1;
1069 state->ptr = ptr;
1070 }
1071
1072 /* cannot match more repeated items here. make sure the
1073 tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001074 state->repeat = rp->prev;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001075 i = SRE_MATCH(state, pattern, level + 1);
1076 if (i)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001077 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001078 state->repeat = rp;
Fredrik Lundh770617b2001-01-14 15:06:11 +00001079 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001080 return 0;
1081
1082 case SRE_OP_MIN_UNTIL:
1083 /* minimizing repeat */
1084 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1085
1086 rp = state->repeat;
1087 if (!rp)
1088 return SRE_ERROR_STATE;
1089
1090 count = rp->count + 1;
1091
Fredrik Lundh770617b2001-01-14 15:06:11 +00001092 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count,
1093 rp->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001094
1095 state->ptr = ptr;
1096
1097 if (count < rp->pattern[1]) {
1098 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001099 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001100 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001101 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001102 if (i)
1103 return i;
1104 rp->count = count-1;
1105 state->ptr = ptr;
1106 return 0;
1107 }
1108
1109 /* see if the tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001110 state->repeat = rp->prev;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +00001111 /* FIXME: the following fix doesn't always work (#133283) */
Fredrik Lundhdf781e62001-07-02 19:54:28 +00001112 if (rp->pattern[2] == 65535) {
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001113 /* unbounded repeat */
1114 for (;;) {
1115 i = SRE_MATCH(state, pattern, level + 1);
1116 if (i || ptr >= end)
1117 break;
1118 state->ptr = ++ptr;
1119 }
1120 } else
1121 i = SRE_MATCH(state, pattern, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001122 if (i) {
1123 /* free(rp); */
1124 return i;
1125 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001126
Fredrik Lundh770617b2001-01-14 15:06:11 +00001127 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001128 state->repeat = rp;
1129
1130 if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
1131 return 0;
1132
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001133 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001134 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001135 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001136 if (i)
1137 return i;
1138 rp->count = count - 1;
Fredrik Lundh770617b2001-01-14 15:06:11 +00001139 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001140 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001141
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001142 default:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001143 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001144 return SRE_ERROR_ILLEGAL;
1145 }
1146 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001147
Sjoerd Mullender89dfe9e2001-08-30 14:37:07 +00001148 /* can't end up here */
Fredrik Lundh21009b92001-09-18 18:47:09 +00001149 /* return SRE_ERROR_ILLEGAL; -- see python-dev discussion */
Guido van Rossumb700df92000-03-31 14:59:30 +00001150}
1151
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001152LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001153SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1154{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001155 SRE_CHAR* ptr = state->start;
1156 SRE_CHAR* end = state->end;
1157 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001158 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001159 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001160 SRE_CODE* prefix = NULL;
1161 SRE_CODE* charset = NULL;
1162 SRE_CODE* overlap = NULL;
1163 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001164
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001165 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001166 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001167 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001168
1169 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001170
1171 if (pattern[3] > 0) {
1172 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001173 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001174 end -= pattern[3]-1;
1175 if (end <= ptr)
1176 end = ptr+1;
1177 }
1178
Fredrik Lundh3562f112000-07-02 12:00:07 +00001179 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001180 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001181 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001182 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001183 prefix_skip = pattern[6];
1184 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001185 overlap = prefix + prefix_len - 1;
1186 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001187 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001188 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001189 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001190
1191 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001192 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001193
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001194 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1195 TRACE(("charset = %p\n", charset));
1196
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001197#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001198 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001199 /* pattern starts with a known prefix. use the overlap
1200 table to skip forward as fast as we possibly can */
1201 int i = 0;
1202 end = state->end;
1203 while (ptr < end) {
1204 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001205 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001206 if (!i)
1207 break;
1208 else
1209 i = overlap[i];
1210 } else {
1211 if (++i == prefix_len) {
1212 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001213 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1214 state->start = ptr + 1 - prefix_len;
1215 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001216 if (flags & SRE_INFO_LITERAL)
1217 return 1; /* we got all of it */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001218 status = SRE_MATCH(state, pattern + 2*prefix_skip, 1);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001219 if (status != 0)
1220 return status;
1221 /* close but no cigar -- try again */
1222 i = overlap[i];
1223 }
1224 break;
1225 }
1226
1227 }
1228 ptr++;
1229 }
1230 return 0;
1231 }
1232#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001233
Fredrik Lundh3562f112000-07-02 12:00:07 +00001234 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001235 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001236 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001237 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001238 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001239 for (;;) {
1240 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1241 ptr++;
1242 if (ptr == end)
1243 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001244 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001245 state->start = ptr;
1246 state->ptr = ++ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001247 status = SRE_MATCH(state, pattern + 2, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001248 if (status != 0)
1249 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001250 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001251 } else if (charset) {
1252 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001253 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001254 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001255 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001256 ptr++;
1257 if (ptr == end)
1258 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001259 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001260 state->start = ptr;
1261 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001262 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001263 if (status != 0)
1264 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001265 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001266 }
1267 } else
1268 /* general case */
1269 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001270 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001271 state->start = state->ptr = ptr++;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001272 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001273 if (status != 0)
1274 break;
1275 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001276
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001277 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001278}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001279
Guido van Rossumb700df92000-03-31 14:59:30 +00001280
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001281#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001282
1283/* -------------------------------------------------------------------- */
1284/* factories and destructors */
1285
1286/* see sre.h for object declarations */
1287
1288staticforward PyTypeObject Pattern_Type;
1289staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001290staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001291
1292static PyObject *
1293_compile(PyObject* self_, PyObject* args)
1294{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001295 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001296
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001297 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001298 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001299
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001300 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001301 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001302 PyObject* code;
1303 int groups = 0;
1304 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001305 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001306 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1307 &PyList_Type, &code, &groups,
1308 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001309 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001310
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001311 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001312
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001313 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001314 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001315 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001316
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001317 self->codesize = n;
1318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001319 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001320 PyObject *o = PyList_GET_ITEM(code, i);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001321 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001322 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001323
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001324 if (PyErr_Occurred()) {
1325 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001326 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001327 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001328
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001329 Py_INCREF(pattern);
1330 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001331
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001332 self->flags = flags;
1333
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001334 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001336 Py_XINCREF(groupindex);
1337 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001338
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001339 Py_XINCREF(indexgroup);
1340 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001341
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001342 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001343}
1344
1345static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001346sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001347{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001348 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001349}
1350
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001351static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001352sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001353{
1354 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001355 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001356 return NULL;
1357 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001358 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001359 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001360#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001361 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001362#else
1363 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001364#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001365 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001366}
1367
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001368LOCAL(void)
1369state_reset(SRE_STATE* state)
1370{
1371 int i;
1372
1373 state->lastmark = 0;
1374
1375 /* FIXME: dynamic! */
1376 for (i = 0; i < SRE_MARK_SIZE; i++)
1377 state->mark[i] = NULL;
1378
1379 state->lastindex = -1;
1380
1381 state->repeat = NULL;
1382
1383 mark_fini(state);
1384}
1385
Guido van Rossumb700df92000-03-31 14:59:30 +00001386LOCAL(PyObject*)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001387state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1388 int start, int end)
Guido van Rossumb700df92000-03-31 14:59:30 +00001389{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001390 /* prepare state object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001391
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001392 PyBufferProcs *buffer;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001393 int size, bytes;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001394 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001395
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001396 memset(state, 0, sizeof(SRE_STATE));
1397
1398 state->lastindex = -1;
1399
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001400#if defined(HAVE_UNICODE)
1401 if (PyUnicode_Check(string)) {
1402 /* unicode strings doesn't always support the buffer interface */
1403 ptr = (void*) PyUnicode_AS_DATA(string);
1404 bytes = PyUnicode_GET_DATA_SIZE(string);
1405 size = PyUnicode_GET_SIZE(string);
1406 state->charsize = sizeof(Py_UNICODE);
1407
1408 } else {
1409#endif
1410
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001411 /* get pointer to string buffer */
1412 buffer = string->ob_type->tp_as_buffer;
1413 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1414 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001415 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001416 return NULL;
1417 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001418
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001419 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001420 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1421 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001422 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1423 return NULL;
1424 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001425
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001426 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001427#if PY_VERSION_HEX >= 0x01060000
1428 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001429#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001430 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001431#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001432
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001433 if (PyString_Check(string) || bytes == size)
1434 state->charsize = 1;
1435#if defined(HAVE_UNICODE)
1436 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
1437 state->charsize = sizeof(Py_UNICODE);
1438#endif
1439 else {
1440 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1441 return NULL;
1442 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001443
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001444#if defined(HAVE_UNICODE)
1445 }
1446#endif
1447
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001448 /* adjust boundaries */
1449 if (start < 0)
1450 start = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001451 else if (start > size)
1452 start = size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001453
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001454 if (end < 0)
1455 end = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001456 else if (end > size)
1457 end = size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001458
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001459 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001460
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001461 state->start = (void*) ((char*) ptr + start * state->charsize);
1462 state->end = (void*) ((char*) ptr + end * state->charsize);
1463
1464 Py_INCREF(string);
1465 state->string = string;
1466 state->pos = start;
1467 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001468
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001469 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001470 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001471 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001472#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001473 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001474#else
1475 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001476#endif
1477 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001478 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001479
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001480 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001481}
1482
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001483LOCAL(void)
1484state_fini(SRE_STATE* state)
1485{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001486 Py_XDECREF(state->string);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001487 mark_fini(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001488}
1489
1490LOCAL(PyObject*)
1491state_getslice(SRE_STATE* state, int index, PyObject* string)
1492{
Fredrik Lundh58100642000-08-09 09:14:35 +00001493 int i, j;
1494
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001495 index = (index - 1) * 2;
1496
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001497 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh58100642000-08-09 09:14:35 +00001498 i = j = 0;
1499 } else {
1500 i = ((char*)state->mark[index] - (char*)state->beginning) /
1501 state->charsize;
1502 j = ((char*)state->mark[index+1] - (char*)state->beginning) /
1503 state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001504 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001505
Fredrik Lundh58100642000-08-09 09:14:35 +00001506 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001507}
1508
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001509static void
1510pattern_error(int status)
1511{
1512 switch (status) {
1513 case SRE_ERROR_RECURSION_LIMIT:
1514 PyErr_SetString(
1515 PyExc_RuntimeError,
1516 "maximum recursion limit exceeded"
1517 );
1518 break;
1519 case SRE_ERROR_MEMORY:
1520 PyErr_NoMemory();
1521 break;
1522 default:
1523 /* other error codes indicate compiler/engine bugs */
1524 PyErr_SetString(
1525 PyExc_RuntimeError,
1526 "internal error in regular expression engine"
1527 );
1528 }
1529}
1530
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001531static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001532pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001533{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001534 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001535
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001536 MatchObject* match;
1537 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001538 char* base;
1539 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001540
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001541 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001542
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001543 /* create match object (with room for extra group marks) */
1544 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001545 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001546 if (!match)
1547 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001548
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001549 Py_INCREF(pattern);
1550 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001551
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001552 Py_INCREF(state->string);
1553 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001554
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001555 match->regs = NULL;
1556 match->groups = pattern->groups+1;
1557
1558 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001559
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001560 base = (char*) state->beginning;
1561 n = state->charsize;
1562
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 match->mark[0] = ((char*) state->start - base) / n;
1564 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001565
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001566 for (i = j = 0; i < pattern->groups; i++, j+=2)
1567 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1568 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1569 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1570 } else
1571 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1572
1573 match->pos = state->pos;
1574 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001575
Fredrik Lundh6f013982000-07-03 18:44:21 +00001576 match->lastindex = state->lastindex;
1577
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001578 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001579
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001580 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001581
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001582 /* no match */
1583 Py_INCREF(Py_None);
1584 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001585
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001587
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001588 /* internal error */
1589 pattern_error(status);
1590 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001591}
1592
1593static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001594pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001595{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001597
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001598 ScannerObject* self;
1599
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001600 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001601 int start = 0;
1602 int end = INT_MAX;
1603 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1604 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001605
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001606 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001607 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001608 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001609 return NULL;
1610
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001611 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001612 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001613 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001614 return NULL;
1615 }
1616
1617 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001618 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001619
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001620 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001621}
1622
Guido van Rossumb700df92000-03-31 14:59:30 +00001623static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001624pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001625{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001626 Py_XDECREF(self->pattern);
1627 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001628 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001629 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001630}
1631
1632static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001633pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001634{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001635 SRE_STATE state;
1636 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001637
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001638 PyObject* string;
1639 int start = 0;
1640 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001641 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1642 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
1643 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001644 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001645
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001646 string = state_init(&state, self, string, start, end);
1647 if (!string)
1648 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001649
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001650 state.ptr = state.start;
1651
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001652 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1653
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001654 if (state.charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001655 status = sre_match(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001656 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001657#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001658 status = sre_umatch(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001659#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001660 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001661
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001662 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1663
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001664 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001665
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001666 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001667}
1668
1669static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001670pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001671{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001672 SRE_STATE state;
1673 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001674
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001675 PyObject* string;
1676 int start = 0;
1677 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001678 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1679 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
1680 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001681 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001682
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001683 string = state_init(&state, self, string, start, end);
1684 if (!string)
1685 return NULL;
1686
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001687 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1688
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001689 if (state.charsize == 1) {
1690 status = sre_search(&state, PatternObject_GetCode(self));
1691 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001692#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001693 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001694#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001695 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001696
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001697 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1698
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001699 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001700
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001701 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001702}
1703
1704static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001705call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001706{
1707 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001708 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001709 PyObject* func;
1710 PyObject* result;
1711
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001712 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001713 if (!name)
1714 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001715 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001716 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001717 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001718 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001719 func = PyObject_GetAttrString(mod, function);
1720 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001721 if (!func)
1722 return NULL;
1723 result = PyObject_CallObject(func, args);
1724 Py_DECREF(func);
1725 Py_DECREF(args);
1726 return result;
1727}
1728
#ifdef USE_BUILTIN_COPY
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* replace *object, in place, by copy.deepcopy(*object, memo).
       the reference to the old object is released.  a NULL *object
       (an optional field that was never set) is left as is.
       returns 1 on success, 0 on failure with an exception set */

    PyObject* copy;
    PyObject* call_args;

    if (!*object)
        return 1; /* nothing to copy */

    call_args = Py_BuildValue("OO", *object, memo);
    if (!call_args)
        return 0;

    /* call consumes the argument tuple */
    copy = call("copy", "deepcopy", call_args);
    if (!copy)
        return 0;

    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
1748
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001749static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001750pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001751{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001752 PyObject* template;
1753 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001754 PyObject* count = Py_False; /* zero */
Fredrik Lundh562586e2000-10-03 20:43:34 +00001755 static char* kwlist[] = { "repl", "string", "count", NULL };
1756 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:sub", kwlist,
1757 &template, &string, &count))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001758 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001759
1760 /* delegate to Python code */
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001761 return call(
1762 SRE_MODULE, "_sub",
1763 Py_BuildValue("OOOO", self, template, string, count)
1764 );
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001765}
1766
1767static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001768pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001769{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001770 PyObject* template;
1771 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001772 PyObject* count = Py_False; /* zero */
Fredrik Lundh562586e2000-10-03 20:43:34 +00001773 static char* kwlist[] = { "repl", "string", "count", NULL };
1774 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:subn", kwlist,
1775 &template, &string, &count))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001776 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001777
1778 /* delegate to Python code */
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001779 return call(
1780 SRE_MODULE, "_subn",
1781 Py_BuildValue("OOOO", self, template, string, count)
1782 );
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001783}
1784
1785static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001786pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001787{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001788 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001789 PyObject* maxsplit = Py_False; /* zero */
Fredrik Lundh562586e2000-10-03 20:43:34 +00001790 static char* kwlist[] = { "source", "maxsplit", NULL };
1791 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|O:split", kwlist,
1792 &string, &maxsplit))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001793 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001794
1795 /* delegate to Python code */
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001796 return call(
1797 SRE_MODULE, "_split",
1798 Py_BuildValue("OOO", self, string, maxsplit)
1799 );
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001800}
1801
1802static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001803pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001804{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001805 SRE_STATE state;
1806 PyObject* list;
1807 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001808 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001809
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001810 PyObject* string;
1811 int start = 0;
1812 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001813 static char* kwlist[] = { "source", "pos", "endpos", NULL };
1814 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
1815 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001816 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001817
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001818 string = state_init(&state, self, string, start, end);
1819 if (!string)
1820 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001821
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001822 list = PyList_New(0);
Guido van Rossumb700df92000-03-31 14:59:30 +00001823
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001824 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001825
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001826 PyObject* item;
1827
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001828 state_reset(&state);
1829
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001830 state.ptr = state.start;
1831
1832 if (state.charsize == 1) {
1833 status = sre_search(&state, PatternObject_GetCode(self));
1834 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001835#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001836 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001837#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001838 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001839
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001840 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001841
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001842 /* don't bother to build a match object */
1843 switch (self->groups) {
1844 case 0:
1845 item = PySequence_GetSlice(
1846 string,
1847 ((char*) state.start - (char*) state.beginning) /
1848 state.charsize,
1849 ((char*) state.ptr - (char*) state.beginning) /
1850 state.charsize);
1851 if (!item)
1852 goto error;
1853 break;
1854 case 1:
1855 item = state_getslice(&state, 1, string);
1856 if (!item)
1857 goto error;
1858 break;
1859 default:
1860 item = PyTuple_New(self->groups);
1861 if (!item)
1862 goto error;
1863 for (i = 0; i < self->groups; i++) {
1864 PyObject* o = state_getslice(&state, i+1, string);
1865 if (!o) {
1866 Py_DECREF(item);
1867 goto error;
1868 }
1869 PyTuple_SET_ITEM(item, i, o);
1870 }
1871 break;
1872 }
1873
Fredrik Lundhe67d8e52000-08-27 21:32:46 +00001874 status = PyList_Append(list, item);
1875 Py_DECREF(item);
1876
1877 if (status < 0)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001878 goto error;
Guido van Rossumb700df92000-03-31 14:59:30 +00001879
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001880 if (state.ptr == state.start)
1881 state.start = (void*) ((char*) state.ptr + state.charsize);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001882 else
1883 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001884
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001885 } else {
Guido van Rossumb700df92000-03-31 14:59:30 +00001886
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001887 if (status == 0)
1888 break;
1889
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001890 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001891 goto error;
Guido van Rossumb700df92000-03-31 14:59:30 +00001892
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001893 }
1894 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001895
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001896 state_fini(&state);
1897 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00001898
1899error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001900 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001901 state_fini(&state);
1902 return NULL;
1903
Guido van Rossumb700df92000-03-31 14:59:30 +00001904}
1905
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001906static PyObject*
1907pattern_copy(PatternObject* self, PyObject* args)
1908{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001909#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001910 PatternObject* copy;
1911 int offset;
1912
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001913 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
1914 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001915
1916 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
1917 if (!copy)
1918 return NULL;
1919
1920 offset = offsetof(PatternObject, groups);
1921
1922 Py_XINCREF(self->groupindex);
1923 Py_XINCREF(self->indexgroup);
1924 Py_XINCREF(self->pattern);
1925
1926 memcpy((char*) copy + offset, (char*) self + offset,
1927 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
1928
1929 return (PyObject*) copy;
1930#else
1931 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
1932 return NULL;
1933#endif
1934}
1935
1936static PyObject*
1937pattern_deepcopy(PatternObject* self, PyObject* args)
1938{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001939#ifdef USE_BUILTIN_COPY
1940 PatternObject* copy;
1941
1942 PyObject* memo;
1943 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
1944 return NULL;
1945
1946 copy = (PatternObject*) pattern_copy(self, Py_None);
1947 if (!copy)
1948 return NULL;
1949
1950 if (!deepcopy(&copy->groupindex, memo) ||
1951 !deepcopy(&copy->indexgroup, memo) ||
1952 !deepcopy(&copy->pattern, memo)) {
1953 Py_DECREF(copy);
1954 return NULL;
1955 }
1956
1957#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001958 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
1959 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001960#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001961}
1962
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001963static PyObject*
Fredrik Lundh59b68652001-09-18 20:55:24 +00001964pattern_getliteral(PatternObject* self, PyObject* args)
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001965{
Fredrik Lundh59b68652001-09-18 20:55:24 +00001966 /* internal: if the pattern is a literal string, return that
1967 string. otherwise, return None */
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001968
1969 SRE_CODE* code;
Fredrik Lundh59b68652001-09-18 20:55:24 +00001970 PyObject* literal;
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001971
Fredrik Lundh59b68652001-09-18 20:55:24 +00001972 if (!PyArg_ParseTuple(args, ":_getliteral"))
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001973 return NULL;
1974
1975 code = PatternObject_GetCode(self);
1976
Fredrik Lundh59b68652001-09-18 20:55:24 +00001977 if (code[0] == SRE_OP_INFO && code[2] & SRE_INFO_LITERAL) {
1978 /* FIXME: extract literal string from code buffer. we can't
1979 use the pattern member, since it may contain untranslated
1980 escape codes (see SF bug 449000) */
1981 literal = Py_None;
1982 } else
1983 literal = Py_None; /* no literal */
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001984
Fredrik Lundh59b68652001-09-18 20:55:24 +00001985 Py_INCREF(literal);
1986 return literal;
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001987}
1988
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001989static PyMethodDef pattern_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00001990 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
1991 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
1992 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
1993 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
1994 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
1995 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
Fredrik Lundh562586e2000-10-03 20:43:34 +00001996 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001997 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
1998 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh59b68652001-09-18 20:55:24 +00001999 {"_getliteral", (PyCFunction) pattern_getliteral, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002000 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002001};
2002
2003static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002004pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002005{
2006 PyObject* res;
2007
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002008 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002009
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002010 if (res)
2011 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002012
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002013 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002014
2015 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002016 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002017 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002018 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002019 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002020
2021 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002022 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002023
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002024 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002025 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002026
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002027 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002028 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002029 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002030 }
2031
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002032 PyErr_SetString(PyExc_AttributeError, name);
2033 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002034}
2035
2036statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002037 PyObject_HEAD_INIT(NULL)
2038 0, "SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002039 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002040 (destructor)pattern_dealloc, /*tp_dealloc*/
2041 0, /*tp_print*/
2042 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002043};
2044
2045/* -------------------------------------------------------------------- */
2046/* match methods */
2047
2048static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002049match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002050{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002051 Py_XDECREF(self->regs);
2052 Py_XDECREF(self->string);
2053 Py_DECREF(self->pattern);
2054 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002055}
2056
2057static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002058match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002059{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 if (index < 0 || index >= self->groups) {
2061 /* raise IndexError if we were given a bad group number */
2062 PyErr_SetString(
2063 PyExc_IndexError,
2064 "no such group"
2065 );
2066 return NULL;
2067 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002068
Fredrik Lundh6f013982000-07-03 18:44:21 +00002069 index *= 2;
2070
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002071 if (self->string == Py_None || self->mark[index] < 0) {
2072 /* return default value if the string or group is undefined */
2073 Py_INCREF(def);
2074 return def;
2075 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002076
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002077 return PySequence_GetSlice(
2078 self->string, self->mark[index], self->mark[index+1]
2079 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002080}
2081
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002082static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002083match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002084{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002085 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002086
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002087 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002088 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002089
Fredrik Lundh6f013982000-07-03 18:44:21 +00002090 i = -1;
2091
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002092 if (self->pattern->groupindex) {
2093 index = PyObject_GetItem(self->pattern->groupindex, index);
2094 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002095 if (PyInt_Check(index))
2096 i = (int) PyInt_AS_LONG(index);
2097 Py_DECREF(index);
2098 } else
2099 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002100 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002101
2102 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002103}
2104
2105static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002106match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002107{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002108 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002109}
2110
2111static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002112match_expand(MatchObject* self, PyObject* args)
2113{
2114 PyObject* template;
2115 if (!PyArg_ParseTuple(args, "O:expand", &template))
2116 return NULL;
2117
2118 /* delegate to Python code */
2119 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002120 SRE_MODULE, "_expand",
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002121 Py_BuildValue("OOO", self->pattern, self, template)
2122 );
2123}
2124
2125static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002126match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002127{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002128 PyObject* result;
2129 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002130
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002131 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002132
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002133 switch (size) {
2134 case 0:
2135 result = match_getslice(self, Py_False, Py_None);
2136 break;
2137 case 1:
2138 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2139 break;
2140 default:
2141 /* fetch multiple items */
2142 result = PyTuple_New(size);
2143 if (!result)
2144 return NULL;
2145 for (i = 0; i < size; i++) {
2146 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002147 self, PyTuple_GET_ITEM(args, i), Py_None
2148 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002149 if (!item) {
2150 Py_DECREF(result);
2151 return NULL;
2152 }
2153 PyTuple_SET_ITEM(result, i, item);
2154 }
2155 break;
2156 }
2157 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002158}
2159
2160static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002161match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002162{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002163 PyObject* result;
2164 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002165
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002166 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002167 static char* kwlist[] = { "default", NULL };
2168 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002169 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002170
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002171 result = PyTuple_New(self->groups-1);
2172 if (!result)
2173 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002174
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002175 for (index = 1; index < self->groups; index++) {
2176 PyObject* item;
2177 item = match_getslice_by_index(self, index, def);
2178 if (!item) {
2179 Py_DECREF(result);
2180 return NULL;
2181 }
2182 PyTuple_SET_ITEM(result, index-1, item);
2183 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002184
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002185 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002186}
2187
2188static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002189match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002190{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002191 PyObject* result;
2192 PyObject* keys;
2193 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002195 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002196 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002197 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002198 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002199
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002200 result = PyDict_New();
2201 if (!result || !self->pattern->groupindex)
2202 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002203
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002204 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002205 if (!keys)
2206 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002207
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002208 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002209 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002210 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002211 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002212 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002213 if (!key)
2214 goto failed;
2215 value = match_getslice(self, key, def);
2216 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002217 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002218 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002219 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002220 status = PyDict_SetItem(result, key, value);
2221 Py_DECREF(value);
2222 if (status < 0)
2223 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002224 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002225
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002226 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002227
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002228 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002229
2230failed:
2231 Py_DECREF(keys);
2232 Py_DECREF(result);
2233 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002234}
2235
2236static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002237match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002238{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002239 int index;
2240
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002241 PyObject* index_ = Py_False; /* zero */
2242 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2243 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002244
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002245 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002246
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002247 if (index < 0 || index >= self->groups) {
2248 PyErr_SetString(
2249 PyExc_IndexError,
2250 "no such group"
2251 );
2252 return NULL;
2253 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002254
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002255 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002256 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002257}
2258
2259static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002260match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002261{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002262 int index;
2263
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002264 PyObject* index_ = Py_False; /* zero */
2265 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2266 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002267
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002268 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002269
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002270 if (index < 0 || index >= self->groups) {
2271 PyErr_SetString(
2272 PyExc_IndexError,
2273 "no such group"
2274 );
2275 return NULL;
2276 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002277
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002278 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002279 return Py_BuildValue("i", self->mark[index*2+1]);
2280}
2281
2282LOCAL(PyObject*)
2283_pair(int i1, int i2)
2284{
2285 PyObject* pair;
2286 PyObject* item;
2287
2288 pair = PyTuple_New(2);
2289 if (!pair)
2290 return NULL;
2291
2292 item = PyInt_FromLong(i1);
2293 if (!item)
2294 goto error;
2295 PyTuple_SET_ITEM(pair, 0, item);
2296
2297 item = PyInt_FromLong(i2);
2298 if (!item)
2299 goto error;
2300 PyTuple_SET_ITEM(pair, 1, item);
2301
2302 return pair;
2303
2304 error:
2305 Py_DECREF(pair);
2306 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002307}
2308
2309static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002310match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002311{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002312 int index;
2313
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002314 PyObject* index_ = Py_False; /* zero */
2315 if (!PyArg_ParseTuple(args, "|O:span", &index_))
2316 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002317
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002318 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002319
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002320 if (index < 0 || index >= self->groups) {
2321 PyErr_SetString(
2322 PyExc_IndexError,
2323 "no such group"
2324 );
2325 return NULL;
2326 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002327
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002328 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002329 return _pair(self->mark[index*2], self->mark[index*2+1]);
2330}
2331
2332static PyObject*
2333match_regs(MatchObject* self)
2334{
2335 PyObject* regs;
2336 PyObject* item;
2337 int index;
2338
2339 regs = PyTuple_New(self->groups);
2340 if (!regs)
2341 return NULL;
2342
2343 for (index = 0; index < self->groups; index++) {
2344 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2345 if (!item) {
2346 Py_DECREF(regs);
2347 return NULL;
2348 }
2349 PyTuple_SET_ITEM(regs, index, item);
2350 }
2351
2352 Py_INCREF(regs);
2353 self->regs = regs;
2354
2355 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002356}
2357
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002358static PyObject*
2359match_copy(MatchObject* self, PyObject* args)
2360{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002361#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002362 MatchObject* copy;
2363 int slots, offset;
2364
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002365 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2366 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002367
2368 slots = 2 * (self->pattern->groups+1);
2369
2370 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2371 if (!copy)
2372 return NULL;
2373
2374 /* this value a constant, but any compiler should be able to
2375 figure that out all by itself */
2376 offset = offsetof(MatchObject, string);
2377
2378 Py_XINCREF(self->pattern);
2379 Py_XINCREF(self->string);
2380 Py_XINCREF(self->regs);
2381
2382 memcpy((char*) copy + offset, (char*) self + offset,
2383 sizeof(MatchObject) + slots * sizeof(int) - offset);
2384
2385 return (PyObject*) copy;
2386#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002387 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002388 return NULL;
2389#endif
2390}
2391
2392static PyObject*
2393match_deepcopy(MatchObject* self, PyObject* args)
2394{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002395#ifdef USE_BUILTIN_COPY
2396 MatchObject* copy;
2397
2398 PyObject* memo;
2399 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2400 return NULL;
2401
2402 copy = (MatchObject*) match_copy(self, Py_None);
2403 if (!copy)
2404 return NULL;
2405
2406 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2407 !deepcopy(&copy->string, memo) ||
2408 !deepcopy(&copy->regs, memo)) {
2409 Py_DECREF(copy);
2410 return NULL;
2411 }
2412
2413#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002414 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2415 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002416#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002417}
2418
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002419static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002420 {"group", (PyCFunction) match_group, METH_VARARGS},
2421 {"start", (PyCFunction) match_start, METH_VARARGS},
2422 {"end", (PyCFunction) match_end, METH_VARARGS},
2423 {"span", (PyCFunction) match_span, METH_VARARGS},
2424 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
2425 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
2426 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002427 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
2428 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002429 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002430};
2431
2432static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002433match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002434{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002435 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002436
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002437 res = Py_FindMethod(match_methods, (PyObject*) self, name);
2438 if (res)
2439 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002440
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002441 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002442
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002443 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002444 if (self->lastindex >= 0)
2445 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00002446 Py_INCREF(Py_None);
2447 return Py_None;
2448 }
2449
2450 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002451 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00002452 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00002453 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00002454 );
2455 if (result)
2456 return result;
2457 PyErr_Clear();
2458 }
2459 Py_INCREF(Py_None);
2460 return Py_None;
2461 }
2462
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002463 if (!strcmp(name, "string")) {
2464 if (self->string) {
2465 Py_INCREF(self->string);
2466 return self->string;
2467 } else {
2468 Py_INCREF(Py_None);
2469 return Py_None;
2470 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002471 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002472
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002473 if (!strcmp(name, "regs")) {
2474 if (self->regs) {
2475 Py_INCREF(self->regs);
2476 return self->regs;
2477 } else
2478 return match_regs(self);
2479 }
2480
2481 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002482 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002483 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002484 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002485
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002486 if (!strcmp(name, "pos"))
2487 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002488
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002489 if (!strcmp(name, "endpos"))
2490 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00002491
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002492 PyErr_SetString(PyExc_AttributeError, name);
2493 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002494}
2495
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2498
2499statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002500 PyObject_HEAD_INIT(NULL)
2501 0, "SRE_Match",
2502 sizeof(MatchObject), sizeof(int),
2503 (destructor)match_dealloc, /*tp_dealloc*/
2504 0, /*tp_print*/
2505 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002506};
2507
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002508/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002509/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002510
2511static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002512scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002513{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002514 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002515 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002516 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002517}
2518
2519static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002520scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002521{
2522 SRE_STATE* state = &self->state;
2523 PyObject* match;
2524 int status;
2525
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002526 state_reset(state);
2527
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002528 state->ptr = state->start;
2529
2530 if (state->charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00002531 status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002532 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002533#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00002534 status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002535#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002536 }
2537
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002538 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002539 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002540
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002541 if (status == 0 || state->ptr == state->start)
2542 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002543 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002544 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002545
2546 return match;
2547}
2548
2549
2550static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002551scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002552{
2553 SRE_STATE* state = &self->state;
2554 PyObject* match;
2555 int status;
2556
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002557 state_reset(state);
2558
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002559 state->ptr = state->start;
2560
2561 if (state->charsize == 1) {
2562 status = sre_search(state, PatternObject_GetCode(self->pattern));
2563 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002564#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002565 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002566#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002567 }
2568
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002569 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002570 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002571
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002572 if (status == 0 || state->ptr == state->start)
2573 state->start = (void*) ((char*) state->ptr + state->charsize);
2574 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002575 state->start = state->ptr;
2576
2577 return match;
2578}
2579
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002580static PyMethodDef scanner_methods[] = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002581 {"match", (PyCFunction) scanner_match, 0},
2582 {"search", (PyCFunction) scanner_search, 0},
2583 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002584};
2585
2586static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002587scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002588{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002589 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002590
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002591 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
2592 if (res)
2593 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002594
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002595 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002596
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002597 /* attributes */
2598 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002599 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002600 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002601 }
2602
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002603 PyErr_SetString(PyExc_AttributeError, name);
2604 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002605}
2606
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002607statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002608 PyObject_HEAD_INIT(NULL)
2609 0, "SRE_Scanner",
2610 sizeof(ScannerObject), 0,
2611 (destructor)scanner_dealloc, /*tp_dealloc*/
2612 0, /*tp_print*/
2613 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002614};
2615
Guido van Rossumb700df92000-03-31 14:59:30 +00002616static PyMethodDef _functions[] = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002617 {"compile", _compile, 1},
2618 {"getcodesize", sre_codesize, 1},
2619 {"getlower", sre_getlower, 1},
2620 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002621};
2622
Tim Peters5687ffe2001-02-28 16:44:18 +00002623DL_EXPORT(void)
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00002624init_sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002625{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002626 PyObject* m;
2627 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00002628 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002629
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002630 /* Patch object types */
2631 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002632 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002633
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00002634 m = Py_InitModule("_" SRE_MODULE, _functions);
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002635 d = PyModule_GetDict(m);
2636
Fredrik Lundh21009b92001-09-18 18:47:09 +00002637 x = PyInt_FromLong(SRE_MAGIC);
2638 if (x) {
2639 PyDict_SetItemString(d, "MAGIC", x);
2640 Py_DECREF(x);
2641 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002642
Fredrik Lundh21009b92001-09-18 18:47:09 +00002643 x = PyString_FromString(copyright);
2644 if (x) {
2645 PyDict_SetItemString(d, "copyright", x);
2646 Py_DECREF(x);
2647 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002648}
2649
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002650#endif /* !defined(SRE_RECURSIVE) */