/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl created (based on existing template matcher code)
 * 2000-03-06 fl first alpha, sort of
 * 2000-06-30 fl added fast search optimization
 * 2000-06-30 fl added assert (lookahead) primitives, etc
 * 2000-07-02 fl added charset optimizations, etc
 * 2000-07-03 fl store code in pattern object, lookbehind, etc
 * 2000-07-08 fl added regs attribute
 * 2000-07-21 fl reset lastindex in scanner methods
 * 2000-08-01 fl fixes for 1.6b1
 * 2000-08-03 fl added recursion limit
 * 2000-08-07 fl use PyOS_CheckStack() if available
 * 2000-08-08 fl changed findall to return empty strings instead of None
 * 2000-08-27 fl properly propagate memory errors
 * 2000-09-02 fl return -1 instead of None for start/end/span
 * 2000-09-20 fl added expand method
 * 2000-09-21 fl don't use the buffer interface for unicode strings
 * 2000-10-03 fl fixed assert_not primitive; support keyword arguments
 * 2000-10-24 fl really fixed assert_not; reset groups in findall
 * 2000-12-21 fl fixed memory leak in groupdict
 * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
 * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
 * 2001-01-16 fl fixed memory leak in pattern destructor
 * 2001-03-20 fl lots of fixes for 2.1b2
 * 2001-04-15 fl export copyright as Python attribute, not global
 * 2001-04-28 fl added __copy__ methods (work in progress)
 * 2001-05-14 fl fixes for 1.5.2
 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
 * 2001-09-18 fl
 *
 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license. For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */

47#ifndef SRE_RECURSIVE
48
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000049static char copyright[] =
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000050 " SRE 2.1.1 Copyright (c) 1997-2001 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000051
52#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000053#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000054
55#include "sre.h"
56
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000057#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000058
Fredrik Lundh436c3d52000-06-29 08:58:44 +000059/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000060#if !defined(SRE_MODULE)
61#define SRE_MODULE "sre"
62#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000063
Guido van Rossumb700df92000-03-31 14:59:30 +000064/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000065#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000066
Martin v. Löwis339d0f72001-08-17 18:39:25 +000067#if PY_VERSION_HEX >= 0x01060000 && defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000068/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000069#define HAVE_UNICODE
70#endif
71
/* -------------------------------------------------------------------- */
/* optional features */

/* prevent run-away recursion (bad patterns on long strings) */

#if !defined(USE_STACKCHECK)
#if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
/* require smaller recursion limit for a number of 64-bit platforms:
   Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
/* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
#define USE_RECURSION_LIMIT 7500
#else
#define USE_RECURSION_LIMIT 10000
#endif
#endif

/* enables fast searching */
#define USE_FAST_SEARCH

/* enables aggressive inlining (always on for Visual C) */
#undef USE_INLINE

/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY

/* older interpreters lack PyObject_DEL; map it onto PyMem_DEL */
#if PY_VERSION_HEX < 0x01060000
#define PyObject_DEL(op) PyMem_DEL((op))
#endif

/* -------------------------------------------------------------------- */

/* LOCAL(type) declares a file-local function returning `type`, using the
   cheapest calling convention/inlining the compiler offers */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif

/* error codes */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */

/* TRACE((fmt, ...)) prints via printf when VERBOSE is defined and
   expands to nothing otherwise (note the double parentheses) */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif

/* -------------------------------------------------------------------- */
/* search engine state */

/* default character predicates (run sre_chars.py to regenerate tables) */

/* bit flags stored per character in sre_char_info */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII. create tables in init_sre() instead */

/* per-character property flags for codes 0..127, 16 entries per row:
   2 = space, 6 = space|linebreak, 25 = digit|alnum|word,
   24 = alnum|word, 16 = word only ('_') */
static char sre_char_info[128] = {
    /*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  32 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  48 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /*  64 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /*  96 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};

/* ASCII lower-casing table for codes 0..127, 16 entries per row:
   'A'..'Z' (65..90) map to 'a'..'z' (97..122), everything else maps
   to itself */
static char sre_char_lower[128] = {
    /*   0 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /*  16 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /*  32 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /*  48 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /*  64 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
              109, 110, 111,
    /*  80 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
              93, 94, 95,
    /*  96 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
              109, 110, 111,
    /* 112 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
              124, 125, 126, 127
};

/* table-driven predicates: each yields the matching sre_char_info flag
   bit (non-zero) for codes below 128 and 0 for anything else.  The
   argument is evaluated more than once. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000168static unsigned int sre_lower(unsigned int ch)
169{
170 return ((ch) < 128 ? sre_char_lower[ch] : ch);
171}
172
/* locale-specific character predicates */

/* these defer to <ctype.h> for codes below 256 and report 0 for larger
   codes; a linebreak is always '\n' and '_' always counts as a word
   character.  The argument is evaluated more than once. */
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* Lower-case CH according to the current C locale (via tolower); codes
   of 256 and above are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    /* ch is below 256 here, so it is a valid tolower() argument */
    return (ch < 256) ? (unsigned int) tolower((int) ch) : ch;
}

/* unicode-specific character predicates */

#if defined(HAVE_UNICODE)

/* thin wrappers over the Py_UNICODE_* classification macros; '_' always
   counts as a word character */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

/* Lower-case CH using Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}

#endif

Guido van Rossumb700df92000-03-31 14:59:30 +0000203LOCAL(int)
204sre_category(SRE_CODE category, unsigned int ch)
205{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000206 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000207
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000208 case SRE_CATEGORY_DIGIT:
209 return SRE_IS_DIGIT(ch);
210 case SRE_CATEGORY_NOT_DIGIT:
211 return !SRE_IS_DIGIT(ch);
212 case SRE_CATEGORY_SPACE:
213 return SRE_IS_SPACE(ch);
214 case SRE_CATEGORY_NOT_SPACE:
215 return !SRE_IS_SPACE(ch);
216 case SRE_CATEGORY_WORD:
217 return SRE_IS_WORD(ch);
218 case SRE_CATEGORY_NOT_WORD:
219 return !SRE_IS_WORD(ch);
220 case SRE_CATEGORY_LINEBREAK:
221 return SRE_IS_LINEBREAK(ch);
222 case SRE_CATEGORY_NOT_LINEBREAK:
223 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000224
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000225 case SRE_CATEGORY_LOC_WORD:
226 return SRE_LOC_IS_WORD(ch);
227 case SRE_CATEGORY_LOC_NOT_WORD:
228 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000229
230#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000231 case SRE_CATEGORY_UNI_DIGIT:
232 return SRE_UNI_IS_DIGIT(ch);
233 case SRE_CATEGORY_UNI_NOT_DIGIT:
234 return !SRE_UNI_IS_DIGIT(ch);
235 case SRE_CATEGORY_UNI_SPACE:
236 return SRE_UNI_IS_SPACE(ch);
237 case SRE_CATEGORY_UNI_NOT_SPACE:
238 return !SRE_UNI_IS_SPACE(ch);
239 case SRE_CATEGORY_UNI_WORD:
240 return SRE_UNI_IS_WORD(ch);
241 case SRE_CATEGORY_UNI_NOT_WORD:
242 return !SRE_UNI_IS_WORD(ch);
243 case SRE_CATEGORY_UNI_LINEBREAK:
244 return SRE_UNI_IS_LINEBREAK(ch);
245 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
246 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000247#else
248 case SRE_CATEGORY_UNI_DIGIT:
249 return SRE_IS_DIGIT(ch);
250 case SRE_CATEGORY_UNI_NOT_DIGIT:
251 return !SRE_IS_DIGIT(ch);
252 case SRE_CATEGORY_UNI_SPACE:
253 return SRE_IS_SPACE(ch);
254 case SRE_CATEGORY_UNI_NOT_SPACE:
255 return !SRE_IS_SPACE(ch);
256 case SRE_CATEGORY_UNI_WORD:
257 return SRE_LOC_IS_WORD(ch);
258 case SRE_CATEGORY_UNI_NOT_WORD:
259 return !SRE_LOC_IS_WORD(ch);
260 case SRE_CATEGORY_UNI_LINEBREAK:
261 return SRE_IS_LINEBREAK(ch);
262 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
263 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000264#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000265 }
266 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000267}
268
269/* helpers */
270
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000271static void
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272mark_fini(SRE_STATE* state)
273{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000274 if (state->mark_stack) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000275 free(state->mark_stack);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000276 state->mark_stack = NULL;
277 }
278 state->mark_stack_size = state->mark_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000279}
280
281static int
282mark_save(SRE_STATE* state, int lo, int hi)
283{
284 void* stack;
285 int size;
286 int minsize, newsize;
287
288 if (hi <= lo)
289 return 0;
290
291 size = (hi - lo) + 1;
292
293 newsize = state->mark_stack_size;
294 minsize = state->mark_stack_base + size;
295
296 if (newsize < minsize) {
297 /* create new stack */
298 if (!newsize) {
299 newsize = 512;
300 if (newsize < minsize)
301 newsize = minsize;
302 TRACE(("allocate stack %d\n", newsize));
303 stack = malloc(sizeof(void*) * newsize);
304 } else {
305 /* grow the stack */
306 while (newsize < minsize)
307 newsize += newsize;
308 TRACE(("grow stack to %d\n", newsize));
309 stack = realloc(state->mark_stack, sizeof(void*) * newsize);
310 }
311 if (!stack) {
312 mark_fini(state);
313 return SRE_ERROR_MEMORY;
314 }
315 state->mark_stack = stack;
316 state->mark_stack_size = newsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000317 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000318
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000319 TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000320
321 memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
322 size * sizeof(void*));
323
324 state->mark_stack_base += size;
325
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000326 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000327}
328
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000329static int
330mark_restore(SRE_STATE* state, int lo, int hi)
Guido van Rossumb700df92000-03-31 14:59:30 +0000331{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000332 int size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000333
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000334 if (hi <= lo)
335 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000336
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000337 size = (hi - lo) + 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000338
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000339 state->mark_stack_base -= size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000340
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000341 TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
Guido van Rossumb700df92000-03-31 14:59:30 +0000342
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000343 memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
344 size * sizeof(void*));
Guido van Rossumb700df92000-03-31 14:59:30 +0000345
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000346 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000347}
348
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000349/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000350
351#define SRE_CHAR unsigned char
352#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000353#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000354#define SRE_CHARSET sre_charset
355#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000356#define SRE_MATCH sre_match
357#define SRE_SEARCH sre_search
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000358
359#if defined(HAVE_UNICODE)
360
Guido van Rossumb700df92000-03-31 14:59:30 +0000361#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000362#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000363#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000364
Guido van Rossumb700df92000-03-31 14:59:30 +0000365#undef SRE_SEARCH
366#undef SRE_MATCH
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000367#undef SRE_INFO
368#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000369#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000370#undef SRE_AT
371#undef SRE_CHAR
372
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000373/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000374
375#define SRE_CHAR Py_UNICODE
376#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000377#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000378#define SRE_CHARSET sre_ucharset
379#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000380#define SRE_MATCH sre_umatch
381#define SRE_SEARCH sre_usearch
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000382#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000383
384#endif /* SRE_RECURSIVE */
385
/* -------------------------------------------------------------------- */
/* String matching engine */

/* the following section is compiled twice, with different character
   settings */

392LOCAL(int)
393SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
394{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000395 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000396
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000397 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000398
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000399 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000400
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000401 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000402 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000403 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000404
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000405 case SRE_AT_BEGINNING_LINE:
406 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000407 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000408
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000409 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000410 return (((void*) (ptr+1) == state->end &&
411 SRE_IS_LINEBREAK((int) ptr[0])) ||
412 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000413
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000414 case SRE_AT_END_LINE:
415 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000416 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000417
Fredrik Lundh770617b2001-01-14 15:06:11 +0000418 case SRE_AT_END_STRING:
419 return ((void*) ptr == state->end);
420
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000421 case SRE_AT_BOUNDARY:
422 if (state->beginning == state->end)
423 return 0;
424 that = ((void*) ptr > state->beginning) ?
425 SRE_IS_WORD((int) ptr[-1]) : 0;
426 this = ((void*) ptr < state->end) ?
427 SRE_IS_WORD((int) ptr[0]) : 0;
428 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000429
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000430 case SRE_AT_NON_BOUNDARY:
431 if (state->beginning == state->end)
432 return 0;
433 that = ((void*) ptr > state->beginning) ?
434 SRE_IS_WORD((int) ptr[-1]) : 0;
435 this = ((void*) ptr < state->end) ?
436 SRE_IS_WORD((int) ptr[0]) : 0;
437 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000438
439 case SRE_AT_LOC_BOUNDARY:
440 if (state->beginning == state->end)
441 return 0;
442 that = ((void*) ptr > state->beginning) ?
443 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
444 this = ((void*) ptr < state->end) ?
445 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
446 return this != that;
447
448 case SRE_AT_LOC_NON_BOUNDARY:
449 if (state->beginning == state->end)
450 return 0;
451 that = ((void*) ptr > state->beginning) ?
452 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
453 this = ((void*) ptr < state->end) ?
454 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
455 return this == that;
456
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000457#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000458 case SRE_AT_UNI_BOUNDARY:
459 if (state->beginning == state->end)
460 return 0;
461 that = ((void*) ptr > state->beginning) ?
462 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
463 this = ((void*) ptr < state->end) ?
464 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
465 return this != that;
466
467 case SRE_AT_UNI_NON_BOUNDARY:
468 if (state->beginning == state->end)
469 return 0;
470 that = ((void*) ptr > state->beginning) ?
471 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
472 this = ((void*) ptr < state->end) ?
473 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
474 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000475#endif
476
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000477 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000478
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000479 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000480}
481
482LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000483SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000484{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000485 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000486
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000487 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000488
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000489 for (;;) {
490 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000491
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000492 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000493 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000494 if (ch == set[0])
495 return ok;
496 set++;
497 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000498
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000499 case SRE_OP_RANGE:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000500 /* <RANGE> <lower> <upper> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000501 if (set[0] <= ch && ch <= set[1])
502 return ok;
503 set += 2;
504 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000505
Fredrik Lundh3562f112000-07-02 12:00:07 +0000506 case SRE_OP_CHARSET:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000507 /* <CHARSET> <bitmap> (16 bits per code word) */
Fredrik Lundh3562f112000-07-02 12:00:07 +0000508 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
509 return ok;
510 set += 16;
511 break;
512
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000513 case SRE_OP_BIGCHARSET:
514 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
515 {
516 int count, block;
517 count = *(set++);
518 block = ((unsigned char*)set)[ch >> 8];
519 set += 128;
520 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
521 return ok;
522 set += count*16;
523 break;
524 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000525
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000526 case SRE_OP_CATEGORY:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000527 /* <CATEGORY> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000528 if (sre_category(set[0], (int) ch))
529 return ok;
530 set += 1;
531 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000532
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000533 case SRE_OP_NEGATE:
534 ok = !ok;
535 break;
536
537 case SRE_OP_FAILURE:
538 return !ok;
539
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000540 default:
541 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000542 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000543 return 0;
544 }
545 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000546}
547
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000548LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level);
549
550LOCAL(int)
551SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
552{
553 SRE_CODE chr;
554 SRE_CHAR* ptr = state->ptr;
555 SRE_CHAR* end = state->end;
556 int i;
557
558 /* adjust end */
559 if (maxcount < end - ptr && maxcount != 65535)
560 end = ptr + maxcount;
561
562 switch (pattern[0]) {
563
564 case SRE_OP_ANY:
565 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000566 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000567 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
568 ptr++;
569 break;
570
571 case SRE_OP_ANY_ALL:
572 /* repeated dot wildcare. skip to the end of the target
573 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000574 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000575 ptr = end;
576 break;
577
578 case SRE_OP_LITERAL:
579 /* repeated literal */
580 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000581 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000582 while (ptr < end && (SRE_CODE) *ptr == chr)
583 ptr++;
584 break;
585
586 case SRE_OP_LITERAL_IGNORE:
587 /* repeated literal */
588 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000589 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000590 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
591 ptr++;
592 break;
593
594 case SRE_OP_NOT_LITERAL:
595 /* repeated non-literal */
596 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000597 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000598 while (ptr < end && (SRE_CODE) *ptr != chr)
599 ptr++;
600 break;
601
602 case SRE_OP_NOT_LITERAL_IGNORE:
603 /* repeated non-literal */
604 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000605 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000606 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
607 ptr++;
608 break;
609
610 case SRE_OP_IN:
611 /* repeated set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000612 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
613 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000614 ptr++;
615 break;
616
617 default:
618 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000619 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000620 while ((SRE_CHAR*) state->ptr < end) {
621 i = SRE_MATCH(state, pattern, level);
622 if (i < 0)
623 return i;
624 if (!i)
625 break;
626 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000627 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
628 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000629 return (SRE_CHAR*) state->ptr - ptr;
630 }
631
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000632 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000633 return ptr - (SRE_CHAR*) state->ptr;
634}
635
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    SRE_CHAR* end = state->end;
    SRE_CHAR* ptr = state->ptr;
    int i;

    /* check minimal length */
    if (pattern[3] && (end - ptr) < pattern[3])
        return 0;

    /* check known prefix */
    if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
        /* <length> <skip> <prefix data> <overlap data> */
        /* NOTE(review): ptr[i] is read without a separate bounds check;
           presumably the minimal-length test above covers the prefix
           length -- confirm before enabling this code */
        for (i = 0; i < pattern[5]; i++)
            if ((SRE_CODE) ptr[i] != pattern[7 + i])
                return 0;
        return pattern[0] + 2 * pattern[6];
    }
    return pattern[0];
}
#endif

664LOCAL(int)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000665SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
Guido van Rossumb700df92000-03-31 14:59:30 +0000666{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000667 /* check if string matches the given pattern. returns <0 for
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000668 error, 0 for failure, and 1 for success */
Guido van Rossumb700df92000-03-31 14:59:30 +0000669
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000670 SRE_CHAR* end = state->end;
671 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000672 int i, count;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000673 SRE_REPEAT* rp;
674 int lastmark;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000675 SRE_CODE chr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000676
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000677 SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000678
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000679 TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level));
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000680
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000681#if defined(USE_STACKCHECK)
Fredrik Lundh58100642000-08-09 09:14:35 +0000682 if (level % 10 == 0 && PyOS_CheckStack())
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000683 return SRE_ERROR_RECURSION_LIMIT;
684#endif
685
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000686#if defined(USE_RECURSION_LIMIT)
687 if (level > USE_RECURSION_LIMIT)
688 return SRE_ERROR_RECURSION_LIMIT;
689#endif
690
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000691 if (pattern[0] == SRE_OP_INFO) {
692 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000693 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000694 if (pattern[3] && (end - ptr) < pattern[3]) {
695 TRACE(("reject (got %d chars, need %d)\n",
696 (end - ptr), pattern[3]));
697 return 0;
698 }
699 pattern += pattern[1] + 1;
700 }
701
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000702 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000703
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000704 switch (*pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000705
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000706 case SRE_OP_FAILURE:
707 /* immediate failure */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000708 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000709 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000710
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000711 case SRE_OP_SUCCESS:
712 /* end of pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000713 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000714 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000715 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000716
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000717 case SRE_OP_AT:
718 /* match at given position */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000719 /* <AT> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000720 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000721 if (!SRE_AT(state, ptr, *pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000722 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000723 pattern++;
724 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000725
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000726 case SRE_OP_CATEGORY:
727 /* match at given category */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000728 /* <CATEGORY> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000729 TRACE(("|%p|%p|CATEGORY %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000730 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000731 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000732 pattern++;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000733 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000734 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000735
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000736 case SRE_OP_LITERAL:
737 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000738 /* <LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000739 TRACE(("|%p|%p|LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000740 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000741 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000742 pattern++;
743 ptr++;
744 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000745
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000746 case SRE_OP_NOT_LITERAL:
747 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000748 /* <NOT_LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000749 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000750 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000751 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000752 pattern++;
753 ptr++;
754 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000755
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000756 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000757 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000758 /* <ANY> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000759 TRACE(("|%p|%p|ANY\n", pattern, ptr));
Fredrik Lundhe1869832000-08-01 22:47:49 +0000760 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
761 return 0;
762 ptr++;
763 break;
764
765 case SRE_OP_ANY_ALL:
766 /* match anything */
767 /* <ANY_ALL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000768 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000769 if (ptr >= end)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000770 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000771 ptr++;
772 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000773
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000774 case SRE_OP_IN:
775 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000776 /* <IN> <skip> <set> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000777 TRACE(("|%p|%p|IN\n", pattern, ptr));
778 if (ptr >= end || !SRE_CHARSET(pattern + 1, *ptr))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000779 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000780 pattern += pattern[0];
781 ptr++;
782 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000783
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000784 case SRE_OP_GROUPREF:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000785 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000786 TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000787 i = pattern[0];
788 {
789 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
790 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
791 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000792 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000793 while (p < e) {
794 if (ptr >= end || *ptr != *p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000795 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000796 p++; ptr++;
797 }
798 }
799 pattern++;
800 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000801
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000802 case SRE_OP_GROUPREF_IGNORE:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000803 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000804 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000805 i = pattern[0];
806 {
807 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
808 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
809 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000810 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000811 while (p < e) {
812 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000813 state->lower(*ptr) != state->lower(*p))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000814 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000815 p++; ptr++;
816 }
817 }
818 pattern++;
819 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000820
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000821 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000822 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000823 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000824 state->lower(*ptr) != state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000825 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000826 pattern++;
827 ptr++;
828 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000829
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000830 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000831 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000832 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000833 state->lower(*ptr) == state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000834 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000835 pattern++;
836 ptr++;
837 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000839 case SRE_OP_IN_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000840 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000841 if (ptr >= end
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000842 || !SRE_CHARSET(pattern + 1, (SRE_CODE) state->lower(*ptr)))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000843 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000844 pattern += pattern[0];
845 ptr++;
846 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000847
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000848 case SRE_OP_MARK:
849 /* set mark */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000850 /* <MARK> <gid> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000851 TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000852 i = pattern[0];
853 if (i & 1)
854 state->lastindex = i/2 + 1;
855 if (i > state->lastmark)
856 state->lastmark = i;
857 state->mark[i] = ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000858 pattern++;
859 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000860
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 case SRE_OP_JUMP:
862 case SRE_OP_INFO:
863 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000864 /* <JUMP> <offset> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000865 TRACE(("|%p|%p|JUMP %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000866 pattern += pattern[0];
867 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000868
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000869 case SRE_OP_ASSERT:
870 /* assert subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000871 /* <ASSERT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000872 TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000873 state->ptr = ptr - pattern[1];
Fredrik Lundh6f013982000-07-03 18:44:21 +0000874 if (state->ptr < state->beginning)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000875 return 0;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000876 i = SRE_MATCH(state, pattern + 2, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000877 if (i <= 0)
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000878 return i;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000879 pattern += pattern[0];
880 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000881
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000882 case SRE_OP_ASSERT_NOT:
883 /* assert not subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000884 /* <ASSERT_NOT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000885 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000886 state->ptr = ptr - pattern[1];
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000887 if (state->ptr >= state->beginning) {
888 i = SRE_MATCH(state, pattern + 2, level + 1);
889 if (i < 0)
890 return i;
891 if (i)
892 return 0;
893 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000894 pattern += pattern[0];
895 break;
896
897 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000898 /* alternation */
899 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000900 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000901 lastmark = state->lastmark;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000902 for (; pattern[0]; pattern += pattern[0]) {
903 if (pattern[1] == SRE_OP_LITERAL &&
904 (ptr >= end || (SRE_CODE) *ptr != pattern[2]))
905 continue;
906 if (pattern[1] == SRE_OP_IN &&
907 (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr)))
908 continue;
909 state->ptr = ptr;
910 i = SRE_MATCH(state, pattern + 1, level + 1);
911 if (i)
912 return i;
913 if (state->lastmark > lastmark) {
914 memset(
915 state->mark + lastmark + 1, 0,
916 (state->lastmark - lastmark) * sizeof(void*)
917 );
918 state->lastmark = lastmark;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000919 }
920 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000921 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000922
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000923 case SRE_OP_REPEAT_ONE:
924 /* match repeated sequence (maximizing regexp) */
925
926 /* this operator only works if the repeated item is
927 exactly one character wide, and we're not already
928 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +0000929 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000930
931 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
932
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000933 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000934 pattern[1], pattern[2]));
935
Fredrik Lundhe1869832000-08-01 22:47:49 +0000936 if (ptr + pattern[1] > end)
937 return 0; /* cannot match */
938
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000939 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000940
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000941 count = SRE_COUNT(state, pattern + 3, pattern[2], level + 1);
942 if (count < 0)
943 return count;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000944
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000945 ptr += count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000946
947 /* when we arrive here, count contains the number of
948 matches, and ptr points to the tail of the target
949 string. check if the rest of the pattern matches,
950 and backtrack if not. */
951
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000952 if (count < (int) pattern[1])
953 return 0;
954
955 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
956 /* tail is empty. we're finished */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000957 state->ptr = ptr;
958 return 1;
959
960 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
961 /* tail starts with a literal. skip positions where
962 the rest of the pattern cannot possibly match */
Fredrik Lundhe1869832000-08-01 22:47:49 +0000963 chr = pattern[pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000964 for (;;) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000965 while (count >= (int) pattern[1] &&
966 (ptr >= end || *ptr != chr)) {
967 ptr--;
968 count--;
969 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000970 if (count < (int) pattern[1])
971 break;
972 state->ptr = ptr;
973 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000974 if (i)
Fredrik Lundh33accc12000-08-27 20:59:47 +0000975 return i;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000976 ptr--;
977 count--;
978 }
979
980 } else {
981 /* general case */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000982 lastmark = state->lastmark;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000983 while (count >= (int) pattern[1]) {
984 state->ptr = ptr;
985 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000986 if (i)
Fredrik Lundh33accc12000-08-27 20:59:47 +0000987 return i;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000988 ptr--;
989 count--;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000990 if (state->lastmark > lastmark) {
991 memset(
992 state->mark + lastmark + 1, 0,
993 (state->lastmark - lastmark) * sizeof(void*)
994 );
995 state->lastmark = lastmark;
996 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000997 }
998 }
999 return 0;
1000
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001001 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001002 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001003 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001004 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001005 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001006 pattern[1], pattern[2]));
1007
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001008 rep.count = -1;
1009 rep.pattern = pattern;
1010
1011 /* install new repeat context */
1012 rep.prev = state->repeat;
1013 state->repeat = &rep;
1014
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001015 state->ptr = ptr;
1016 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001017
1018 state->repeat = rep.prev;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001019
1020 return i;
1021
1022 case SRE_OP_MAX_UNTIL:
1023 /* maximizing repeat */
1024 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1025
1026 /* FIXME: we probably need to deal with zero-width
1027 matches in here... */
1028
1029 rp = state->repeat;
1030 if (!rp)
1031 return SRE_ERROR_STATE;
1032
1033 state->ptr = ptr;
1034
1035 count = rp->count + 1;
1036
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001037 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001038
1039 if (count < rp->pattern[1]) {
1040 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001041 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001042 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001043 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001044 if (i)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001045 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001046 rp->count = count - 1;
1047 state->ptr = ptr;
1048 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001049 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001050
1051 if (count < rp->pattern[2] || rp->pattern[2] == 65535) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001052 /* we may have enough matches, but if we can
1053 match another item, do so */
1054 rp->count = count;
1055 lastmark = state->lastmark;
Fredrik Lundh33accc12000-08-27 20:59:47 +00001056 i = mark_save(state, 0, lastmark);
1057 if (i < 0)
1058 return i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001059 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001061 if (i)
1062 return i;
Fredrik Lundh33accc12000-08-27 20:59:47 +00001063 i = mark_restore(state, 0, lastmark);
1064 if (i < 0)
1065 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001066 rp->count = count - 1;
1067 state->ptr = ptr;
1068 }
1069
1070 /* cannot match more repeated items here. make sure the
1071 tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001072 state->repeat = rp->prev;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001073 i = SRE_MATCH(state, pattern, level + 1);
1074 if (i)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001075 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001076 state->repeat = rp;
Fredrik Lundh770617b2001-01-14 15:06:11 +00001077 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001078 return 0;
1079
1080 case SRE_OP_MIN_UNTIL:
1081 /* minimizing repeat */
1082 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1083
1084 rp = state->repeat;
1085 if (!rp)
1086 return SRE_ERROR_STATE;
1087
1088 count = rp->count + 1;
1089
Fredrik Lundh770617b2001-01-14 15:06:11 +00001090 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count,
1091 rp->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001092
1093 state->ptr = ptr;
1094
1095 if (count < rp->pattern[1]) {
1096 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001097 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001098 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001099 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001100 if (i)
1101 return i;
1102 rp->count = count-1;
1103 state->ptr = ptr;
1104 return 0;
1105 }
1106
1107 /* see if the tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001108 state->repeat = rp->prev;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +00001109 /* FIXME: the following fix doesn't always work (#133283) */
Fredrik Lundhdf781e62001-07-02 19:54:28 +00001110 if (rp->pattern[2] == 65535) {
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001111 /* unbounded repeat */
1112 for (;;) {
1113 i = SRE_MATCH(state, pattern, level + 1);
1114 if (i || ptr >= end)
1115 break;
1116 state->ptr = ++ptr;
1117 }
1118 } else
1119 i = SRE_MATCH(state, pattern, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001120 if (i) {
1121 /* free(rp); */
1122 return i;
1123 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001124
Fredrik Lundh770617b2001-01-14 15:06:11 +00001125 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001126 state->repeat = rp;
1127
1128 if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
1129 return 0;
1130
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001131 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001132 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001133 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001134 if (i)
1135 return i;
1136 rp->count = count - 1;
Fredrik Lundh770617b2001-01-14 15:06:11 +00001137 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001138 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001139
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001140 default:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001141 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001142 return SRE_ERROR_ILLEGAL;
1143 }
1144 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001145
Sjoerd Mullender89dfe9e2001-08-30 14:37:07 +00001146 /* can't end up here */
Fredrik Lundh21009b92001-09-18 18:47:09 +00001147 /* return SRE_ERROR_ILLEGAL; -- see python-dev discussion */
Guido van Rossumb700df92000-03-31 14:59:30 +00001148}
1149
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001150LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001151SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1152{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001153 SRE_CHAR* ptr = state->start;
1154 SRE_CHAR* end = state->end;
1155 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001156 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001157 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001158 SRE_CODE* prefix = NULL;
1159 SRE_CODE* charset = NULL;
1160 SRE_CODE* overlap = NULL;
1161 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001162
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001163 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001164 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001165 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001166
1167 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001168
1169 if (pattern[3] > 0) {
1170 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001171 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001172 end -= pattern[3]-1;
1173 if (end <= ptr)
1174 end = ptr+1;
1175 }
1176
Fredrik Lundh3562f112000-07-02 12:00:07 +00001177 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001178 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001179 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001180 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001181 prefix_skip = pattern[6];
1182 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001183 overlap = prefix + prefix_len - 1;
1184 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001185 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001186 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001187 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001188
1189 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001190 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001191
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001192 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1193 TRACE(("charset = %p\n", charset));
1194
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001195#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001196 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001197 /* pattern starts with a known prefix. use the overlap
1198 table to skip forward as fast as we possibly can */
1199 int i = 0;
1200 end = state->end;
1201 while (ptr < end) {
1202 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001203 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001204 if (!i)
1205 break;
1206 else
1207 i = overlap[i];
1208 } else {
1209 if (++i == prefix_len) {
1210 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001211 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1212 state->start = ptr + 1 - prefix_len;
1213 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001214 if (flags & SRE_INFO_LITERAL)
1215 return 1; /* we got all of it */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001216 status = SRE_MATCH(state, pattern + 2*prefix_skip, 1);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001217 if (status != 0)
1218 return status;
1219 /* close but no cigar -- try again */
1220 i = overlap[i];
1221 }
1222 break;
1223 }
1224
1225 }
1226 ptr++;
1227 }
1228 return 0;
1229 }
1230#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001231
Fredrik Lundh3562f112000-07-02 12:00:07 +00001232 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001233 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001234 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001235 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001236 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001237 for (;;) {
1238 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1239 ptr++;
1240 if (ptr == end)
1241 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001242 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001243 state->start = ptr;
1244 state->ptr = ++ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001245 status = SRE_MATCH(state, pattern + 2, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001246 if (status != 0)
1247 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001248 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001249 } else if (charset) {
1250 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001251 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001252 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001253 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001254 ptr++;
1255 if (ptr == end)
1256 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001257 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001258 state->start = ptr;
1259 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001260 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001261 if (status != 0)
1262 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001263 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001264 }
1265 } else
1266 /* general case */
1267 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001268 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001269 state->start = state->ptr = ptr++;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001270 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001271 if (status != 0)
1272 break;
1273 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001274
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001275 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001276}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001277
Guido van Rossumb700df92000-03-31 14:59:30 +00001278
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001279#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001280
1281/* -------------------------------------------------------------------- */
1282/* factories and destructors */
1283
1284/* see sre.h for object declarations */
1285
1286staticforward PyTypeObject Pattern_Type;
1287staticforward PyTypeObject Match_Type;
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001288staticforward PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001289
1290static PyObject *
1291_compile(PyObject* self_, PyObject* args)
1292{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001293 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001294
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001295 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001296 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001297
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001298 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001299 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001300 PyObject* code;
1301 int groups = 0;
1302 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001303 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001304 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1305 &PyList_Type, &code, &groups,
1306 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001307 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001308
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001309 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001310
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001311 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001312 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001313 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001314
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001315 self->codesize = n;
1316
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001317 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001318 PyObject *o = PyList_GET_ITEM(code, i);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001319 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001320 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001321
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001322 if (PyErr_Occurred()) {
1323 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001324 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001325 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001326
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001327 Py_INCREF(pattern);
1328 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001329
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001330 self->flags = flags;
1331
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001332 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001333
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001334 Py_XINCREF(groupindex);
1335 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001336
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001337 Py_XINCREF(indexgroup);
1338 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001340 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001341}
1342
1343static PyObject *
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001344sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001345{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001346 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001347}
1348
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001349static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001350sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001351{
1352 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001353 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001354 return NULL;
1355 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001356 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001357 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001358#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001359 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001360#else
1361 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001362#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001363 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001364}
1365
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001366LOCAL(void)
1367state_reset(SRE_STATE* state)
1368{
1369 int i;
1370
1371 state->lastmark = 0;
1372
1373 /* FIXME: dynamic! */
1374 for (i = 0; i < SRE_MARK_SIZE; i++)
1375 state->mark[i] = NULL;
1376
1377 state->lastindex = -1;
1378
1379 state->repeat = NULL;
1380
1381 mark_fini(state);
1382}
1383
Guido van Rossumb700df92000-03-31 14:59:30 +00001384LOCAL(PyObject*)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001385state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1386 int start, int end)
Guido van Rossumb700df92000-03-31 14:59:30 +00001387{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001388 /* prepare state object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001389
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001390 PyBufferProcs *buffer;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001391 int size, bytes;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001392 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001393
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001394 memset(state, 0, sizeof(SRE_STATE));
1395
1396 state->lastindex = -1;
1397
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001398#if defined(HAVE_UNICODE)
1399 if (PyUnicode_Check(string)) {
1400 /* unicode strings doesn't always support the buffer interface */
1401 ptr = (void*) PyUnicode_AS_DATA(string);
1402 bytes = PyUnicode_GET_DATA_SIZE(string);
1403 size = PyUnicode_GET_SIZE(string);
1404 state->charsize = sizeof(Py_UNICODE);
1405
1406 } else {
1407#endif
1408
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001409 /* get pointer to string buffer */
1410 buffer = string->ob_type->tp_as_buffer;
1411 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1412 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001413 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001414 return NULL;
1415 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001416
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001417 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001418 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1419 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001420 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1421 return NULL;
1422 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001423
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001424 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001425#if PY_VERSION_HEX >= 0x01060000
1426 size = PyObject_Size(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001427#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001428 size = PyObject_Length(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001429#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001430
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001431 if (PyString_Check(string) || bytes == size)
1432 state->charsize = 1;
1433#if defined(HAVE_UNICODE)
1434 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
1435 state->charsize = sizeof(Py_UNICODE);
1436#endif
1437 else {
1438 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1439 return NULL;
1440 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001441
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001442#if defined(HAVE_UNICODE)
1443 }
1444#endif
1445
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001446 /* adjust boundaries */
1447 if (start < 0)
1448 start = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001449 else if (start > size)
1450 start = size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001451
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001452 if (end < 0)
1453 end = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001454 else if (end > size)
1455 end = size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001456
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001457 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001458
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001459 state->start = (void*) ((char*) ptr + start * state->charsize);
1460 state->end = (void*) ((char*) ptr + end * state->charsize);
1461
1462 Py_INCREF(string);
1463 state->string = string;
1464 state->pos = start;
1465 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001466
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001467 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001468 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001469 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001470#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001471 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001472#else
1473 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001474#endif
1475 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001476 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001477
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001478 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001479}
1480
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001481LOCAL(void)
1482state_fini(SRE_STATE* state)
1483{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001484 Py_XDECREF(state->string);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001485 mark_fini(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001486}
1487
1488LOCAL(PyObject*)
1489state_getslice(SRE_STATE* state, int index, PyObject* string)
1490{
Fredrik Lundh58100642000-08-09 09:14:35 +00001491 int i, j;
1492
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001493 index = (index - 1) * 2;
1494
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001495 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh58100642000-08-09 09:14:35 +00001496 i = j = 0;
1497 } else {
1498 i = ((char*)state->mark[index] - (char*)state->beginning) /
1499 state->charsize;
1500 j = ((char*)state->mark[index+1] - (char*)state->beginning) /
1501 state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001502 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001503
Fredrik Lundh58100642000-08-09 09:14:35 +00001504 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001505}
1506
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001507static void
1508pattern_error(int status)
1509{
1510 switch (status) {
1511 case SRE_ERROR_RECURSION_LIMIT:
1512 PyErr_SetString(
1513 PyExc_RuntimeError,
1514 "maximum recursion limit exceeded"
1515 );
1516 break;
1517 case SRE_ERROR_MEMORY:
1518 PyErr_NoMemory();
1519 break;
1520 default:
1521 /* other error codes indicate compiler/engine bugs */
1522 PyErr_SetString(
1523 PyExc_RuntimeError,
1524 "internal error in regular expression engine"
1525 );
1526 }
1527}
1528
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001529static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001530pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001531{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001532 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001533
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001534 MatchObject* match;
1535 int i, j;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001536 char* base;
1537 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001538
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001539 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001540
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001541 /* create match object (with room for extra group marks) */
1542 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001543 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001544 if (!match)
1545 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001546
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001547 Py_INCREF(pattern);
1548 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001549
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001550 Py_INCREF(state->string);
1551 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001552
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001553 match->regs = NULL;
1554 match->groups = pattern->groups+1;
1555
1556 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001557
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001558 base = (char*) state->beginning;
1559 n = state->charsize;
1560
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001561 match->mark[0] = ((char*) state->start - base) / n;
1562 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001563
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001564 for (i = j = 0; i < pattern->groups; i++, j+=2)
1565 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1566 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1567 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1568 } else
1569 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1570
1571 match->pos = state->pos;
1572 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001573
Fredrik Lundh6f013982000-07-03 18:44:21 +00001574 match->lastindex = state->lastindex;
1575
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001577
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001578 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001579
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001580 /* no match */
1581 Py_INCREF(Py_None);
1582 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001583
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001585
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001586 /* internal error */
1587 pattern_error(status);
1588 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001589}
1590
1591static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001592pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001593{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001594 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001595
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 ScannerObject* self;
1597
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001598 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 int start = 0;
1600 int end = INT_MAX;
1601 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1602 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001603
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001604 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001605 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001606 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001607 return NULL;
1608
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001609 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001610 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001611 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001612 return NULL;
1613 }
1614
1615 Py_INCREF(pattern);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001616 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001617
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001618 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001619}
1620
Guido van Rossumb700df92000-03-31 14:59:30 +00001621static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001622pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001623{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001624 Py_XDECREF(self->pattern);
1625 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001626 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001627 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001628}
1629
1630static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001631pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001632{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001633 SRE_STATE state;
1634 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001635
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001636 PyObject* string;
1637 int start = 0;
1638 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001639 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1640 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
1641 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001642 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001643
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001644 string = state_init(&state, self, string, start, end);
1645 if (!string)
1646 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001647
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001648 state.ptr = state.start;
1649
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001650 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1651
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001652 if (state.charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001653 status = sre_match(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001654 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001655#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001656 status = sre_umatch(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001657#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001658 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001659
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001660 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1661
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001662 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001663
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001664 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001665}
1666
1667static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001668pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001669{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001670 SRE_STATE state;
1671 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001672
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001673 PyObject* string;
1674 int start = 0;
1675 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001676 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1677 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
1678 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001679 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001680
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001681 string = state_init(&state, self, string, start, end);
1682 if (!string)
1683 return NULL;
1684
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001685 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1686
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001687 if (state.charsize == 1) {
1688 status = sre_search(&state, PatternObject_GetCode(self));
1689 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001690#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001691 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001692#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001693 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001694
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001695 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1696
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001697 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001698
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001699 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001700}
1701
1702static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001703call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001704{
1705 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001706 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001707 PyObject* func;
1708 PyObject* result;
1709
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001710 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001711 if (!name)
1712 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001713 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001714 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001715 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001716 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001717 func = PyObject_GetAttrString(mod, function);
1718 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001719 if (!func)
1720 return NULL;
1721 result = PyObject_CallObject(func, args);
1722 Py_DECREF(func);
1723 Py_DECREF(args);
1724 return result;
1725}
1726
#ifdef USE_BUILTIN_COPY
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* replace *object, in place, with the result of
       copy.deepcopy(*object, memo).  returns 1 on success; returns 0
       with an exception set on failure, leaving *object untouched. */

    PyObject* copy;
    PyObject* args;

    /* an unset (NULL) slot has nothing to copy */
    if (!*object)
        return 1;

    /* check the argument tuple here instead of passing a possible NULL
       on to call() */
    args = Py_BuildValue("OO", *object, memo);
    if (!args)
        return 0;

    copy = call("copy", "deepcopy", args);
    if (!copy)
        return 0;

    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
1746
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001747static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001748pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001749{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001750 PyObject* template;
1751 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001752 PyObject* count = Py_False; /* zero */
Fredrik Lundh562586e2000-10-03 20:43:34 +00001753 static char* kwlist[] = { "repl", "string", "count", NULL };
1754 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:sub", kwlist,
1755 &template, &string, &count))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001756 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001757
1758 /* delegate to Python code */
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001759 return call(
1760 SRE_MODULE, "_sub",
1761 Py_BuildValue("OOOO", self, template, string, count)
1762 );
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001763}
1764
1765static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001766pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001767{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001768 PyObject* template;
1769 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001770 PyObject* count = Py_False; /* zero */
Fredrik Lundh562586e2000-10-03 20:43:34 +00001771 static char* kwlist[] = { "repl", "string", "count", NULL };
1772 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:subn", kwlist,
1773 &template, &string, &count))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001774 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001775
1776 /* delegate to Python code */
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001777 return call(
1778 SRE_MODULE, "_subn",
1779 Py_BuildValue("OOOO", self, template, string, count)
1780 );
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001781}
1782
1783static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001784pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001785{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001786 PyObject* string;
Fredrik Lundh28552902000-07-05 21:14:16 +00001787 PyObject* maxsplit = Py_False; /* zero */
Fredrik Lundh562586e2000-10-03 20:43:34 +00001788 static char* kwlist[] = { "source", "maxsplit", NULL };
1789 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|O:split", kwlist,
1790 &string, &maxsplit))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001791 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001792
1793 /* delegate to Python code */
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001794 return call(
1795 SRE_MODULE, "_split",
1796 Py_BuildValue("OOO", self, string, maxsplit)
1797 );
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001798}
1799
1800static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001801pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001802{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001803 SRE_STATE state;
1804 PyObject* list;
1805 int status;
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001806 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001807
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001808 PyObject* string;
1809 int start = 0;
1810 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001811 static char* kwlist[] = { "source", "pos", "endpos", NULL };
1812 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
1813 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001814 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001815
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001816 string = state_init(&state, self, string, start, end);
1817 if (!string)
1818 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001819
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001820 list = PyList_New(0);
Guido van Rossumb700df92000-03-31 14:59:30 +00001821
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001822 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001823
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001824 PyObject* item;
1825
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001826 state_reset(&state);
1827
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001828 state.ptr = state.start;
1829
1830 if (state.charsize == 1) {
1831 status = sre_search(&state, PatternObject_GetCode(self));
1832 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001833#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001834 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001835#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001836 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001837
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001838 if (status > 0) {
Guido van Rossumb700df92000-03-31 14:59:30 +00001839
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001840 /* don't bother to build a match object */
1841 switch (self->groups) {
1842 case 0:
1843 item = PySequence_GetSlice(
1844 string,
1845 ((char*) state.start - (char*) state.beginning) /
1846 state.charsize,
1847 ((char*) state.ptr - (char*) state.beginning) /
1848 state.charsize);
1849 if (!item)
1850 goto error;
1851 break;
1852 case 1:
1853 item = state_getslice(&state, 1, string);
1854 if (!item)
1855 goto error;
1856 break;
1857 default:
1858 item = PyTuple_New(self->groups);
1859 if (!item)
1860 goto error;
1861 for (i = 0; i < self->groups; i++) {
1862 PyObject* o = state_getslice(&state, i+1, string);
1863 if (!o) {
1864 Py_DECREF(item);
1865 goto error;
1866 }
1867 PyTuple_SET_ITEM(item, i, o);
1868 }
1869 break;
1870 }
1871
Fredrik Lundhe67d8e52000-08-27 21:32:46 +00001872 status = PyList_Append(list, item);
1873 Py_DECREF(item);
1874
1875 if (status < 0)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001876 goto error;
Guido van Rossumb700df92000-03-31 14:59:30 +00001877
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001878 if (state.ptr == state.start)
1879 state.start = (void*) ((char*) state.ptr + state.charsize);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001880 else
1881 state.start = state.ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001882
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001883 } else {
Guido van Rossumb700df92000-03-31 14:59:30 +00001884
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001885 if (status == 0)
1886 break;
1887
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001888 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001889 goto error;
Guido van Rossumb700df92000-03-31 14:59:30 +00001890
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001891 }
1892 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001893
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001894 state_fini(&state);
1895 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00001896
1897error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001898 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001899 state_fini(&state);
1900 return NULL;
1901
Guido van Rossumb700df92000-03-31 14:59:30 +00001902}
1903
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001904static PyObject*
1905pattern_copy(PatternObject* self, PyObject* args)
1906{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001907#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001908 PatternObject* copy;
1909 int offset;
1910
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001911 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
1912 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001913
1914 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
1915 if (!copy)
1916 return NULL;
1917
1918 offset = offsetof(PatternObject, groups);
1919
1920 Py_XINCREF(self->groupindex);
1921 Py_XINCREF(self->indexgroup);
1922 Py_XINCREF(self->pattern);
1923
1924 memcpy((char*) copy + offset, (char*) self + offset,
1925 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
1926
1927 return (PyObject*) copy;
1928#else
1929 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
1930 return NULL;
1931#endif
1932}
1933
1934static PyObject*
1935pattern_deepcopy(PatternObject* self, PyObject* args)
1936{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001937#ifdef USE_BUILTIN_COPY
1938 PatternObject* copy;
1939
1940 PyObject* memo;
1941 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
1942 return NULL;
1943
1944 copy = (PatternObject*) pattern_copy(self, Py_None);
1945 if (!copy)
1946 return NULL;
1947
1948 if (!deepcopy(&copy->groupindex, memo) ||
1949 !deepcopy(&copy->indexgroup, memo) ||
1950 !deepcopy(&copy->pattern, memo)) {
1951 Py_DECREF(copy);
1952 return NULL;
1953 }
1954
1955#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001956 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
1957 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001958#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001959}
1960
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001961static PyObject*
1962pattern_isliteral(PatternObject* self, PyObject* args)
1963{
1964 /* internal: return true if pattern consists of literal text only */
1965
1966 SRE_CODE* code;
1967 PyObject* isliteral;
1968
1969 if (!PyArg_ParseTuple(args, ":_isliteral"))
1970 return NULL;
1971
1972 code = PatternObject_GetCode(self);
1973
1974 if (code[0] == SRE_OP_INFO && code[2] & SRE_INFO_LITERAL)
1975 isliteral = Py_True;
1976 else
1977 isliteral = Py_False;
1978
1979 Py_INCREF(isliteral);
1980 return isliteral;
1981}
1982
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001983static PyMethodDef pattern_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00001984 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
1985 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
1986 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
1987 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
1988 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
1989 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
Fredrik Lundh562586e2000-10-03 20:43:34 +00001990 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001991 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
1992 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh2d96f112001-07-08 13:26:57 +00001993 {"_isliteral", (PyCFunction) pattern_isliteral, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001994 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00001995};
1996
1997static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001998pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00001999{
2000 PyObject* res;
2001
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002002 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002003
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002004 if (res)
2005 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002006
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002007 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002008
2009 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002010 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002011 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002012 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002013 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002014
2015 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002016 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002017
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002018 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002019 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002020
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002021 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002022 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002023 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002024 }
2025
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002026 PyErr_SetString(PyExc_AttributeError, name);
2027 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002028}
2029
2030statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002031 PyObject_HEAD_INIT(NULL)
2032 0, "SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002033 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002034 (destructor)pattern_dealloc, /*tp_dealloc*/
2035 0, /*tp_print*/
2036 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002037};
2038
2039/* -------------------------------------------------------------------- */
2040/* match methods */
2041
2042static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002043match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002044{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002045 Py_XDECREF(self->regs);
2046 Py_XDECREF(self->string);
2047 Py_DECREF(self->pattern);
2048 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002049}
2050
2051static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002052match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002053{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002054 if (index < 0 || index >= self->groups) {
2055 /* raise IndexError if we were given a bad group number */
2056 PyErr_SetString(
2057 PyExc_IndexError,
2058 "no such group"
2059 );
2060 return NULL;
2061 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002062
Fredrik Lundh6f013982000-07-03 18:44:21 +00002063 index *= 2;
2064
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002065 if (self->string == Py_None || self->mark[index] < 0) {
2066 /* return default value if the string or group is undefined */
2067 Py_INCREF(def);
2068 return def;
2069 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002070
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002071 return PySequence_GetSlice(
2072 self->string, self->mark[index], self->mark[index+1]
2073 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002074}
2075
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002076static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002077match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002078{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002079 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002080
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002081 if (PyInt_Check(index))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002082 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002083
Fredrik Lundh6f013982000-07-03 18:44:21 +00002084 i = -1;
2085
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002086 if (self->pattern->groupindex) {
2087 index = PyObject_GetItem(self->pattern->groupindex, index);
2088 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002089 if (PyInt_Check(index))
2090 i = (int) PyInt_AS_LONG(index);
2091 Py_DECREF(index);
2092 } else
2093 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002094 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002095
2096 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002097}
2098
2099static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002100match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002101{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002102 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002103}
2104
2105static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002106match_expand(MatchObject* self, PyObject* args)
2107{
2108 PyObject* template;
2109 if (!PyArg_ParseTuple(args, "O:expand", &template))
2110 return NULL;
2111
2112 /* delegate to Python code */
2113 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002114 SRE_MODULE, "_expand",
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002115 Py_BuildValue("OOO", self->pattern, self, template)
2116 );
2117}
2118
2119static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002120match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002121{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002122 PyObject* result;
2123 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002124
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002125 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002126
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002127 switch (size) {
2128 case 0:
2129 result = match_getslice(self, Py_False, Py_None);
2130 break;
2131 case 1:
2132 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2133 break;
2134 default:
2135 /* fetch multiple items */
2136 result = PyTuple_New(size);
2137 if (!result)
2138 return NULL;
2139 for (i = 0; i < size; i++) {
2140 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002141 self, PyTuple_GET_ITEM(args, i), Py_None
2142 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002143 if (!item) {
2144 Py_DECREF(result);
2145 return NULL;
2146 }
2147 PyTuple_SET_ITEM(result, i, item);
2148 }
2149 break;
2150 }
2151 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002152}
2153
2154static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002155match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002156{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002157 PyObject* result;
2158 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002159
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002160 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002161 static char* kwlist[] = { "default", NULL };
2162 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002163 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002164
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002165 result = PyTuple_New(self->groups-1);
2166 if (!result)
2167 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002168
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002169 for (index = 1; index < self->groups; index++) {
2170 PyObject* item;
2171 item = match_getslice_by_index(self, index, def);
2172 if (!item) {
2173 Py_DECREF(result);
2174 return NULL;
2175 }
2176 PyTuple_SET_ITEM(result, index-1, item);
2177 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002178
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002179 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002180}
2181
2182static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002183match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002184{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002185 PyObject* result;
2186 PyObject* keys;
2187 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002188
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002189 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002190 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002191 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002192 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002193
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002194 result = PyDict_New();
2195 if (!result || !self->pattern->groupindex)
2196 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002197
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002198 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002199 if (!keys)
2200 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002201
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002202 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002203 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002204 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002205 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002206 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002207 if (!key)
2208 goto failed;
2209 value = match_getslice(self, key, def);
2210 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002211 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002212 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002213 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002214 status = PyDict_SetItem(result, key, value);
2215 Py_DECREF(value);
2216 if (status < 0)
2217 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002218 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002219
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002220 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002221
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002222 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002223
2224failed:
2225 Py_DECREF(keys);
2226 Py_DECREF(result);
2227 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002228}
2229
2230static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002231match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002232{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002233 int index;
2234
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002235 PyObject* index_ = Py_False; /* zero */
2236 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2237 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002238
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002239 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002240
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002241 if (index < 0 || index >= self->groups) {
2242 PyErr_SetString(
2243 PyExc_IndexError,
2244 "no such group"
2245 );
2246 return NULL;
2247 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002248
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002249 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002250 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002251}
2252
2253static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002254match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002255{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002256 int index;
2257
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002258 PyObject* index_ = Py_False; /* zero */
2259 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2260 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002261
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002262 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002263
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002264 if (index < 0 || index >= self->groups) {
2265 PyErr_SetString(
2266 PyExc_IndexError,
2267 "no such group"
2268 );
2269 return NULL;
2270 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002271
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002272 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002273 return Py_BuildValue("i", self->mark[index*2+1]);
2274}
2275
2276LOCAL(PyObject*)
2277_pair(int i1, int i2)
2278{
2279 PyObject* pair;
2280 PyObject* item;
2281
2282 pair = PyTuple_New(2);
2283 if (!pair)
2284 return NULL;
2285
2286 item = PyInt_FromLong(i1);
2287 if (!item)
2288 goto error;
2289 PyTuple_SET_ITEM(pair, 0, item);
2290
2291 item = PyInt_FromLong(i2);
2292 if (!item)
2293 goto error;
2294 PyTuple_SET_ITEM(pair, 1, item);
2295
2296 return pair;
2297
2298 error:
2299 Py_DECREF(pair);
2300 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002301}
2302
2303static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002304match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002305{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002306 int index;
2307
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002308 PyObject* index_ = Py_False; /* zero */
2309 if (!PyArg_ParseTuple(args, "|O:span", &index_))
2310 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002311
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002312 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002313
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002314 if (index < 0 || index >= self->groups) {
2315 PyErr_SetString(
2316 PyExc_IndexError,
2317 "no such group"
2318 );
2319 return NULL;
2320 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002321
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002322 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002323 return _pair(self->mark[index*2], self->mark[index*2+1]);
2324}
2325
2326static PyObject*
2327match_regs(MatchObject* self)
2328{
2329 PyObject* regs;
2330 PyObject* item;
2331 int index;
2332
2333 regs = PyTuple_New(self->groups);
2334 if (!regs)
2335 return NULL;
2336
2337 for (index = 0; index < self->groups; index++) {
2338 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2339 if (!item) {
2340 Py_DECREF(regs);
2341 return NULL;
2342 }
2343 PyTuple_SET_ITEM(regs, index, item);
2344 }
2345
2346 Py_INCREF(regs);
2347 self->regs = regs;
2348
2349 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002350}
2351
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002352static PyObject*
2353match_copy(MatchObject* self, PyObject* args)
2354{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002355#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002356 MatchObject* copy;
2357 int slots, offset;
2358
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002359 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2360 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002361
2362 slots = 2 * (self->pattern->groups+1);
2363
2364 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2365 if (!copy)
2366 return NULL;
2367
2368 /* this value a constant, but any compiler should be able to
2369 figure that out all by itself */
2370 offset = offsetof(MatchObject, string);
2371
2372 Py_XINCREF(self->pattern);
2373 Py_XINCREF(self->string);
2374 Py_XINCREF(self->regs);
2375
2376 memcpy((char*) copy + offset, (char*) self + offset,
2377 sizeof(MatchObject) + slots * sizeof(int) - offset);
2378
2379 return (PyObject*) copy;
2380#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002381 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002382 return NULL;
2383#endif
2384}
2385
2386static PyObject*
2387match_deepcopy(MatchObject* self, PyObject* args)
2388{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002389#ifdef USE_BUILTIN_COPY
2390 MatchObject* copy;
2391
2392 PyObject* memo;
2393 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2394 return NULL;
2395
2396 copy = (MatchObject*) match_copy(self, Py_None);
2397 if (!copy)
2398 return NULL;
2399
2400 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2401 !deepcopy(&copy->string, memo) ||
2402 !deepcopy(&copy->regs, memo)) {
2403 Py_DECREF(copy);
2404 return NULL;
2405 }
2406
2407#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002408 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2409 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002410#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002411}
2412
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002413static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002414 {"group", (PyCFunction) match_group, METH_VARARGS},
2415 {"start", (PyCFunction) match_start, METH_VARARGS},
2416 {"end", (PyCFunction) match_end, METH_VARARGS},
2417 {"span", (PyCFunction) match_span, METH_VARARGS},
2418 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
2419 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
2420 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002421 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
2422 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002423 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002424};
2425
2426static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002427match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002428{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002429 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002430
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002431 res = Py_FindMethod(match_methods, (PyObject*) self, name);
2432 if (res)
2433 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002434
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002435 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002436
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002437 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002438 if (self->lastindex >= 0)
2439 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00002440 Py_INCREF(Py_None);
2441 return Py_None;
2442 }
2443
2444 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002445 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00002446 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00002447 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00002448 );
2449 if (result)
2450 return result;
2451 PyErr_Clear();
2452 }
2453 Py_INCREF(Py_None);
2454 return Py_None;
2455 }
2456
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002457 if (!strcmp(name, "string")) {
2458 if (self->string) {
2459 Py_INCREF(self->string);
2460 return self->string;
2461 } else {
2462 Py_INCREF(Py_None);
2463 return Py_None;
2464 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002465 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002466
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002467 if (!strcmp(name, "regs")) {
2468 if (self->regs) {
2469 Py_INCREF(self->regs);
2470 return self->regs;
2471 } else
2472 return match_regs(self);
2473 }
2474
2475 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002476 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002477 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002478 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002479
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002480 if (!strcmp(name, "pos"))
2481 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002482
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002483 if (!strcmp(name, "endpos"))
2484 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00002485
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002486 PyErr_SetString(PyExc_AttributeError, name);
2487 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002488}
2489
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2492
2493statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002494 PyObject_HEAD_INIT(NULL)
2495 0, "SRE_Match",
2496 sizeof(MatchObject), sizeof(int),
2497 (destructor)match_dealloc, /*tp_dealloc*/
2498 0, /*tp_print*/
2499 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002500};
2501
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002502/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002503/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002504
2505static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002506scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002507{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002508 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002509 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002510 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002511}
2512
2513static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002514scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002515{
2516 SRE_STATE* state = &self->state;
2517 PyObject* match;
2518 int status;
2519
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002520 state_reset(state);
2521
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002522 state->ptr = state->start;
2523
2524 if (state->charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00002525 status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002526 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002527#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00002528 status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002529#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002530 }
2531
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002532 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002533 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002534
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002535 if (status == 0 || state->ptr == state->start)
2536 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002537 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002538 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002539
2540 return match;
2541}
2542
2543
2544static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002545scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002546{
2547 SRE_STATE* state = &self->state;
2548 PyObject* match;
2549 int status;
2550
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002551 state_reset(state);
2552
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002553 state->ptr = state->start;
2554
2555 if (state->charsize == 1) {
2556 status = sre_search(state, PatternObject_GetCode(self->pattern));
2557 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002558#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002559 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002560#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002561 }
2562
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002563 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002564 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002565
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002566 if (status == 0 || state->ptr == state->start)
2567 state->start = (void*) ((char*) state->ptr + state->charsize);
2568 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002569 state->start = state->ptr;
2570
2571 return match;
2572}
2573
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002574static PyMethodDef scanner_methods[] = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002575 {"match", (PyCFunction) scanner_match, 0},
2576 {"search", (PyCFunction) scanner_search, 0},
2577 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002578};
2579
2580static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002581scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002582{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002583 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002584
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002585 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
2586 if (res)
2587 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002588
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002589 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002590
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002591 /* attributes */
2592 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002593 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002594 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002595 }
2596
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002597 PyErr_SetString(PyExc_AttributeError, name);
2598 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002599}
2600
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002601statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002602 PyObject_HEAD_INIT(NULL)
2603 0, "SRE_Scanner",
2604 sizeof(ScannerObject), 0,
2605 (destructor)scanner_dealloc, /*tp_dealloc*/
2606 0, /*tp_print*/
2607 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002608};
2609
Guido van Rossumb700df92000-03-31 14:59:30 +00002610static PyMethodDef _functions[] = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002611 {"compile", _compile, 1},
2612 {"getcodesize", sre_codesize, 1},
2613 {"getlower", sre_getlower, 1},
2614 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002615};
2616
Tim Peters5687ffe2001-02-28 16:44:18 +00002617DL_EXPORT(void)
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00002618init_sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002619{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002620 PyObject* m;
2621 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00002622 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002623
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002624 /* Patch object types */
2625 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002626 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00002627
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00002628 m = Py_InitModule("_" SRE_MODULE, _functions);
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002629 d = PyModule_GetDict(m);
2630
Fredrik Lundh21009b92001-09-18 18:47:09 +00002631 x = PyInt_FromLong(SRE_MAGIC);
2632 if (x) {
2633 PyDict_SetItemString(d, "MAGIC", x);
2634 Py_DECREF(x);
2635 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002636
Fredrik Lundh21009b92001-09-18 18:47:09 +00002637 x = PyString_FromString(copyright);
2638 if (x) {
2639 PyDict_SetItemString(d, "copyright", x);
2640 Py_DECREF(x);
2641 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002642}
2643
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002644#endif /* !defined(SRE_RECURSIVE) */