blob: a8a97748b197372a18d4222f2c25cedc2ec5dca9 [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl  created (based on existing template matcher code)
 * 2000-03-06 fl  first alpha, sort of
 * 2000-08-01 fl  fixes for 1.6b1
 * 2000-08-07 fl  use PyOS_CheckStack() if available
 * 2000-09-20 fl  added expand method
 * 2001-03-20 fl  lots of fixes for 2.1b2
 * 2001-04-15 fl  export copyright as Python attribute, not global
 * 2001-04-28 fl  added __copy__ methods (work in progress)
 * 2001-05-14 fl  fixes for 1.5.2 compatibility
 * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl  added sub/subn primitive
 * 2001-10-24 fl  added finditer primitive (for 2.2 only)
 * 2001-12-07 fl  fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl  fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 *
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license.  For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
35
36#ifndef SRE_RECURSIVE
37
/* version/copyright banner for this engine */
static char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000040
41#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000042#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000043
44#include "sre.h"
45
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000046#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000047
/* name of this module, minus the leading underscore */
#if !defined(SRE_MODULE)
#define SRE_MODULE "sre"
#endif

/* defining this one enables tracing */
#undef VERBOSE

#if PY_VERSION_HEX >= 0x01060000
#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
/* defining this enables unicode support (default under 1.6a1 and later) */
#define HAVE_UNICODE
#endif
#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062
/* -------------------------------------------------------------------- */
/* optional features */

/* prevent run-away recursion (bad patterns on long strings) */

/* USE_RECURSION_LIMIT is only defined when USE_STACKCHECK is not; exactly
   one of the three branches below supplies its value. */
#if !defined(USE_STACKCHECK)
#if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
/* require smaller recursion limit for a number of 64-bit platforms:
   Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
/* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
#define USE_RECURSION_LIMIT 7500
#elif defined(__GNUC__) && defined(WITH_THREAD) && defined(__FreeBSD__)
/* the pthreads library on FreeBSD has a fixed 1MB stack size for the
 * initial (or "primary") thread, which is insufficient for the default
 * recursion limit.  gcc 3.x at the default optimisation
 * level (-O3) uses stack space more aggressively than gcc 2.95.
 */
#if (__GNUC__ > 2)
#define USE_RECURSION_LIMIT 6500
#else
#define USE_RECURSION_LIMIT 7500
#endif
#else
#define USE_RECURSION_LIMIT 10000
#endif
#endif
Fredrik Lundh96ab4652000-08-03 16:29:50 +000093
/* enables fast searching */
#define USE_FAST_SEARCH

/* enables aggressive inlining (always on for Visual C) */
#undef USE_INLINE

/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY

/* interpreters older than 1.6 get PyObject_DEL mapped onto PyMem_DEL */
#if PY_VERSION_HEX < 0x01060000
#define PyObject_DEL(op) PyMem_DEL((op))
#endif
106
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000107/* -------------------------------------------------------------------- */
108
/* LOCAL(type) spells the storage class / calling convention used for the
   engine's file-local helper functions */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif
119
/* error codes (all negative; parenthesized so they stay a single operand
   wherever the macro is expanded) */
#define SRE_ERROR_ILLEGAL (-1) /* illegal opcode */
#define SRE_ERROR_STATE (-2) /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT (-3) /* runaway recursion */
#define SRE_ERROR_MEMORY (-9) /* out of memory */
125
/* TRACE((fmt, ...)) prints via printf when VERBOSE is defined and expands
   to nothing otherwise; note the doubled parentheses at the call site */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
131
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000132/* -------------------------------------------------------------------- */
133/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000134
/* default character predicates (run sre_chars.py to regenerate tables) */

/* bit flags stored per character in sre_char_info[] */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16
142
/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* per-character category flags for codes 0..127; each entry is an OR of
   the SRE_*_MASK bits (e.g. 25 = digit|alnum|word, 24 = alnum|word,
   6 = space|linebreak).  The table is only ever read, hence const. */
static const char sre_char_info[128] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};
152
/* lower-case mapping for codes 0..127: 'A'..'Z' (65..90) map to
   'a'..'z' (97..122), every other code maps to itself.  The table is only
   ever read, hence const. */
static const char sre_char_lower[128] = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95,
    96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127
};
162
/* table-driven predicates: codes below 128 are looked up in
   sre_char_info[], anything at or above 128 is never a member.  The result
   is the masked flag value (nonzero when set), not necessarily 1.  Each
   macro evaluates its argument twice. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000173
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000174static unsigned int sre_lower(unsigned int ch)
175{
176 return ((ch) < 128 ? sre_char_lower[ch] : ch);
177}
178
/* locale-specific character predicates */

/* codes below 256 are delegated to the <ctype.h> classifiers; larger codes
   are never members.  A line break is '\n' only, and a word character is
   an alphanumeric or '_'. */
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
186
/* Locale-aware lower-casing: codes below 256 go through tolower() from
   <ctype.h>; any other code is returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256)
        return (unsigned int) tolower((int) ch);
    return ch;
}
191
/* unicode-specific character predicates */

#if defined(HAVE_UNICODE)

/* each predicate narrows its argument to Py_UNICODE and defers to the
   matching Py_UNICODE_* classifier; a word character is an alphanumeric
   or '_' */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

/* Unicode lower-casing via Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}

#endif
208
Guido van Rossumb700df92000-03-31 14:59:30 +0000209LOCAL(int)
210sre_category(SRE_CODE category, unsigned int ch)
211{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000212 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000213
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000214 case SRE_CATEGORY_DIGIT:
215 return SRE_IS_DIGIT(ch);
216 case SRE_CATEGORY_NOT_DIGIT:
217 return !SRE_IS_DIGIT(ch);
218 case SRE_CATEGORY_SPACE:
219 return SRE_IS_SPACE(ch);
220 case SRE_CATEGORY_NOT_SPACE:
221 return !SRE_IS_SPACE(ch);
222 case SRE_CATEGORY_WORD:
223 return SRE_IS_WORD(ch);
224 case SRE_CATEGORY_NOT_WORD:
225 return !SRE_IS_WORD(ch);
226 case SRE_CATEGORY_LINEBREAK:
227 return SRE_IS_LINEBREAK(ch);
228 case SRE_CATEGORY_NOT_LINEBREAK:
229 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000230
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000231 case SRE_CATEGORY_LOC_WORD:
232 return SRE_LOC_IS_WORD(ch);
233 case SRE_CATEGORY_LOC_NOT_WORD:
234 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000235
236#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000237 case SRE_CATEGORY_UNI_DIGIT:
238 return SRE_UNI_IS_DIGIT(ch);
239 case SRE_CATEGORY_UNI_NOT_DIGIT:
240 return !SRE_UNI_IS_DIGIT(ch);
241 case SRE_CATEGORY_UNI_SPACE:
242 return SRE_UNI_IS_SPACE(ch);
243 case SRE_CATEGORY_UNI_NOT_SPACE:
244 return !SRE_UNI_IS_SPACE(ch);
245 case SRE_CATEGORY_UNI_WORD:
246 return SRE_UNI_IS_WORD(ch);
247 case SRE_CATEGORY_UNI_NOT_WORD:
248 return !SRE_UNI_IS_WORD(ch);
249 case SRE_CATEGORY_UNI_LINEBREAK:
250 return SRE_UNI_IS_LINEBREAK(ch);
251 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
252 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000253#else
254 case SRE_CATEGORY_UNI_DIGIT:
255 return SRE_IS_DIGIT(ch);
256 case SRE_CATEGORY_UNI_NOT_DIGIT:
257 return !SRE_IS_DIGIT(ch);
258 case SRE_CATEGORY_UNI_SPACE:
259 return SRE_IS_SPACE(ch);
260 case SRE_CATEGORY_UNI_NOT_SPACE:
261 return !SRE_IS_SPACE(ch);
262 case SRE_CATEGORY_UNI_WORD:
263 return SRE_LOC_IS_WORD(ch);
264 case SRE_CATEGORY_UNI_NOT_WORD:
265 return !SRE_LOC_IS_WORD(ch);
266 case SRE_CATEGORY_UNI_LINEBREAK:
267 return SRE_IS_LINEBREAK(ch);
268 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
269 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000270#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000271 }
272 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000273}
274
275/* helpers */
276
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000277static void
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000278mark_fini(SRE_STATE* state)
279{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000280 if (state->mark_stack) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000281 free(state->mark_stack);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000282 state->mark_stack = NULL;
283 }
284 state->mark_stack_size = state->mark_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000285}
286
287static int
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000288mark_save(SRE_STATE* state, int lo, int hi, int *mark_stack_base)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000289{
290 void* stack;
291 int size;
292 int minsize, newsize;
293
294 if (hi <= lo)
295 return 0;
296
297 size = (hi - lo) + 1;
298
299 newsize = state->mark_stack_size;
300 minsize = state->mark_stack_base + size;
301
302 if (newsize < minsize) {
303 /* create new stack */
304 if (!newsize) {
305 newsize = 512;
306 if (newsize < minsize)
307 newsize = minsize;
308 TRACE(("allocate stack %d\n", newsize));
309 stack = malloc(sizeof(void*) * newsize);
310 } else {
311 /* grow the stack */
312 while (newsize < minsize)
313 newsize += newsize;
314 TRACE(("grow stack to %d\n", newsize));
315 stack = realloc(state->mark_stack, sizeof(void*) * newsize);
316 }
317 if (!stack) {
318 mark_fini(state);
319 return SRE_ERROR_MEMORY;
320 }
321 state->mark_stack = stack;
322 state->mark_stack_size = newsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000323 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000324
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000325 TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000326
327 memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
328 size * sizeof(void*));
329
330 state->mark_stack_base += size;
331
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000332 *mark_stack_base = state->mark_stack_base;
333
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000334 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000335}
336
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000337static int
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000338mark_restore(SRE_STATE* state, int lo, int hi, int *mark_stack_base)
Guido van Rossumb700df92000-03-31 14:59:30 +0000339{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000340 int size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000341
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000342 if (hi <= lo)
343 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000344
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000345 size = (hi - lo) + 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000346
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000347 state->mark_stack_base = *mark_stack_base - size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000348
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000349 TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
Guido van Rossumb700df92000-03-31 14:59:30 +0000350
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000351 memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
352 size * sizeof(void*));
Guido van Rossumb700df92000-03-31 14:59:30 +0000353
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000354 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000355}
356
/* generate 8-bit version */

#define SRE_CHAR unsigned char
#define SRE_AT sre_at
#define SRE_COUNT sre_count
#define SRE_CHARSET sre_charset
#define SRE_INFO sre_info
#define SRE_MATCH sre_match
#define SRE_SEARCH sre_search
#define SRE_LITERAL_TEMPLATE sre_literal_template

#if defined(HAVE_UNICODE)

/* pull in the engine section of this same file once with the 8-bit
   names above; SRE_RECURSIVE makes the nested pass skip the preamble */
#define SRE_RECURSIVE
#include "_sre.c"
#undef SRE_RECURSIVE

#undef SRE_LITERAL_TEMPLATE
#undef SRE_SEARCH
#undef SRE_MATCH
#undef SRE_INFO
#undef SRE_CHARSET
#undef SRE_COUNT
#undef SRE_AT
#undef SRE_CHAR

/* generate unicode (Py_UNICODE) version */

#define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat
#define SRE_COUNT sre_ucount
#define SRE_CHARSET sre_ucharset
#define SRE_INFO sre_uinfo
#define SRE_MATCH sre_umatch
#define SRE_SEARCH sre_usearch
#define SRE_LITERAL_TEMPLATE sre_uliteral_template
#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000394
395#endif /* SRE_RECURSIVE */
396
/* -------------------------------------------------------------------- */
/* String matching engine */

/* the following section is compiled twice, with different character
   settings */
402
403LOCAL(int)
404SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
405{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000406 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000407
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000408 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000409
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000410 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000411
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000412 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000413 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000414 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000415
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000416 case SRE_AT_BEGINNING_LINE:
417 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000418 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000419
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000420 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000421 return (((void*) (ptr+1) == state->end &&
422 SRE_IS_LINEBREAK((int) ptr[0])) ||
423 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000424
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000425 case SRE_AT_END_LINE:
426 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000427 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000428
Fredrik Lundh770617b2001-01-14 15:06:11 +0000429 case SRE_AT_END_STRING:
430 return ((void*) ptr == state->end);
431
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000432 case SRE_AT_BOUNDARY:
433 if (state->beginning == state->end)
434 return 0;
435 that = ((void*) ptr > state->beginning) ?
436 SRE_IS_WORD((int) ptr[-1]) : 0;
437 this = ((void*) ptr < state->end) ?
438 SRE_IS_WORD((int) ptr[0]) : 0;
439 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000440
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000441 case SRE_AT_NON_BOUNDARY:
442 if (state->beginning == state->end)
443 return 0;
444 that = ((void*) ptr > state->beginning) ?
445 SRE_IS_WORD((int) ptr[-1]) : 0;
446 this = ((void*) ptr < state->end) ?
447 SRE_IS_WORD((int) ptr[0]) : 0;
448 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000449
450 case SRE_AT_LOC_BOUNDARY:
451 if (state->beginning == state->end)
452 return 0;
453 that = ((void*) ptr > state->beginning) ?
454 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
455 this = ((void*) ptr < state->end) ?
456 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
457 return this != that;
458
459 case SRE_AT_LOC_NON_BOUNDARY:
460 if (state->beginning == state->end)
461 return 0;
462 that = ((void*) ptr > state->beginning) ?
463 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
464 this = ((void*) ptr < state->end) ?
465 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
466 return this == that;
467
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000468#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000469 case SRE_AT_UNI_BOUNDARY:
470 if (state->beginning == state->end)
471 return 0;
472 that = ((void*) ptr > state->beginning) ?
473 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
474 this = ((void*) ptr < state->end) ?
475 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
476 return this != that;
477
478 case SRE_AT_UNI_NON_BOUNDARY:
479 if (state->beginning == state->end)
480 return 0;
481 that = ((void*) ptr > state->beginning) ?
482 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
483 this = ((void*) ptr < state->end) ?
484 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
485 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000486#endif
487
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000488 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000489
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000490 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000491}
492
493LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000494SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000495{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000496 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000497
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000498 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000499
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000500 for (;;) {
501 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000502
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000503 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000504 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000505 if (ch == set[0])
506 return ok;
507 set++;
508 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000509
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000510 case SRE_OP_RANGE:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000511 /* <RANGE> <lower> <upper> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000512 if (set[0] <= ch && ch <= set[1])
513 return ok;
514 set += 2;
515 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000516
Fredrik Lundh3562f112000-07-02 12:00:07 +0000517 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000518 if (sizeof(SRE_CODE) == 2) {
519 /* <CHARSET> <bitmap> (16 bits per code word) */
520 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
521 return ok;
522 set += 16;
523 }
524 else {
525 /* <CHARSET> <bitmap> (32 bits per code word) */
526 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
527 return ok;
528 set += 8;
529 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000530 break;
531
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000532 case SRE_OP_BIGCHARSET:
533 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
534 {
535 int count, block;
536 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000537
538 if (sizeof(SRE_CODE) == 2) {
539 block = ((unsigned char*)set)[ch >> 8];
540 set += 128;
541 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
542 return ok;
543 set += count*16;
544 }
545 else {
546 if (ch < 65536)
547 block = ((unsigned char*)set)[ch >> 8];
548 else
549 block = -1;
550 set += 64;
551 if (block >=0 &&
552 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
553 return ok;
554 set += count*8;
555 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000556 break;
557 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000558
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000559 case SRE_OP_CATEGORY:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000560 /* <CATEGORY> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000561 if (sre_category(set[0], (int) ch))
562 return ok;
563 set += 1;
564 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000565
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000566 case SRE_OP_NEGATE:
567 ok = !ok;
568 break;
569
570 case SRE_OP_FAILURE:
571 return !ok;
572
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000573 default:
574 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000575 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000576 return 0;
577 }
578 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000579}
580
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level);
582
583LOCAL(int)
584SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
585{
586 SRE_CODE chr;
587 SRE_CHAR* ptr = state->ptr;
588 SRE_CHAR* end = state->end;
589 int i;
590
591 /* adjust end */
592 if (maxcount < end - ptr && maxcount != 65535)
593 end = ptr + maxcount;
594
595 switch (pattern[0]) {
596
597 case SRE_OP_ANY:
598 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000599 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000600 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
601 ptr++;
602 break;
603
604 case SRE_OP_ANY_ALL:
605 /* repeated dot wildcare. skip to the end of the target
606 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000607 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000608 ptr = end;
609 break;
610
611 case SRE_OP_LITERAL:
612 /* repeated literal */
613 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000614 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000615 while (ptr < end && (SRE_CODE) *ptr == chr)
616 ptr++;
617 break;
618
619 case SRE_OP_LITERAL_IGNORE:
620 /* repeated literal */
621 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000622 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000623 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
624 ptr++;
625 break;
626
627 case SRE_OP_NOT_LITERAL:
628 /* repeated non-literal */
629 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000630 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000631 while (ptr < end && (SRE_CODE) *ptr != chr)
632 ptr++;
633 break;
634
635 case SRE_OP_NOT_LITERAL_IGNORE:
636 /* repeated non-literal */
637 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000638 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000639 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
640 ptr++;
641 break;
642
643 case SRE_OP_IN:
644 /* repeated set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000645 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
646 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000647 ptr++;
648 break;
649
650 default:
651 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000652 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000653 while ((SRE_CHAR*) state->ptr < end) {
654 i = SRE_MATCH(state, pattern, level);
655 if (i < 0)
656 return i;
657 if (!i)
658 break;
659 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000660 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
661 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000662 return (SRE_CHAR*) state->ptr - ptr;
663 }
664
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000665 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000666 return ptr - (SRE_CHAR*) state->ptr;
667}
668
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    SRE_CHAR* end = state->end;
    SRE_CHAR* ptr = state->ptr;
    int i;

    /* check minimal length */
    if (pattern[3] && (end - ptr) < pattern[3])
        return 0;

    /* check known prefix */
    if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
        /* <length> <skip> <prefix data> <overlap data> */
        for (i = 0; i < pattern[5]; i++)
            if ((SRE_CODE) ptr[i] != pattern[7 + i])
                return 0;
        return pattern[0] + 2 * pattern[6];
    }
    return pattern[0];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000696
/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (IOW, those
 * that will backtrack). Explaining:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (besides atypical cases like ASSERT_NOT), therefore there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: If the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark is pointing to.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 *
 * Both macros work on the variable named `state` and on the caller's
 * local `lastmark` and `lastindex` variables.
 *
 * LASTMARK_SAVE() copies state->lastmark and state->lastindex into
 * those locals.
 *
 * LASTMARK_RESTORE() zeroes every mark slot above the saved lastmark
 * (only when lastmark actually grew) and puts both values back.
 * lastindex is restored unconditionally: a failed attempt can execute
 * the closing MARK of a group whose index is not above the saved
 * lastmark, which changes lastindex while leaving lastmark untouched.
 */
#define LASTMARK_SAVE() \
    do { \
        lastmark = state->lastmark; \
        lastindex = state->lastindex; \
    } while (0)
#define LASTMARK_RESTORE() \
    do { \
        if (state->lastmark > lastmark) { \
            memset(state->mark + lastmark + 1, 0, \
                   (state->lastmark - lastmark) * sizeof(void*)); \
            state->lastmark = lastmark; \
        } \
        state->lastindex = lastindex; \
    } while (0)
739
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000740LOCAL(int)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000741SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
Guido van Rossumb700df92000-03-31 14:59:30 +0000742{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000743 /* check if string matches the given pattern. returns <0 for
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000744 error, 0 for failure, and 1 for success */
Guido van Rossumb700df92000-03-31 14:59:30 +0000745
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000746 SRE_CHAR* end = state->end;
747 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000748 int i, count;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000749 SRE_REPEAT* rp;
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000750 int lastmark, lastindex, mark_stack_base;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000751 SRE_CODE chr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000752
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000753 SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000754
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000755 TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000756
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000757#if defined(USE_STACKCHECK)
Fredrik Lundh58100642000-08-09 09:14:35 +0000758 if (level % 10 == 0 && PyOS_CheckStack())
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000759 return SRE_ERROR_RECURSION_LIMIT;
760#endif
761
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000762#if defined(USE_RECURSION_LIMIT)
763 if (level > USE_RECURSION_LIMIT)
764 return SRE_ERROR_RECURSION_LIMIT;
765#endif
766
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000767 if (pattern[0] == SRE_OP_INFO) {
768 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000769 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000770 if (pattern[3] && (end - ptr) < pattern[3]) {
771 TRACE(("reject (got %d chars, need %d)\n",
772 (end - ptr), pattern[3]));
773 return 0;
774 }
775 pattern += pattern[1] + 1;
776 }
777
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000778 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000779
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000780 switch (*pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000781
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000782 case SRE_OP_FAILURE:
783 /* immediate failure */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000784 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000785 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000786
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000787 case SRE_OP_SUCCESS:
788 /* end of pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000789 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000790 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000791 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000792
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000793 case SRE_OP_AT:
794 /* match at given position */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000795 /* <AT> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000796 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000797 if (!SRE_AT(state, ptr, *pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000798 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000799 pattern++;
800 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000801
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000802 case SRE_OP_CATEGORY:
803 /* match at given category */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000804 /* <CATEGORY> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000805 TRACE(("|%p|%p|CATEGORY %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000806 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000807 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000808 pattern++;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000809 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000810 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000811
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000812 case SRE_OP_LITERAL:
813 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000814 /* <LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000815 TRACE(("|%p|%p|LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000816 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000817 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000818 pattern++;
819 ptr++;
820 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000821
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000822 case SRE_OP_NOT_LITERAL:
823 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000824 /* <NOT_LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000825 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000826 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000827 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000828 pattern++;
829 ptr++;
830 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000831
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000832 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000833 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000834 /* <ANY> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000835 TRACE(("|%p|%p|ANY\n", pattern, ptr));
Fredrik Lundhe1869832000-08-01 22:47:49 +0000836 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
837 return 0;
838 ptr++;
839 break;
840
841 case SRE_OP_ANY_ALL:
842 /* match anything */
843 /* <ANY_ALL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000844 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000845 if (ptr >= end)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000846 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000847 ptr++;
848 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000849
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000850 case SRE_OP_IN:
851 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000852 /* <IN> <skip> <set> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000853 TRACE(("|%p|%p|IN\n", pattern, ptr));
854 if (ptr >= end || !SRE_CHARSET(pattern + 1, *ptr))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000855 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000856 pattern += pattern[0];
857 ptr++;
858 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000859
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000860 case SRE_OP_GROUPREF:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000862 TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000863 i = pattern[0];
864 {
865 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
866 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
867 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000868 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000869 while (p < e) {
870 if (ptr >= end || *ptr != *p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000871 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 p++; ptr++;
873 }
874 }
875 pattern++;
876 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000877
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000878 case SRE_OP_GROUPREF_IGNORE:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000879 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000880 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000881 i = pattern[0];
882 {
883 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
884 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
885 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000886 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000887 while (p < e) {
888 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000889 state->lower(*ptr) != state->lower(*p))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000890 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000891 p++; ptr++;
892 }
893 }
894 pattern++;
895 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000896
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000897 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000898 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000899 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000900 state->lower(*ptr) != state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000901 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000902 pattern++;
903 ptr++;
904 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000905
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000906 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000907 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000908 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000909 state->lower(*ptr) == state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000910 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000911 pattern++;
912 ptr++;
913 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000914
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000915 case SRE_OP_IN_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000916 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000917 if (ptr >= end
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000918 || !SRE_CHARSET(pattern + 1, (SRE_CODE) state->lower(*ptr)))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000919 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000920 pattern += pattern[0];
921 ptr++;
922 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000923
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000924 case SRE_OP_MARK:
925 /* set mark */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000926 /* <MARK> <gid> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000927 TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000928 i = pattern[0];
Gustavo Niemeyer1aca3592003-04-20 00:45:13 +0000929 if (i & 1)
930 state->lastindex = i/2 + 1;
931 if (i > state->lastmark)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000932 state->lastmark = i;
933 state->mark[i] = ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000934 pattern++;
935 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000936
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 case SRE_OP_JUMP:
938 case SRE_OP_INFO:
939 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000940 /* <JUMP> <offset> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000941 TRACE(("|%p|%p|JUMP %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000942 pattern += pattern[0];
943 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000944
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000945 case SRE_OP_ASSERT:
946 /* assert subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000947 /* <ASSERT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000948 TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000949 state->ptr = ptr - pattern[1];
Fredrik Lundh6f013982000-07-03 18:44:21 +0000950 if (state->ptr < state->beginning)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000951 return 0;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000952 i = SRE_MATCH(state, pattern + 2, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000953 if (i <= 0)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000954 return i;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000955 pattern += pattern[0];
956 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000957
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000958 case SRE_OP_ASSERT_NOT:
959 /* assert not subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000960 /* <ASSERT_NOT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000961 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000962 state->ptr = ptr - pattern[1];
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000963 if (state->ptr >= state->beginning) {
964 i = SRE_MATCH(state, pattern + 2, level + 1);
965 if (i < 0)
966 return i;
967 if (i)
968 return 0;
969 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000970 pattern += pattern[0];
971 break;
972
973 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000974 /* alternation */
975 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000976 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000977 LASTMARK_SAVE();
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000978 if (state->repeat) {
979 i = mark_save(state, 0, lastmark, &mark_stack_base);
980 if (i < 0)
981 return i;
982 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000983 for (; pattern[0]; pattern += pattern[0]) {
984 if (pattern[1] == SRE_OP_LITERAL &&
985 (ptr >= end || (SRE_CODE) *ptr != pattern[2]))
986 continue;
987 if (pattern[1] == SRE_OP_IN &&
988 (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr)))
989 continue;
990 state->ptr = ptr;
991 i = SRE_MATCH(state, pattern + 1, level + 1);
992 if (i)
993 return i;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000994 if (state->repeat) {
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000995 i = mark_restore(state, 0, lastmark, &mark_stack_base);
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000996 if (i < 0)
997 return i;
998 }
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000999 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001000 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001001 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001002
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001003 case SRE_OP_REPEAT_ONE:
1004 /* match repeated sequence (maximizing regexp) */
1005
1006 /* this operator only works if the repeated item is
1007 exactly one character wide, and we're not already
1008 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001009 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001010
1011 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1012
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001013 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001014 pattern[1], pattern[2]));
1015
Fredrik Lundhe1869832000-08-01 22:47:49 +00001016 if (ptr + pattern[1] > end)
1017 return 0; /* cannot match */
1018
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001019 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001020
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001021 count = SRE_COUNT(state, pattern + 3, pattern[2], level + 1);
1022 if (count < 0)
1023 return count;
Fredrik Lundhe1869832000-08-01 22:47:49 +00001024
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001025 ptr += count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001026
1027 /* when we arrive here, count contains the number of
1028 matches, and ptr points to the tail of the target
1029 string. check if the rest of the pattern matches,
1030 and backtrack if not. */
1031
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001032 if (count < (int) pattern[1])
1033 return 0;
1034
1035 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
1036 /* tail is empty. we're finished */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037 state->ptr = ptr;
1038 return 1;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001039 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001041 LASTMARK_SAVE();
1042
1043 if (pattern[pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 /* tail starts with a literal. skip positions where
1045 the rest of the pattern cannot possibly match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001046 chr = pattern[pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001047 for (;;) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001048 while (count >= (int) pattern[1] &&
1049 (ptr >= end || *ptr != chr)) {
1050 ptr--;
1051 count--;
1052 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001053 if (count < (int) pattern[1])
1054 break;
1055 state->ptr = ptr;
1056 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001057 if (i)
Fredrik Lundh33accc12000-08-27 20:59:47 +00001058 return i;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001059 ptr--;
1060 count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001061 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001062 }
1063
1064 } else {
1065 /* general case */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001066 while (count >= (int) pattern[1]) {
1067 state->ptr = ptr;
1068 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001069 if (i)
Fredrik Lundh33accc12000-08-27 20:59:47 +00001070 return i;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001071 ptr--;
1072 count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001073 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001074 }
1075 }
1076 return 0;
1077
Guido van Rossum41c99e72003-04-14 17:59:34 +00001078 case SRE_OP_MIN_REPEAT_ONE:
1079 /* match repeated sequence (minimizing regexp) */
1080
1081 /* this operator only works if the repeated item is
1082 exactly one character wide, and we're not already
1083 collecting backtracking points. for other cases,
1084 use the MIN_REPEAT operator */
1085
1086 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1087
1088 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", pattern, ptr,
1089 pattern[1], pattern[2]));
1090
1091 if (ptr + pattern[1] > end)
1092 return 0; /* cannot match */
1093
1094 state->ptr = ptr;
1095
1096 if (pattern[1] == 0)
1097 count = 0;
1098 else {
1099 /* count using pattern min as the maximum */
1100 count = SRE_COUNT(state, pattern + 3, pattern[1], level + 1);
1101
1102 if (count < 0)
1103 return count; /* exception */
1104 if (count < (int) pattern[1])
1105 return 0; /* did not match minimum number of times */
1106 ptr += count; /* advance past minimum matches of repeat */
1107 }
1108
1109 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
1110 /* tail is empty. we're finished */
1111 state->ptr = ptr;
1112 return 1;
1113
1114 } else {
1115 /* general case */
1116 int matchmax = ((int)pattern[2] == 65535);
1117 int c;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001118 LASTMARK_SAVE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001119 while (matchmax || count <= (int) pattern[2]) {
1120 state->ptr = ptr;
1121 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
1122 if (i)
1123 return i;
1124 state->ptr = ptr;
1125 c = SRE_COUNT(state, pattern+3, 1, level+1);
1126 if (c < 0)
1127 return c;
1128 if (c == 0)
1129 break;
1130 assert(c == 1);
1131 ptr++;
1132 count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001133 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001134 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001135 }
1136 return 0;
1137
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001138 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001139 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001140 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001141 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001142 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001143 pattern[1], pattern[2]));
1144
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001145 rep.count = -1;
1146 rep.pattern = pattern;
1147
1148 /* install new repeat context */
1149 rep.prev = state->repeat;
1150 state->repeat = &rep;
1151
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001152 state->ptr = ptr;
1153 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001154
1155 state->repeat = rep.prev;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001156
1157 return i;
1158
1159 case SRE_OP_MAX_UNTIL:
1160 /* maximizing repeat */
1161 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1162
1163 /* FIXME: we probably need to deal with zero-width
1164 matches in here... */
1165
1166 rp = state->repeat;
1167 if (!rp)
1168 return SRE_ERROR_STATE;
1169
1170 state->ptr = ptr;
1171
1172 count = rp->count + 1;
1173
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001174 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001175
1176 if (count < rp->pattern[1]) {
1177 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001178 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001179 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001180 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001181 if (i)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001182 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001183 rp->count = count - 1;
1184 state->ptr = ptr;
1185 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001186 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001187
1188 if (count < rp->pattern[2] || rp->pattern[2] == 65535) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001189 /* we may have enough matches, but if we can
1190 match another item, do so */
1191 rp->count = count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001192 LASTMARK_SAVE();
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001193 i = mark_save(state, 0, lastmark, &mark_stack_base);
Fredrik Lundh33accc12000-08-27 20:59:47 +00001194 if (i < 0)
1195 return i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001196 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001197 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001198 if (i)
1199 return i;
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001200 i = mark_restore(state, 0, lastmark, &mark_stack_base);
Fredrik Lundh33accc12000-08-27 20:59:47 +00001201 if (i < 0)
1202 return i;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001203 LASTMARK_RESTORE();
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001204 rp->count = count - 1;
1205 state->ptr = ptr;
1206 }
1207
1208 /* cannot match more repeated items here. make sure the
1209 tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001210 state->repeat = rp->prev;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001211 i = SRE_MATCH(state, pattern, level + 1);
1212 if (i)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001213 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001214 state->repeat = rp;
Fredrik Lundh770617b2001-01-14 15:06:11 +00001215 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001216 return 0;
1217
1218 case SRE_OP_MIN_UNTIL:
1219 /* minimizing repeat */
1220 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1221
1222 rp = state->repeat;
1223 if (!rp)
1224 return SRE_ERROR_STATE;
1225
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001226 state->ptr = ptr;
1227
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001228 count = rp->count + 1;
1229
Fredrik Lundh770617b2001-01-14 15:06:11 +00001230 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count,
1231 rp->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001232
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001233 if (count < rp->pattern[1]) {
1234 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001235 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001236 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001237 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001238 if (i)
1239 return i;
1240 rp->count = count-1;
1241 state->ptr = ptr;
1242 return 0;
1243 }
1244
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001245 LASTMARK_SAVE();
1246
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001247 /* see if the tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001248 state->repeat = rp->prev;
Fredrik Lundh82b23072001-12-09 16:13:15 +00001249 i = SRE_MATCH(state, pattern, level + 1);
1250 if (i)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001251 return i;
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001252
Fredrik Lundh770617b2001-01-14 15:06:11 +00001253 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001254 state->repeat = rp;
1255
1256 if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
1257 return 0;
1258
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001259 LASTMARK_RESTORE();
1260
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001261 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001262 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001263 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001264 if (i)
1265 return i;
1266 rp->count = count - 1;
Fredrik Lundh770617b2001-01-14 15:06:11 +00001267 state->ptr = ptr;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001268
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001269 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001270
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001271 default:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001272 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001273 return SRE_ERROR_ILLEGAL;
1274 }
1275 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001276
Sjoerd Mullender89dfe9e2001-08-30 14:37:07 +00001277 /* can't end up here */
Fredrik Lundh21009b92001-09-18 18:47:09 +00001278 /* return SRE_ERROR_ILLEGAL; -- see python-dev discussion */
Guido van Rossumb700df92000-03-31 14:59:30 +00001279}
1280
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001281LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001282SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1283{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001284 SRE_CHAR* ptr = state->start;
1285 SRE_CHAR* end = state->end;
1286 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001287 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001288 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001289 SRE_CODE* prefix = NULL;
1290 SRE_CODE* charset = NULL;
1291 SRE_CODE* overlap = NULL;
1292 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001293
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001294 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001295 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001296 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001297
1298 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001299
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001300 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001301 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001302 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001303 end -= pattern[3]-1;
1304 if (end <= ptr)
1305 end = ptr+1;
1306 }
1307
Fredrik Lundh3562f112000-07-02 12:00:07 +00001308 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001309 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001310 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001311 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001312 prefix_skip = pattern[6];
1313 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001314 overlap = prefix + prefix_len - 1;
1315 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001316 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001317 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001318 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001319
1320 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001321 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001322
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001323 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1324 TRACE(("charset = %p\n", charset));
1325
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001326#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001327 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001328 /* pattern starts with a known prefix. use the overlap
1329 table to skip forward as fast as we possibly can */
1330 int i = 0;
1331 end = state->end;
1332 while (ptr < end) {
1333 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001334 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001335 if (!i)
1336 break;
1337 else
1338 i = overlap[i];
1339 } else {
1340 if (++i == prefix_len) {
1341 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001342 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1343 state->start = ptr + 1 - prefix_len;
1344 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001345 if (flags & SRE_INFO_LITERAL)
1346 return 1; /* we got all of it */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001347 status = SRE_MATCH(state, pattern + 2*prefix_skip, 1);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001348 if (status != 0)
1349 return status;
1350 /* close but no cigar -- try again */
1351 i = overlap[i];
1352 }
1353 break;
1354 }
1355
1356 }
1357 ptr++;
1358 }
1359 return 0;
1360 }
1361#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001362
Fredrik Lundh3562f112000-07-02 12:00:07 +00001363 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001364 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001365 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001366 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001367 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001368 for (;;) {
1369 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1370 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001371 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001372 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001373 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001374 state->start = ptr;
1375 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001376 if (flags & SRE_INFO_LITERAL)
1377 return 1; /* we got all of it */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001378 status = SRE_MATCH(state, pattern + 2, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001379 if (status != 0)
1380 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001381 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001382 } else if (charset) {
1383 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001384 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001385 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001386 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001387 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001388 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001389 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001390 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001391 state->start = ptr;
1392 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001393 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001394 if (status != 0)
1395 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001396 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001397 }
1398 } else
1399 /* general case */
1400 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001401 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001402 state->start = state->ptr = ptr++;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001403 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001404 if (status != 0)
1405 break;
1406 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001407
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001408 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001409}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001410
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001411LOCAL(int)
1412SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1413{
1414 /* check if given string is a literal template (i.e. no escapes) */
1415 while (len-- > 0)
1416 if (*ptr++ == '\\')
1417 return 0;
1418 return 1;
1419}
Guido van Rossumb700df92000-03-31 14:59:30 +00001420
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001421#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001422
1423/* -------------------------------------------------------------------- */
1424/* factories and destructors */
1425
1426/* see sre.h for object declarations */
1427
Jeremy Hylton938ace62002-07-17 16:30:39 +00001428static PyTypeObject Pattern_Type;
1429static PyTypeObject Match_Type;
1430static PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001431
1432static PyObject *
1433_compile(PyObject* self_, PyObject* args)
1434{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001435 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001436
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001437 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001438 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001439
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001440 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001441 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001442 PyObject* code;
1443 int groups = 0;
1444 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001445 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001446 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1447 &PyList_Type, &code, &groups,
1448 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001449 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001450
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001451 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001452
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001453 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001454 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001455 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001456
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001457 self->codesize = n;
1458
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001459 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001460 PyObject *o = PyList_GET_ITEM(code, i);
Martin v. Löwis78e2f062003-04-19 12:56:08 +00001461 if (PyInt_Check(o))
1462 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1463 else
1464 self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001465 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001466
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001467 if (PyErr_Occurred()) {
1468 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001469 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001470 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001471
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001472 Py_INCREF(pattern);
1473 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001474
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001475 self->flags = flags;
1476
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001477 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001478
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001479 Py_XINCREF(groupindex);
1480 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001481
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001482 Py_XINCREF(indexgroup);
1483 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001484
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001485 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001486}
1487
1488static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001489sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001490{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001491 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001492}
1493
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001494static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001495sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001496{
1497 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001498 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001499 return NULL;
1500 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001501 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001502 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001503#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001504 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001505#else
1506 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001507#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001508 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001509}
1510
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001511LOCAL(void)
1512state_reset(SRE_STATE* state)
1513{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001514 state->lastmark = 0;
1515
1516 /* FIXME: dynamic! */
Neal Norwitz35fc7602002-06-13 21:11:11 +00001517 memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001518
1519 state->lastindex = -1;
1520
1521 state->repeat = NULL;
1522
1523 mark_fini(state);
1524}
1525
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001526static void*
1527getstring(PyObject* string, int* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001528{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001529 /* given a python object, return a data pointer, a length (in
1530 characters), and a character size. return NULL if the object
1531 is not a string (or not compatible) */
1532
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001533 PyBufferProcs *buffer;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001534 int size, bytes, charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001535 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001536
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001537#if defined(HAVE_UNICODE)
1538 if (PyUnicode_Check(string)) {
1539 /* unicode strings doesn't always support the buffer interface */
1540 ptr = (void*) PyUnicode_AS_DATA(string);
1541 bytes = PyUnicode_GET_DATA_SIZE(string);
1542 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001543 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001544
1545 } else {
1546#endif
1547
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001548 /* get pointer to string buffer */
1549 buffer = string->ob_type->tp_as_buffer;
1550 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1551 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001552 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001553 return NULL;
1554 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001555
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001556 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001557 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1558 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001559 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1560 return NULL;
1561 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001562
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001564#if PY_VERSION_HEX >= 0x01060000
1565 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001566#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001567 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001568#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001569
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001570 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001571 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001572#if defined(HAVE_UNICODE)
1573 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001574 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001575#endif
1576 else {
1577 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1578 return NULL;
1579 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001580
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001581#if defined(HAVE_UNICODE)
1582 }
1583#endif
1584
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001585 *p_length = size;
1586 *p_charsize = charsize;
1587
1588 return ptr;
1589}
1590
1591LOCAL(PyObject*)
1592state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1593 int start, int end)
1594{
1595 /* prepare state object */
1596
1597 int length;
1598 int charsize;
1599 void* ptr;
1600
1601 memset(state, 0, sizeof(SRE_STATE));
1602
1603 state->lastindex = -1;
1604
1605 ptr = getstring(string, &length, &charsize);
1606 if (!ptr)
1607 return NULL;
1608
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001609 /* adjust boundaries */
1610 if (start < 0)
1611 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001612 else if (start > length)
1613 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001614
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001615 if (end < 0)
1616 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001617 else if (end > length)
1618 end = length;
1619
1620 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001621
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001622 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001623
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001624 state->start = (void*) ((char*) ptr + start * state->charsize);
1625 state->end = (void*) ((char*) ptr + end * state->charsize);
1626
1627 Py_INCREF(string);
1628 state->string = string;
1629 state->pos = start;
1630 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001631
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001632 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001633 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001634 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001635#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001636 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001637#else
1638 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001639#endif
1640 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001641 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001642
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001643 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001644}
1645
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001646LOCAL(void)
1647state_fini(SRE_STATE* state)
1648{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001649 Py_XDECREF(state->string);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001650 mark_fini(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001651}
1652
/* convert a pointer into the subject data (member) into a character
   index: the byte distance from the start of the string, divided by
   the size of one character */
#define STATE_OFFSET(state, member) \
    (((char*) (member) - (char*) (state)->beginning) / (state)->charsize)
1656
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001657LOCAL(PyObject*)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001658state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001659{
Fredrik Lundh58100642000-08-09 09:14:35 +00001660 int i, j;
1661
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001662 index = (index - 1) * 2;
1663
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001664 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001665 if (empty)
1666 /* want empty string */
1667 i = j = 0;
1668 else {
1669 Py_INCREF(Py_None);
1670 return Py_None;
1671 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001672 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001673 i = STATE_OFFSET(state, state->mark[index]);
1674 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001675 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001676
Fredrik Lundh58100642000-08-09 09:14:35 +00001677 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001678}
1679
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001680static void
1681pattern_error(int status)
1682{
1683 switch (status) {
1684 case SRE_ERROR_RECURSION_LIMIT:
1685 PyErr_SetString(
1686 PyExc_RuntimeError,
1687 "maximum recursion limit exceeded"
1688 );
1689 break;
1690 case SRE_ERROR_MEMORY:
1691 PyErr_NoMemory();
1692 break;
1693 default:
1694 /* other error codes indicate compiler/engine bugs */
1695 PyErr_SetString(
1696 PyExc_RuntimeError,
1697 "internal error in regular expression engine"
1698 );
1699 }
1700}
1701
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001702static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001703pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001704{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001705 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001706
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001707 MatchObject* match;
1708 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001709 char* base;
1710 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001711
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001712 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001713
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001714 /* create match object (with room for extra group marks) */
1715 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001716 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001717 if (!match)
1718 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001719
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001720 Py_INCREF(pattern);
1721 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001722
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001723 Py_INCREF(state->string);
1724 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001725
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001726 match->regs = NULL;
1727 match->groups = pattern->groups+1;
1728
1729 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001730
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001731 base = (char*) state->beginning;
1732 n = state->charsize;
1733
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001734 match->mark[0] = ((char*) state->start - base) / n;
1735 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001736
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001737 for (i = j = 0; i < pattern->groups; i++, j+=2)
1738 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1739 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1740 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1741 } else
1742 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1743
1744 match->pos = state->pos;
1745 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001746
Fredrik Lundh6f013982000-07-03 18:44:21 +00001747 match->lastindex = state->lastindex;
1748
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001749 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001750
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001751 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001752
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001753 /* no match */
1754 Py_INCREF(Py_None);
1755 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001756
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001757 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001758
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001759 /* internal error */
1760 pattern_error(status);
1761 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001762}
1763
1764static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001765pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001766{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001767 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001768
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001769 ScannerObject* self;
1770
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001771 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001772 int start = 0;
1773 int end = INT_MAX;
1774 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1775 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001776
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001777 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001778 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001779 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001780 return NULL;
1781
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001782 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001783 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001784 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001785 return NULL;
1786 }
1787
1788 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001789 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001790
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001791 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001792}
1793
Guido van Rossumb700df92000-03-31 14:59:30 +00001794static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001795pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001796{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001797 Py_XDECREF(self->pattern);
1798 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001799 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001800 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001801}
1802
1803static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001804pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001805{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001806 SRE_STATE state;
1807 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001808
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001809 PyObject* string;
1810 int start = 0;
1811 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001812 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1813 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
1814 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001815 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001816
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001817 string = state_init(&state, self, string, start, end);
1818 if (!string)
1819 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001820
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001821 state.ptr = state.start;
1822
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001823 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1824
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001825 if (state.charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001826 status = sre_match(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001827 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001828#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001829 status = sre_umatch(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001830#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001831 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001832
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001833 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1834
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001835 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001836
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001837 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001838}
1839
1840static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001841pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001842{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001843 SRE_STATE state;
1844 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001845
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001846 PyObject* string;
1847 int start = 0;
1848 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001849 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1850 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
1851 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001852 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001853
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001854 string = state_init(&state, self, string, start, end);
1855 if (!string)
1856 return NULL;
1857
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001858 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1859
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001860 if (state.charsize == 1) {
1861 status = sre_search(&state, PatternObject_GetCode(self));
1862 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001863#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001864 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001865#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001866 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001867
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001868 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1869
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001870 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001871
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001872 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001873}
1874
1875static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001876call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001877{
1878 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001879 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001880 PyObject* func;
1881 PyObject* result;
1882
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001883 if (!args)
1884 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001885 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001886 if (!name)
1887 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001888 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001889 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001890 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001891 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001892 func = PyObject_GetAttrString(mod, function);
1893 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001894 if (!func)
1895 return NULL;
1896 result = PyObject_CallObject(func, args);
1897 Py_DECREF(func);
1898 Py_DECREF(args);
1899 return result;
1900}
1901
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001902#ifdef USE_BUILTIN_COPY
1903static int
1904deepcopy(PyObject** object, PyObject* memo)
1905{
1906 PyObject* copy;
1907
1908 copy = call(
1909 "copy", "deepcopy",
1910 Py_BuildValue("OO", *object, memo)
1911 );
1912 if (!copy)
1913 return 0;
1914
1915 Py_DECREF(*object);
1916 *object = copy;
1917
1918 return 1; /* success */
1919}
1920#endif
1921
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001922static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00001923join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001924{
1925 /* join list elements */
1926
1927 PyObject* joiner;
1928#if PY_VERSION_HEX >= 0x01060000
1929 PyObject* function;
1930 PyObject* args;
1931#endif
1932 PyObject* result;
1933
1934 switch (PyList_GET_SIZE(list)) {
1935 case 0:
1936 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00001937 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001938 case 1:
1939 result = PyList_GET_ITEM(list, 0);
1940 Py_INCREF(result);
1941 Py_DECREF(list);
1942 return result;
1943 }
1944
1945 /* two or more elements: slice out a suitable separator from the
1946 first member, and use that to join the entire list */
1947
1948 joiner = PySequence_GetSlice(pattern, 0, 0);
1949 if (!joiner)
1950 return NULL;
1951
1952#if PY_VERSION_HEX >= 0x01060000
1953 function = PyObject_GetAttrString(joiner, "join");
1954 if (!function) {
1955 Py_DECREF(joiner);
1956 return NULL;
1957 }
1958 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001959 if (!args) {
1960 Py_DECREF(function);
1961 Py_DECREF(joiner);
1962 return NULL;
1963 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001964 PyTuple_SET_ITEM(args, 0, list);
1965 result = PyObject_CallObject(function, args);
1966 Py_DECREF(args); /* also removes list */
1967 Py_DECREF(function);
1968#else
1969 result = call(
1970 "string", "join",
1971 Py_BuildValue("OO", list, joiner)
1972 );
1973#endif
1974 Py_DECREF(joiner);
1975
1976 return result;
1977}
1978
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001979static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001980pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001981{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001982 SRE_STATE state;
1983 PyObject* list;
1984 int status;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001985 int i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00001986
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001987 PyObject* string;
1988 int start = 0;
1989 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001990 static char* kwlist[] = { "source", "pos", "endpos", NULL };
1991 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
1992 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001993 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001994
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001995 string = state_init(&state, self, string, start, end);
1996 if (!string)
1997 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001998
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001999 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002000 if (!list) {
2001 state_fini(&state);
2002 return NULL;
2003 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002004
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002005 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002006
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002007 PyObject* item;
2008
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002009 state_reset(&state);
2010
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002011 state.ptr = state.start;
2012
2013 if (state.charsize == 1) {
2014 status = sre_search(&state, PatternObject_GetCode(self));
2015 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002016#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002017 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002018#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002019 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002020
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002021 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002022 if (status == 0)
2023 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002024 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002025 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002026 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002027
2028 /* don't bother to build a match object */
2029 switch (self->groups) {
2030 case 0:
2031 b = STATE_OFFSET(&state, state.start);
2032 e = STATE_OFFSET(&state, state.ptr);
2033 item = PySequence_GetSlice(string, b, e);
2034 if (!item)
2035 goto error;
2036 break;
2037 case 1:
2038 item = state_getslice(&state, 1, string, 1);
2039 if (!item)
2040 goto error;
2041 break;
2042 default:
2043 item = PyTuple_New(self->groups);
2044 if (!item)
2045 goto error;
2046 for (i = 0; i < self->groups; i++) {
2047 PyObject* o = state_getslice(&state, i+1, string, 1);
2048 if (!o) {
2049 Py_DECREF(item);
2050 goto error;
2051 }
2052 PyTuple_SET_ITEM(item, i, o);
2053 }
2054 break;
2055 }
2056
2057 status = PyList_Append(list, item);
2058 Py_DECREF(item);
2059 if (status < 0)
2060 goto error;
2061
2062 if (state.ptr == state.start)
2063 state.start = (void*) ((char*) state.ptr + state.charsize);
2064 else
2065 state.start = state.ptr;
2066
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002067 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002068
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002069 state_fini(&state);
2070 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002071
2072error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002073 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002074 state_fini(&state);
2075 return NULL;
2076
Guido van Rossumb700df92000-03-31 14:59:30 +00002077}
2078
Fredrik Lundh703ce812001-10-24 22:16:30 +00002079#if PY_VERSION_HEX >= 0x02020000
2080static PyObject*
2081pattern_finditer(PatternObject* pattern, PyObject* args)
2082{
2083 PyObject* scanner;
2084 PyObject* search;
2085 PyObject* iterator;
2086
2087 scanner = pattern_scanner(pattern, args);
2088 if (!scanner)
2089 return NULL;
2090
2091 search = PyObject_GetAttrString(scanner, "search");
2092 Py_DECREF(scanner);
2093 if (!search)
2094 return NULL;
2095
2096 iterator = PyCallIter_New(search, Py_None);
2097 Py_DECREF(search);
2098
2099 return iterator;
2100}
2101#endif
2102
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002103static PyObject*
2104pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2105{
2106 SRE_STATE state;
2107 PyObject* list;
2108 PyObject* item;
2109 int status;
2110 int n;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002111 int i;
2112 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002113
2114 PyObject* string;
2115 int maxsplit = 0;
2116 static char* kwlist[] = { "source", "maxsplit", NULL };
2117 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2118 &string, &maxsplit))
2119 return NULL;
2120
2121 string = state_init(&state, self, string, 0, INT_MAX);
2122 if (!string)
2123 return NULL;
2124
2125 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002126 if (!list) {
2127 state_fini(&state);
2128 return NULL;
2129 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002130
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002131 n = 0;
2132 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002133
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002134 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002135
2136 state_reset(&state);
2137
2138 state.ptr = state.start;
2139
2140 if (state.charsize == 1) {
2141 status = sre_search(&state, PatternObject_GetCode(self));
2142 } else {
2143#if defined(HAVE_UNICODE)
2144 status = sre_usearch(&state, PatternObject_GetCode(self));
2145#endif
2146 }
2147
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002148 if (status <= 0) {
2149 if (status == 0)
2150 break;
2151 pattern_error(status);
2152 goto error;
2153 }
2154
2155 if (state.start == state.ptr) {
2156 if (last == state.end)
2157 break;
2158 /* skip one character */
2159 state.start = (void*) ((char*) state.ptr + state.charsize);
2160 continue;
2161 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002162
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002163 /* get segment before this match */
2164 item = PySequence_GetSlice(
2165 string, STATE_OFFSET(&state, last),
2166 STATE_OFFSET(&state, state.start)
2167 );
2168 if (!item)
2169 goto error;
2170 status = PyList_Append(list, item);
2171 Py_DECREF(item);
2172 if (status < 0)
2173 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002174
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002175 /* add groups (if any) */
2176 for (i = 0; i < self->groups; i++) {
2177 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002178 if (!item)
2179 goto error;
2180 status = PyList_Append(list, item);
2181 Py_DECREF(item);
2182 if (status < 0)
2183 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002184 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002185
2186 n = n + 1;
2187
2188 last = state.start = state.ptr;
2189
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002190 }
2191
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002192 /* get segment following last match (even if empty) */
2193 item = PySequence_GetSlice(
2194 string, STATE_OFFSET(&state, last), state.endpos
2195 );
2196 if (!item)
2197 goto error;
2198 status = PyList_Append(list, item);
2199 Py_DECREF(item);
2200 if (status < 0)
2201 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002202
2203 state_fini(&state);
2204 return list;
2205
2206error:
2207 Py_DECREF(list);
2208 state_fini(&state);
2209 return NULL;
2210
2211}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002212
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002213static PyObject*
2214pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2215 int count, int subn)
2216{
2217 SRE_STATE state;
2218 PyObject* list;
2219 PyObject* item;
2220 PyObject* filter;
2221 PyObject* args;
2222 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002223 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002224 int status;
2225 int n;
2226 int i, b, e;
2227 int filter_is_callable;
2228
Fredrik Lundhdac58492001-10-21 21:48:30 +00002229 if (PyCallable_Check(template)) {
2230 /* sub/subn takes either a function or a template */
2231 filter = template;
2232 Py_INCREF(filter);
2233 filter_is_callable = 1;
2234 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002235 /* if not callable, check if it's a literal string */
2236 int literal;
2237 ptr = getstring(template, &n, &b);
2238 if (ptr) {
2239 if (b == 1) {
2240 literal = sre_literal_template(ptr, n);
2241 } else {
2242#if defined(HAVE_UNICODE)
2243 literal = sre_uliteral_template(ptr, n);
2244#endif
2245 }
2246 } else {
2247 PyErr_Clear();
2248 literal = 0;
2249 }
2250 if (literal) {
2251 filter = template;
2252 Py_INCREF(filter);
2253 filter_is_callable = 0;
2254 } else {
2255 /* not a literal; hand it over to the template compiler */
2256 filter = call(
2257 SRE_MODULE, "_subx",
2258 Py_BuildValue("OO", self, template)
2259 );
2260 if (!filter)
2261 return NULL;
2262 filter_is_callable = PyCallable_Check(filter);
2263 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002264 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002265
2266 string = state_init(&state, self, string, 0, INT_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002267 if (!string) {
2268 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002269 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002270 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002271
2272 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002273 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002274 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002275 state_fini(&state);
2276 return NULL;
2277 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002278
2279 n = i = 0;
2280
2281 while (!count || n < count) {
2282
2283 state_reset(&state);
2284
2285 state.ptr = state.start;
2286
2287 if (state.charsize == 1) {
2288 status = sre_search(&state, PatternObject_GetCode(self));
2289 } else {
2290#if defined(HAVE_UNICODE)
2291 status = sre_usearch(&state, PatternObject_GetCode(self));
2292#endif
2293 }
2294
2295 if (status <= 0) {
2296 if (status == 0)
2297 break;
2298 pattern_error(status);
2299 goto error;
2300 }
2301
2302 b = STATE_OFFSET(&state, state.start);
2303 e = STATE_OFFSET(&state, state.ptr);
2304
2305 if (i < b) {
2306 /* get segment before this match */
2307 item = PySequence_GetSlice(string, i, b);
2308 if (!item)
2309 goto error;
2310 status = PyList_Append(list, item);
2311 Py_DECREF(item);
2312 if (status < 0)
2313 goto error;
2314
2315 } else if (i == b && i == e && n > 0)
2316 /* ignore empty match on latest position */
2317 goto next;
2318
2319 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002320 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002321 match = pattern_new_match(self, &state, 1);
2322 if (!match)
2323 goto error;
2324 args = Py_BuildValue("(O)", match);
2325 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002326 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002327 goto error;
2328 }
2329 item = PyObject_CallObject(filter, args);
2330 Py_DECREF(args);
2331 Py_DECREF(match);
2332 if (!item)
2333 goto error;
2334 } else {
2335 /* filter is literal string */
2336 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002337 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002338 }
2339
2340 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002341 if (item != Py_None) {
2342 status = PyList_Append(list, item);
2343 Py_DECREF(item);
2344 if (status < 0)
2345 goto error;
2346 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002347
2348 i = e;
2349 n = n + 1;
2350
2351next:
2352 /* move on */
2353 if (state.ptr == state.start)
2354 state.start = (void*) ((char*) state.ptr + state.charsize);
2355 else
2356 state.start = state.ptr;
2357
2358 }
2359
2360 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002361 if (i < state.endpos) {
2362 item = PySequence_GetSlice(string, i, state.endpos);
2363 if (!item)
2364 goto error;
2365 status = PyList_Append(list, item);
2366 Py_DECREF(item);
2367 if (status < 0)
2368 goto error;
2369 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002370
2371 state_fini(&state);
2372
Guido van Rossum4e173842001-12-07 04:25:10 +00002373 Py_DECREF(filter);
2374
Fredrik Lundhdac58492001-10-21 21:48:30 +00002375 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002376 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002377
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002378 if (!item)
2379 return NULL;
2380
2381 if (subn)
2382 return Py_BuildValue("Ni", item, n);
2383
2384 return item;
2385
2386error:
2387 Py_DECREF(list);
2388 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002389 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002390 return NULL;
2391
2392}
2393
2394static PyObject*
2395pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2396{
2397 PyObject* template;
2398 PyObject* string;
2399 int count = 0;
2400 static char* kwlist[] = { "repl", "string", "count", NULL };
2401 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2402 &template, &string, &count))
2403 return NULL;
2404
2405 return pattern_subx(self, template, string, count, 0);
2406}
2407
2408static PyObject*
2409pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2410{
2411 PyObject* template;
2412 PyObject* string;
2413 int count = 0;
2414 static char* kwlist[] = { "repl", "string", "count", NULL };
2415 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2416 &template, &string, &count))
2417 return NULL;
2418
2419 return pattern_subx(self, template, string, count, 1);
2420}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002421
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002422static PyObject*
2423pattern_copy(PatternObject* self, PyObject* args)
2424{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002425#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002426 PatternObject* copy;
2427 int offset;
2428
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002429 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2430 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002431
2432 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2433 if (!copy)
2434 return NULL;
2435
2436 offset = offsetof(PatternObject, groups);
2437
2438 Py_XINCREF(self->groupindex);
2439 Py_XINCREF(self->indexgroup);
2440 Py_XINCREF(self->pattern);
2441
2442 memcpy((char*) copy + offset, (char*) self + offset,
2443 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2444
2445 return (PyObject*) copy;
2446#else
2447 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2448 return NULL;
2449#endif
2450}
2451
2452static PyObject*
2453pattern_deepcopy(PatternObject* self, PyObject* args)
2454{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002455#ifdef USE_BUILTIN_COPY
2456 PatternObject* copy;
2457
2458 PyObject* memo;
2459 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2460 return NULL;
2461
2462 copy = (PatternObject*) pattern_copy(self, Py_None);
2463 if (!copy)
2464 return NULL;
2465
2466 if (!deepcopy(&copy->groupindex, memo) ||
2467 !deepcopy(&copy->indexgroup, memo) ||
2468 !deepcopy(&copy->pattern, memo)) {
2469 Py_DECREF(copy);
2470 return NULL;
2471 }
2472
2473#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002474 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2475 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002476#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002477}
2478
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002479static PyMethodDef pattern_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002480 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
2481 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
2482 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
2483 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
2484 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
2485 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002486#if PY_VERSION_HEX >= 0x02020000
2487 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS},
2488#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002489 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002490 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2491 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002492 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002493};
2494
2495static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002496pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002497{
2498 PyObject* res;
2499
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002500 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002501
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002502 if (res)
2503 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002504
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002505 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002506
2507 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002508 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002509 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002510 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002511 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002512
2513 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002514 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002515
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002516 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002517 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002518
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002519 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002520 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002521 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002522 }
2523
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002524 PyErr_SetString(PyExc_AttributeError, name);
2525 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002526}
2527
2528statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002529 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002530 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002531 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002532 (destructor)pattern_dealloc, /*tp_dealloc*/
2533 0, /*tp_print*/
2534 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002535};
2536
2537/* -------------------------------------------------------------------- */
2538/* match methods */
2539
2540static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002541match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002542{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002543 Py_XDECREF(self->regs);
2544 Py_XDECREF(self->string);
2545 Py_DECREF(self->pattern);
2546 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002547}
2548
2549static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002550match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002551{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002552 if (index < 0 || index >= self->groups) {
2553 /* raise IndexError if we were given a bad group number */
2554 PyErr_SetString(
2555 PyExc_IndexError,
2556 "no such group"
2557 );
2558 return NULL;
2559 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002560
Fredrik Lundh6f013982000-07-03 18:44:21 +00002561 index *= 2;
2562
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002563 if (self->string == Py_None || self->mark[index] < 0) {
2564 /* return default value if the string or group is undefined */
2565 Py_INCREF(def);
2566 return def;
2567 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002568
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002569 return PySequence_GetSlice(
2570 self->string, self->mark[index], self->mark[index+1]
2571 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002572}
2573
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002574static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002575match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002576{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002577 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002578
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002579 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002580 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002581
Fredrik Lundh6f013982000-07-03 18:44:21 +00002582 i = -1;
2583
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002584 if (self->pattern->groupindex) {
2585 index = PyObject_GetItem(self->pattern->groupindex, index);
2586 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002587 if (PyInt_Check(index))
2588 i = (int) PyInt_AS_LONG(index);
2589 Py_DECREF(index);
2590 } else
2591 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002592 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002593
2594 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002595}
2596
2597static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002598match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002599{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002600 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002601}
2602
2603static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002604match_expand(MatchObject* self, PyObject* args)
2605{
2606 PyObject* template;
2607 if (!PyArg_ParseTuple(args, "O:expand", &template))
2608 return NULL;
2609
2610 /* delegate to Python code */
2611 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002612 SRE_MODULE, "_expand",
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002613 Py_BuildValue("OOO", self->pattern, self, template)
2614 );
2615}
2616
2617static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002618match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002619{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002620 PyObject* result;
2621 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002622
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002623 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002624
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002625 switch (size) {
2626 case 0:
2627 result = match_getslice(self, Py_False, Py_None);
2628 break;
2629 case 1:
2630 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2631 break;
2632 default:
2633 /* fetch multiple items */
2634 result = PyTuple_New(size);
2635 if (!result)
2636 return NULL;
2637 for (i = 0; i < size; i++) {
2638 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002639 self, PyTuple_GET_ITEM(args, i), Py_None
2640 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002641 if (!item) {
2642 Py_DECREF(result);
2643 return NULL;
2644 }
2645 PyTuple_SET_ITEM(result, i, item);
2646 }
2647 break;
2648 }
2649 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002650}
2651
2652static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002653match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002654{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002655 PyObject* result;
2656 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002657
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002658 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002659 static char* kwlist[] = { "default", NULL };
2660 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002661 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002662
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002663 result = PyTuple_New(self->groups-1);
2664 if (!result)
2665 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002666
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002667 for (index = 1; index < self->groups; index++) {
2668 PyObject* item;
2669 item = match_getslice_by_index(self, index, def);
2670 if (!item) {
2671 Py_DECREF(result);
2672 return NULL;
2673 }
2674 PyTuple_SET_ITEM(result, index-1, item);
2675 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002676
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002677 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002678}
2679
2680static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002681match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002682{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002683 PyObject* result;
2684 PyObject* keys;
2685 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002686
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002687 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002688 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002689 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002690 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002691
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002692 result = PyDict_New();
2693 if (!result || !self->pattern->groupindex)
2694 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002695
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002696 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002697 if (!keys)
2698 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002699
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002700 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002701 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002702 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002703 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002704 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002705 if (!key)
2706 goto failed;
2707 value = match_getslice(self, key, def);
2708 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002709 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002710 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002711 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002712 status = PyDict_SetItem(result, key, value);
2713 Py_DECREF(value);
2714 if (status < 0)
2715 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002716 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002717
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002718 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002719
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002720 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002721
2722failed:
2723 Py_DECREF(keys);
2724 Py_DECREF(result);
2725 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002726}
2727
2728static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002729match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002730{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002731 int index;
2732
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002733 PyObject* index_ = Py_False; /* zero */
2734 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2735 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002736
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002737 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002738
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002739 if (index < 0 || index >= self->groups) {
2740 PyErr_SetString(
2741 PyExc_IndexError,
2742 "no such group"
2743 );
2744 return NULL;
2745 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002746
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002747 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002748 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002749}
2750
2751static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002752match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002753{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002754 int index;
2755
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002756 PyObject* index_ = Py_False; /* zero */
2757 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2758 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002759
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002760 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002761
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002762 if (index < 0 || index >= self->groups) {
2763 PyErr_SetString(
2764 PyExc_IndexError,
2765 "no such group"
2766 );
2767 return NULL;
2768 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002769
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002770 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002771 return Py_BuildValue("i", self->mark[index*2+1]);
2772}
2773
2774LOCAL(PyObject*)
2775_pair(int i1, int i2)
2776{
2777 PyObject* pair;
2778 PyObject* item;
2779
2780 pair = PyTuple_New(2);
2781 if (!pair)
2782 return NULL;
2783
2784 item = PyInt_FromLong(i1);
2785 if (!item)
2786 goto error;
2787 PyTuple_SET_ITEM(pair, 0, item);
2788
2789 item = PyInt_FromLong(i2);
2790 if (!item)
2791 goto error;
2792 PyTuple_SET_ITEM(pair, 1, item);
2793
2794 return pair;
2795
2796 error:
2797 Py_DECREF(pair);
2798 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002799}
2800
2801static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002802match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002803{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002804 int index;
2805
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002806 PyObject* index_ = Py_False; /* zero */
2807 if (!PyArg_ParseTuple(args, "|O:span", &index_))
2808 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002809
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002810 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002811
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002812 if (index < 0 || index >= self->groups) {
2813 PyErr_SetString(
2814 PyExc_IndexError,
2815 "no such group"
2816 );
2817 return NULL;
2818 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002819
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002820 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002821 return _pair(self->mark[index*2], self->mark[index*2+1]);
2822}
2823
2824static PyObject*
2825match_regs(MatchObject* self)
2826{
2827 PyObject* regs;
2828 PyObject* item;
2829 int index;
2830
2831 regs = PyTuple_New(self->groups);
2832 if (!regs)
2833 return NULL;
2834
2835 for (index = 0; index < self->groups; index++) {
2836 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2837 if (!item) {
2838 Py_DECREF(regs);
2839 return NULL;
2840 }
2841 PyTuple_SET_ITEM(regs, index, item);
2842 }
2843
2844 Py_INCREF(regs);
2845 self->regs = regs;
2846
2847 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002848}
2849
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002850static PyObject*
2851match_copy(MatchObject* self, PyObject* args)
2852{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002853#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002854 MatchObject* copy;
2855 int slots, offset;
2856
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002857 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2858 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002859
2860 slots = 2 * (self->pattern->groups+1);
2861
2862 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2863 if (!copy)
2864 return NULL;
2865
2866 /* this value a constant, but any compiler should be able to
2867 figure that out all by itself */
2868 offset = offsetof(MatchObject, string);
2869
2870 Py_XINCREF(self->pattern);
2871 Py_XINCREF(self->string);
2872 Py_XINCREF(self->regs);
2873
2874 memcpy((char*) copy + offset, (char*) self + offset,
2875 sizeof(MatchObject) + slots * sizeof(int) - offset);
2876
2877 return (PyObject*) copy;
2878#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002879 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002880 return NULL;
2881#endif
2882}
2883
2884static PyObject*
2885match_deepcopy(MatchObject* self, PyObject* args)
2886{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002887#ifdef USE_BUILTIN_COPY
2888 MatchObject* copy;
2889
2890 PyObject* memo;
2891 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2892 return NULL;
2893
2894 copy = (MatchObject*) match_copy(self, Py_None);
2895 if (!copy)
2896 return NULL;
2897
2898 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2899 !deepcopy(&copy->string, memo) ||
2900 !deepcopy(&copy->regs, memo)) {
2901 Py_DECREF(copy);
2902 return NULL;
2903 }
2904
2905#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002906 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2907 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002908#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002909}
2910
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002911static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002912 {"group", (PyCFunction) match_group, METH_VARARGS},
2913 {"start", (PyCFunction) match_start, METH_VARARGS},
2914 {"end", (PyCFunction) match_end, METH_VARARGS},
2915 {"span", (PyCFunction) match_span, METH_VARARGS},
2916 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
2917 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
2918 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002919 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
2920 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002921 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002922};
2923
2924static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002925match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002926{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002927 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002928
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002929 res = Py_FindMethod(match_methods, (PyObject*) self, name);
2930 if (res)
2931 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002932
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002933 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002934
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002935 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002936 if (self->lastindex >= 0)
2937 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00002938 Py_INCREF(Py_None);
2939 return Py_None;
2940 }
2941
2942 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002943 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00002944 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00002945 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00002946 );
2947 if (result)
2948 return result;
2949 PyErr_Clear();
2950 }
2951 Py_INCREF(Py_None);
2952 return Py_None;
2953 }
2954
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002955 if (!strcmp(name, "string")) {
2956 if (self->string) {
2957 Py_INCREF(self->string);
2958 return self->string;
2959 } else {
2960 Py_INCREF(Py_None);
2961 return Py_None;
2962 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002963 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002964
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002965 if (!strcmp(name, "regs")) {
2966 if (self->regs) {
2967 Py_INCREF(self->regs);
2968 return self->regs;
2969 } else
2970 return match_regs(self);
2971 }
2972
2973 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002974 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002975 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002976 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002977
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002978 if (!strcmp(name, "pos"))
2979 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002980
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002981 if (!strcmp(name, "endpos"))
2982 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00002983
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002984 PyErr_SetString(PyExc_AttributeError, name);
2985 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002986}
2987
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2990
2991statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002992 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002993 0, "_" SRE_MODULE ".SRE_Match",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002994 sizeof(MatchObject), sizeof(int),
2995 (destructor)match_dealloc, /*tp_dealloc*/
2996 0, /*tp_print*/
2997 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002998};
2999
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003000/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003001/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003002
3003static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003004scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003005{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003006 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003007 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003008 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003009}
3010
3011static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003012scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003013{
3014 SRE_STATE* state = &self->state;
3015 PyObject* match;
3016 int status;
3017
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003018 state_reset(state);
3019
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003020 state->ptr = state->start;
3021
3022 if (state->charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00003023 status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003024 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003025#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00003026 status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003027#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003028 }
3029
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003030 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003031 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003032
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003033 if ((status == 0 || state->ptr == state->start) &&
3034 state->ptr < state->end)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003035 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003036 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003037 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003038
3039 return match;
3040}
3041
3042
3043static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003044scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003045{
3046 SRE_STATE* state = &self->state;
3047 PyObject* match;
3048 int status;
3049
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003050 state_reset(state);
3051
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003052 state->ptr = state->start;
3053
3054 if (state->charsize == 1) {
3055 status = sre_search(state, PatternObject_GetCode(self->pattern));
3056 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003057#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003058 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003059#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003060 }
3061
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003062 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003063 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003064
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003065 if ((status == 0 || state->ptr == state->start) &&
3066 state->ptr < state->end)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003067 state->start = (void*) ((char*) state->ptr + state->charsize);
3068 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003069 state->start = state->ptr;
3070
3071 return match;
3072}
3073
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003074static PyMethodDef scanner_methods[] = {
Neal Norwitzbb2769f2002-03-31 15:46:00 +00003075 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3076 /* METH_OLDARGS is not in Python 1.5.2 */
3077 {"match", (PyCFunction) scanner_match, 0},
3078 {"search", (PyCFunction) scanner_search, 0},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003079 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003080};
3081
3082static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003083scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003084{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003085 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003086
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003087 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3088 if (res)
3089 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003090
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003091 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003092
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003093 /* attributes */
3094 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003095 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003096 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003097 }
3098
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003099 PyErr_SetString(PyExc_AttributeError, name);
3100 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003101}
3102
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003103statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003104 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003105 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003106 sizeof(ScannerObject), 0,
3107 (destructor)scanner_dealloc, /*tp_dealloc*/
3108 0, /*tp_print*/
3109 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003110};
3111
Guido van Rossumb700df92000-03-31 14:59:30 +00003112static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003113 {"compile", _compile, METH_VARARGS},
3114 {"getcodesize", sre_codesize, METH_VARARGS},
3115 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003116 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003117};
3118
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003119#if PY_VERSION_HEX < 0x02030000
3120DL_EXPORT(void) init_sre(void)
3121#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003122PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003123#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003124{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003125 PyObject* m;
3126 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003127 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003128
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003129 /* Patch object types */
3130 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003131 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00003132
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003133 m = Py_InitModule("_" SRE_MODULE, _functions);
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003134 d = PyModule_GetDict(m);
3135
Fredrik Lundh21009b92001-09-18 18:47:09 +00003136 x = PyInt_FromLong(SRE_MAGIC);
3137 if (x) {
3138 PyDict_SetItemString(d, "MAGIC", x);
3139 Py_DECREF(x);
3140 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003141
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003142 x = PyInt_FromLong(sizeof(SRE_CODE));
3143 if (x) {
3144 PyDict_SetItemString(d, "CODESIZE", x);
3145 Py_DECREF(x);
3146 }
3147
Fredrik Lundh21009b92001-09-18 18:47:09 +00003148 x = PyString_FromString(copyright);
3149 if (x) {
3150 PyDict_SetItemString(d, "copyright", x);
3151 Py_DECREF(x);
3152 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003153}
3154
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003155#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003156
3157/* vim:ts=4:sw=4:et
3158*/