/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl created (based on existing template matcher code)
 * 2000-03-06 fl first alpha, sort of
 * 2000-08-01 fl fixes for 1.6b1
 * 2000-08-07 fl use PyOS_CheckStack() if available
 * 2000-09-20 fl added expand method
 * 2001-03-20 fl lots of fixes for 2.1b2
 * 2001-04-15 fl export copyright as Python attribute, not global
 * 2001-04-28 fl added __copy__ methods (work in progress)
 * 2001-05-14 fl fixes for 1.5.2 compatibility
 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl added sub/subn primitive
 * 2001-10-24 fl added finditer primitive (for 2.2 only)
 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 *
 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license. For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */

36#ifndef SRE_RECURSIVE
37
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000038static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000039 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000040
41#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000042#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000043
44#include "sre.h"
45
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000046#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000047
Fredrik Lundh436c3d582000-06-29 08:58:44 +000048/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000049#if !defined(SRE_MODULE)
50#define SRE_MODULE "sre"
51#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000052
Guido van Rossumb700df92000-03-31 14:59:30 +000053/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000054#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000055
Fredrik Lundh971e78b2001-10-20 17:48:46 +000056#if PY_VERSION_HEX >= 0x01060000
57#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000058/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000059#define HAVE_UNICODE
60#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000061#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062
/* -------------------------------------------------------------------- */
/* optional features */

/* prevent run-away recursion (bad patterns on long strings) */

#if !defined(USE_STACKCHECK)
#if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
/* require smaller recursion limit for a number of 64-bit platforms:
   Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
/* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
#define USE_RECURSION_LIMIT 7500
#else
#if defined(__GNUC__) && (__GNUC__ > 2) && \
    (defined(__FreeBSD__) || defined(PYOS_OS2))
/* gcc 3.x, on FreeBSD and OS/2+EMX and at optimisation levels of
 * -O3 (autoconf default) and -O2 (EMX port default), generates code
 * for _sre that fails for the default recursion limit.
 */
#define USE_RECURSION_LIMIT 7500
#else
#define USE_RECURSION_LIMIT 10000
#endif
#endif
#endif

/* enables fast searching */
#define USE_FAST_SEARCH

/* enables aggressive inlining (always on for Visual C) */
#undef USE_INLINE

/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY

/* interpreters older than 1.6 lack PyObject_DEL; map it to PyMem_DEL */
#if PY_VERSION_HEX < 0x01060000
#define PyObject_DEL(op) PyMem_DEL((op))
#endif

/* -------------------------------------------------------------------- */

/* LOCAL(type) declares a file-local function returning `type`, using the
   cheapest calling convention/inlining hint the compiler offers */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif

/* error codes (parenthesised so they expand safely inside expressions) */
#define SRE_ERROR_ILLEGAL (-1) /* illegal opcode */
#define SRE_ERROR_STATE (-2) /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT (-3) /* runaway recursion */
#define SRE_ERROR_MEMORY (-9) /* out of memory */

/* TRACE((fmt, args...)) prints via printf when VERBOSE is defined and
   expands to nothing otherwise; note the double parentheses */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif

/* -------------------------------------------------------------------- */
/* search engine state */

/* default character predicates (run sre_chars.py to regenerate tables) */

/* bit flags stored per character in sre_char_info[] */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII. create tables in init_sre() instead */

/* per-character flag bits for codes 0..127 */
static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };

/* lower-case mapping for codes 0..127 ('A'..'Z' map to 'a'..'z') */
static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127 };

/* Each predicate yields the (non-zero) flag bit when it holds and 0
   otherwise; codes >= 128 never match.  `ch` is evaluated more than once
   and must not be negative (it is used as a table index). */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

/* lower-case `ch` using the ASCII table; codes >= 128 are returned as-is */
static unsigned int sre_lower(unsigned int ch)
{
    return ((ch) < 128 ? sre_char_lower[ch] : ch);
}

/* locale-specific character predicates */

/* These defer to <ctype.h> for codes below 256 and report "no match" for
   anything larger.  `ch` is evaluated more than once and must not be
   negative. */
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* lower-case `ch` with tolower(); codes >= 256 are returned unchanged */
static unsigned int sre_lower_locale(unsigned int ch)
{
    return ((ch) < 256 ? tolower((ch)) : ch);
}

/* unicode-specific character predicates */

#if defined(HAVE_UNICODE)

/* thin wrappers over the interpreter's Py_UNICODE_* classification macros;
   the argument is cast to Py_UNICODE before the call */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

/* lower-case `ch` via Py_UNICODE_TOLOWER */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}

#endif

Guido van Rossumb700df92000-03-31 14:59:30 +0000203LOCAL(int)
204sre_category(SRE_CODE category, unsigned int ch)
205{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000206 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000207
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000208 case SRE_CATEGORY_DIGIT:
209 return SRE_IS_DIGIT(ch);
210 case SRE_CATEGORY_NOT_DIGIT:
211 return !SRE_IS_DIGIT(ch);
212 case SRE_CATEGORY_SPACE:
213 return SRE_IS_SPACE(ch);
214 case SRE_CATEGORY_NOT_SPACE:
215 return !SRE_IS_SPACE(ch);
216 case SRE_CATEGORY_WORD:
217 return SRE_IS_WORD(ch);
218 case SRE_CATEGORY_NOT_WORD:
219 return !SRE_IS_WORD(ch);
220 case SRE_CATEGORY_LINEBREAK:
221 return SRE_IS_LINEBREAK(ch);
222 case SRE_CATEGORY_NOT_LINEBREAK:
223 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000224
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000225 case SRE_CATEGORY_LOC_WORD:
226 return SRE_LOC_IS_WORD(ch);
227 case SRE_CATEGORY_LOC_NOT_WORD:
228 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000229
230#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000231 case SRE_CATEGORY_UNI_DIGIT:
232 return SRE_UNI_IS_DIGIT(ch);
233 case SRE_CATEGORY_UNI_NOT_DIGIT:
234 return !SRE_UNI_IS_DIGIT(ch);
235 case SRE_CATEGORY_UNI_SPACE:
236 return SRE_UNI_IS_SPACE(ch);
237 case SRE_CATEGORY_UNI_NOT_SPACE:
238 return !SRE_UNI_IS_SPACE(ch);
239 case SRE_CATEGORY_UNI_WORD:
240 return SRE_UNI_IS_WORD(ch);
241 case SRE_CATEGORY_UNI_NOT_WORD:
242 return !SRE_UNI_IS_WORD(ch);
243 case SRE_CATEGORY_UNI_LINEBREAK:
244 return SRE_UNI_IS_LINEBREAK(ch);
245 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
246 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000247#else
248 case SRE_CATEGORY_UNI_DIGIT:
249 return SRE_IS_DIGIT(ch);
250 case SRE_CATEGORY_UNI_NOT_DIGIT:
251 return !SRE_IS_DIGIT(ch);
252 case SRE_CATEGORY_UNI_SPACE:
253 return SRE_IS_SPACE(ch);
254 case SRE_CATEGORY_UNI_NOT_SPACE:
255 return !SRE_IS_SPACE(ch);
256 case SRE_CATEGORY_UNI_WORD:
257 return SRE_LOC_IS_WORD(ch);
258 case SRE_CATEGORY_UNI_NOT_WORD:
259 return !SRE_LOC_IS_WORD(ch);
260 case SRE_CATEGORY_UNI_LINEBREAK:
261 return SRE_IS_LINEBREAK(ch);
262 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
263 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000264#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000265 }
266 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000267}
268
269/* helpers */
270
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000271static void
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272mark_fini(SRE_STATE* state)
273{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000274 if (state->mark_stack) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000275 free(state->mark_stack);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000276 state->mark_stack = NULL;
277 }
278 state->mark_stack_size = state->mark_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000279}
280
281static int
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000282mark_save(SRE_STATE* state, int lo, int hi, int *mark_stack_base)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000283{
284 void* stack;
285 int size;
286 int minsize, newsize;
287
288 if (hi <= lo)
289 return 0;
290
291 size = (hi - lo) + 1;
292
293 newsize = state->mark_stack_size;
294 minsize = state->mark_stack_base + size;
295
296 if (newsize < minsize) {
297 /* create new stack */
298 if (!newsize) {
299 newsize = 512;
300 if (newsize < minsize)
301 newsize = minsize;
302 TRACE(("allocate stack %d\n", newsize));
303 stack = malloc(sizeof(void*) * newsize);
304 } else {
305 /* grow the stack */
306 while (newsize < minsize)
307 newsize += newsize;
308 TRACE(("grow stack to %d\n", newsize));
309 stack = realloc(state->mark_stack, sizeof(void*) * newsize);
310 }
311 if (!stack) {
312 mark_fini(state);
313 return SRE_ERROR_MEMORY;
314 }
315 state->mark_stack = stack;
316 state->mark_stack_size = newsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000317 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000318
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000319 TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000320
321 memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
322 size * sizeof(void*));
323
324 state->mark_stack_base += size;
325
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000326 *mark_stack_base = state->mark_stack_base;
327
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000328 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000329}
330
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000331static int
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000332mark_restore(SRE_STATE* state, int lo, int hi, int *mark_stack_base)
Guido van Rossumb700df92000-03-31 14:59:30 +0000333{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000334 int size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000335
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000336 if (hi <= lo)
337 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000338
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000339 size = (hi - lo) + 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000340
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000341 state->mark_stack_base = *mark_stack_base - size;
Guido van Rossumb700df92000-03-31 14:59:30 +0000342
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000343 TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
Guido van Rossumb700df92000-03-31 14:59:30 +0000344
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000345 memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
346 size * sizeof(void*));
Guido van Rossumb700df92000-03-31 14:59:30 +0000347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000348 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000349}
350
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000351/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000352
353#define SRE_CHAR unsigned char
354#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000355#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000356#define SRE_CHARSET sre_charset
357#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000358#define SRE_MATCH sre_match
359#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000360#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000361
362#if defined(HAVE_UNICODE)
363
Guido van Rossumb700df92000-03-31 14:59:30 +0000364#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000365#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000366#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000367
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000368#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000369#undef SRE_SEARCH
370#undef SRE_MATCH
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000371#undef SRE_INFO
372#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000373#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000374#undef SRE_AT
375#undef SRE_CHAR
376
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000377/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000378
379#define SRE_CHAR Py_UNICODE
380#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000381#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000382#define SRE_CHARSET sre_ucharset
383#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000384#define SRE_MATCH sre_umatch
385#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000386#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000387#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000388
389#endif /* SRE_RECURSIVE */
390
391/* -------------------------------------------------------------------- */
392/* String matching engine */
393
394/* the following section is compiled twice, with different character
395 settings */
396
397LOCAL(int)
398SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
399{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000400 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000401
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000402 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000403
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000404 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000405
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000406 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000407 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000408 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000409
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000410 case SRE_AT_BEGINNING_LINE:
411 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000412 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000413
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000414 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000415 return (((void*) (ptr+1) == state->end &&
416 SRE_IS_LINEBREAK((int) ptr[0])) ||
417 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000418
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000419 case SRE_AT_END_LINE:
420 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000421 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000422
Fredrik Lundh770617b2001-01-14 15:06:11 +0000423 case SRE_AT_END_STRING:
424 return ((void*) ptr == state->end);
425
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 case SRE_AT_BOUNDARY:
427 if (state->beginning == state->end)
428 return 0;
429 that = ((void*) ptr > state->beginning) ?
430 SRE_IS_WORD((int) ptr[-1]) : 0;
431 this = ((void*) ptr < state->end) ?
432 SRE_IS_WORD((int) ptr[0]) : 0;
433 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000434
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000435 case SRE_AT_NON_BOUNDARY:
436 if (state->beginning == state->end)
437 return 0;
438 that = ((void*) ptr > state->beginning) ?
439 SRE_IS_WORD((int) ptr[-1]) : 0;
440 this = ((void*) ptr < state->end) ?
441 SRE_IS_WORD((int) ptr[0]) : 0;
442 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000443
444 case SRE_AT_LOC_BOUNDARY:
445 if (state->beginning == state->end)
446 return 0;
447 that = ((void*) ptr > state->beginning) ?
448 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
449 this = ((void*) ptr < state->end) ?
450 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
451 return this != that;
452
453 case SRE_AT_LOC_NON_BOUNDARY:
454 if (state->beginning == state->end)
455 return 0;
456 that = ((void*) ptr > state->beginning) ?
457 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
458 this = ((void*) ptr < state->end) ?
459 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
460 return this == that;
461
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000462#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000463 case SRE_AT_UNI_BOUNDARY:
464 if (state->beginning == state->end)
465 return 0;
466 that = ((void*) ptr > state->beginning) ?
467 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
468 this = ((void*) ptr < state->end) ?
469 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
470 return this != that;
471
472 case SRE_AT_UNI_NON_BOUNDARY:
473 if (state->beginning == state->end)
474 return 0;
475 that = ((void*) ptr > state->beginning) ?
476 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
477 this = ((void*) ptr < state->end) ?
478 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
479 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000480#endif
481
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000482 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000483
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000484 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000485}
486
487LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000488SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000489{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000490 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000491
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000492 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000493
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000494 for (;;) {
495 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000496
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000497 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000498 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000499 if (ch == set[0])
500 return ok;
501 set++;
502 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000503
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000504 case SRE_OP_RANGE:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000505 /* <RANGE> <lower> <upper> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000506 if (set[0] <= ch && ch <= set[1])
507 return ok;
508 set += 2;
509 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000510
Fredrik Lundh3562f112000-07-02 12:00:07 +0000511 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000512 if (sizeof(SRE_CODE) == 2) {
513 /* <CHARSET> <bitmap> (16 bits per code word) */
514 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
515 return ok;
516 set += 16;
517 }
518 else {
519 /* <CHARSET> <bitmap> (32 bits per code word) */
520 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
521 return ok;
522 set += 8;
523 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000524 break;
525
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000526 case SRE_OP_BIGCHARSET:
527 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
528 {
529 int count, block;
530 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000531
532 if (sizeof(SRE_CODE) == 2) {
533 block = ((unsigned char*)set)[ch >> 8];
534 set += 128;
535 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
536 return ok;
537 set += count*16;
538 }
539 else {
540 if (ch < 65536)
541 block = ((unsigned char*)set)[ch >> 8];
542 else
543 block = -1;
544 set += 64;
545 if (block >=0 &&
546 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
547 return ok;
548 set += count*8;
549 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000550 break;
551 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000552
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000553 case SRE_OP_CATEGORY:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000554 /* <CATEGORY> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000555 if (sre_category(set[0], (int) ch))
556 return ok;
557 set += 1;
558 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000559
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000560 case SRE_OP_NEGATE:
561 ok = !ok;
562 break;
563
564 case SRE_OP_FAILURE:
565 return !ok;
566
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000567 default:
568 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000569 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000570 return 0;
571 }
572 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000573}
574
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000575LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level);
576
577LOCAL(int)
578SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
579{
580 SRE_CODE chr;
581 SRE_CHAR* ptr = state->ptr;
582 SRE_CHAR* end = state->end;
583 int i;
584
585 /* adjust end */
586 if (maxcount < end - ptr && maxcount != 65535)
587 end = ptr + maxcount;
588
589 switch (pattern[0]) {
590
591 case SRE_OP_ANY:
592 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000593 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000594 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
595 ptr++;
596 break;
597
598 case SRE_OP_ANY_ALL:
599 /* repeated dot wildcare. skip to the end of the target
600 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000601 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000602 ptr = end;
603 break;
604
605 case SRE_OP_LITERAL:
606 /* repeated literal */
607 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000608 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000609 while (ptr < end && (SRE_CODE) *ptr == chr)
610 ptr++;
611 break;
612
613 case SRE_OP_LITERAL_IGNORE:
614 /* repeated literal */
615 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000616 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000617 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
618 ptr++;
619 break;
620
621 case SRE_OP_NOT_LITERAL:
622 /* repeated non-literal */
623 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000624 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000625 while (ptr < end && (SRE_CODE) *ptr != chr)
626 ptr++;
627 break;
628
629 case SRE_OP_NOT_LITERAL_IGNORE:
630 /* repeated non-literal */
631 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000632 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000633 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
634 ptr++;
635 break;
636
637 case SRE_OP_IN:
638 /* repeated set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000639 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
640 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000641 ptr++;
642 break;
643
644 default:
645 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000646 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000647 while ((SRE_CHAR*) state->ptr < end) {
648 i = SRE_MATCH(state, pattern, level);
649 if (i < 0)
650 return i;
651 if (!i)
652 break;
653 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000654 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
655 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000656 return (SRE_CHAR*) state->ptr - ptr;
657 }
658
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000659 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000660 return ptr - (SRE_CHAR*) state->ptr;
661}
662
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    SRE_CHAR* end = state->end;
    SRE_CHAR* ptr = state->ptr;
    int i;

    /* check minimal length */
    if (pattern[3] && (end - ptr) < pattern[3])
        return 0;

    /* check known prefix */
    if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
        /* <length> <skip> <prefix data> <overlap data> */
        for (i = 0; i < pattern[5]; i++)
            if ((SRE_CODE) ptr[i] != pattern[7 + i])
                return 0;
        return pattern[0] + 2 * pattern[6];
    }
    return pattern[0];
}
#endif

/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (IOW, those
 * that will backtrack). Explaining:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (besides atypical cases like ASSERT_NOT), therefore there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: If the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark is pointing to.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 */
/* LASTMARK_SAVE(): copy state->lastmark and state->lastindex into the
 * locals `lastmark` and `lastindex`.  The expansion refers to `state`,
 * `lastmark` and `lastindex` by name, so all three must be in scope at
 * the point of use. */
#define LASTMARK_SAVE()     \
    do { \
        lastmark = state->lastmark; \
        lastindex = state->lastindex; \
    } while (0)

/* LASTMARK_RESTORE(): if state->lastmark has grown past the saved
 * `lastmark`, zero the mark slots lastmark+1 .. state->lastmark and put
 * the saved lastmark/lastindex back.  Nothing is touched otherwise.
 *
 * NOTE(review): lastindex is only restored when lastmark grew; a MARK
 * that updates lastindex without raising lastmark is therefore not
 * undone here -- confirm this is intended. */
#define LASTMARK_RESTORE()  \
    do { \
        if (state->lastmark > lastmark) { \
            memset(state->mark + lastmark + 1, 0, \
                   (state->lastmark - lastmark) * sizeof(void*)); \
            state->lastmark = lastmark; \
            state->lastindex = lastindex; \
        } \
    } while (0)
733
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000734LOCAL(int)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000735SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
Guido van Rossumb700df92000-03-31 14:59:30 +0000736{
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000737 /* check if string matches the given pattern. returns <0 for
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000738 error, 0 for failure, and 1 for success */
Guido van Rossumb700df92000-03-31 14:59:30 +0000739
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000740 SRE_CHAR* end = state->end;
741 SRE_CHAR* ptr = state->ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000742 int i, count;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000743 SRE_REPEAT* rp;
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000744 int lastmark, lastindex, mark_stack_base;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000745 SRE_CODE chr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000746
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000747 SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000748
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000749 TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000750
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000751#if defined(USE_STACKCHECK)
Fredrik Lundh58100642000-08-09 09:14:35 +0000752 if (level % 10 == 0 && PyOS_CheckStack())
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000753 return SRE_ERROR_RECURSION_LIMIT;
754#endif
755
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000756#if defined(USE_RECURSION_LIMIT)
757 if (level > USE_RECURSION_LIMIT)
758 return SRE_ERROR_RECURSION_LIMIT;
759#endif
760
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000761 if (pattern[0] == SRE_OP_INFO) {
762 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000763 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000764 if (pattern[3] && (end - ptr) < pattern[3]) {
765 TRACE(("reject (got %d chars, need %d)\n",
766 (end - ptr), pattern[3]));
767 return 0;
768 }
769 pattern += pattern[1] + 1;
770 }
771
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000772 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000773
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000774 switch (*pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000775
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000776 case SRE_OP_FAILURE:
777 /* immediate failure */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000778 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000779 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000780
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000781 case SRE_OP_SUCCESS:
782 /* end of pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000783 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000784 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000785 return 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000786
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000787 case SRE_OP_AT:
788 /* match at given position */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000789 /* <AT> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000790 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000791 if (!SRE_AT(state, ptr, *pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000792 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000793 pattern++;
794 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000795
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000796 case SRE_OP_CATEGORY:
797 /* match at given category */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000798 /* <CATEGORY> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000799 TRACE(("|%p|%p|CATEGORY %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000800 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000801 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000802 pattern++;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000803 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000804 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000805
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000806 case SRE_OP_LITERAL:
807 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000808 /* <LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000809 TRACE(("|%p|%p|LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000810 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000811 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000812 pattern++;
813 ptr++;
814 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000815
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000816 case SRE_OP_NOT_LITERAL:
817 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000818 /* <NOT_LITERAL> <code> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000819 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000820 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000821 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000822 pattern++;
823 ptr++;
824 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000825
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000826 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000827 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000828 /* <ANY> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000829 TRACE(("|%p|%p|ANY\n", pattern, ptr));
Fredrik Lundhe1869832000-08-01 22:47:49 +0000830 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
831 return 0;
832 ptr++;
833 break;
834
835 case SRE_OP_ANY_ALL:
836 /* match anything */
837 /* <ANY_ALL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000838 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000839 if (ptr >= end)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000840 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000841 ptr++;
842 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000843
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000844 case SRE_OP_IN:
845 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000846 /* <IN> <skip> <set> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000847 TRACE(("|%p|%p|IN\n", pattern, ptr));
848 if (ptr >= end || !SRE_CHARSET(pattern + 1, *ptr))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000849 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000850 pattern += pattern[0];
851 ptr++;
852 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000853
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000854 case SRE_OP_GROUPREF:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000855 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000856 TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000857 i = pattern[0];
858 {
859 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
860 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
861 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000862 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000863 while (p < e) {
864 if (ptr >= end || *ptr != *p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000865 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000866 p++; ptr++;
867 }
868 }
869 pattern++;
870 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000871
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000872 case SRE_OP_GROUPREF_IGNORE:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000873 /* match backreference */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000874 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000875 i = pattern[0];
876 {
877 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
878 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
879 if (!p || !e || e < p)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000880 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000881 while (p < e) {
882 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000883 state->lower(*ptr) != state->lower(*p))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000884 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000885 p++; ptr++;
886 }
887 }
888 pattern++;
889 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000890
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000891 case SRE_OP_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000892 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000893 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000894 state->lower(*ptr) != state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000895 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000896 pattern++;
897 ptr++;
898 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000899
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000900 case SRE_OP_NOT_LITERAL_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000901 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern, ptr, *pattern));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000902 if (ptr >= end ||
Fredrik Lundhb389df32000-06-29 12:48:37 +0000903 state->lower(*ptr) == state->lower(*pattern))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000904 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000905 pattern++;
906 ptr++;
907 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 case SRE_OP_IN_IGNORE:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000910 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000911 if (ptr >= end
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000912 || !SRE_CHARSET(pattern + 1, (SRE_CODE) state->lower(*ptr)))
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000913 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000914 pattern += pattern[0];
915 ptr++;
916 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000918 case SRE_OP_MARK:
919 /* set mark */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000920 /* <MARK> <gid> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000921 TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000922 i = pattern[0];
Gustavo Niemeyer1aca3592003-04-20 00:45:13 +0000923 if (i & 1)
924 state->lastindex = i/2 + 1;
925 if (i > state->lastmark)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000926 state->lastmark = i;
927 state->mark[i] = ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000928 pattern++;
929 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000930
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000931 case SRE_OP_JUMP:
932 case SRE_OP_INFO:
933 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000934 /* <JUMP> <offset> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000935 TRACE(("|%p|%p|JUMP %d\n", pattern, ptr, pattern[0]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000936 pattern += pattern[0];
937 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000939 case SRE_OP_ASSERT:
940 /* assert subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000941 /* <ASSERT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000942 TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000943 state->ptr = ptr - pattern[1];
Fredrik Lundh6f013982000-07-03 18:44:21 +0000944 if (state->ptr < state->beginning)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000945 return 0;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000946 i = SRE_MATCH(state, pattern + 2, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000947 if (i <= 0)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000948 return i;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000949 pattern += pattern[0];
950 break;
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000951
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000952 case SRE_OP_ASSERT_NOT:
953 /* assert not subpattern */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000954 /* <ASSERT_NOT> <skip> <back> <pattern> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000955 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000956 state->ptr = ptr - pattern[1];
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000957 if (state->ptr >= state->beginning) {
958 i = SRE_MATCH(state, pattern + 2, level + 1);
959 if (i < 0)
960 return i;
961 if (i)
962 return 0;
963 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000964 pattern += pattern[0];
965 break;
966
967 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000968 /* alternation */
969 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000970 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000971 LASTMARK_SAVE();
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000972 if (state->repeat) {
973 i = mark_save(state, 0, lastmark, &mark_stack_base);
974 if (i < 0)
975 return i;
976 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000977 for (; pattern[0]; pattern += pattern[0]) {
978 if (pattern[1] == SRE_OP_LITERAL &&
979 (ptr >= end || (SRE_CODE) *ptr != pattern[2]))
980 continue;
981 if (pattern[1] == SRE_OP_IN &&
982 (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr)))
983 continue;
984 state->ptr = ptr;
985 i = SRE_MATCH(state, pattern + 1, level + 1);
986 if (i)
987 return i;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000988 if (state->repeat) {
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000989 i = mark_restore(state, 0, lastmark, &mark_stack_base);
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000990 if (i < 0)
991 return i;
992 }
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000993 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000994 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000995 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000996
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000997 case SRE_OP_REPEAT_ONE:
998 /* match repeated sequence (maximizing regexp) */
999
1000 /* this operator only works if the repeated item is
1001 exactly one character wide, and we're not already
1002 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001003 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001004
1005 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1006
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001007 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001008 pattern[1], pattern[2]));
1009
Fredrik Lundhe1869832000-08-01 22:47:49 +00001010 if (ptr + pattern[1] > end)
1011 return 0; /* cannot match */
1012
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001013 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001014
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001015 count = SRE_COUNT(state, pattern + 3, pattern[2], level + 1);
1016 if (count < 0)
1017 return count;
Fredrik Lundhe1869832000-08-01 22:47:49 +00001018
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001019 ptr += count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001020
1021 /* when we arrive here, count contains the number of
1022 matches, and ptr points to the tail of the target
1023 string. check if the rest of the pattern matches,
1024 and backtrack if not. */
1025
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001026 if (count < (int) pattern[1])
1027 return 0;
1028
1029 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
1030 /* tail is empty. we're finished */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031 state->ptr = ptr;
1032 return 1;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001033 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001034
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001035 LASTMARK_SAVE();
1036
1037 if (pattern[pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001038 /* tail starts with a literal. skip positions where
1039 the rest of the pattern cannot possibly match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001040 chr = pattern[pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001041 for (;;) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042 while (count >= (int) pattern[1] &&
1043 (ptr >= end || *ptr != chr)) {
1044 ptr--;
1045 count--;
1046 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001047 if (count < (int) pattern[1])
1048 break;
1049 state->ptr = ptr;
1050 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001051 if (i)
Fredrik Lundh33accc12000-08-27 20:59:47 +00001052 return i;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001053 ptr--;
1054 count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001055 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001056 }
1057
1058 } else {
1059 /* general case */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 while (count >= (int) pattern[1]) {
1061 state->ptr = ptr;
1062 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001063 if (i)
Fredrik Lundh33accc12000-08-27 20:59:47 +00001064 return i;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001065 ptr--;
1066 count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001067 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001068 }
1069 }
1070 return 0;
1071
Guido van Rossum41c99e72003-04-14 17:59:34 +00001072 case SRE_OP_MIN_REPEAT_ONE:
1073 /* match repeated sequence (minimizing regexp) */
1074
1075 /* this operator only works if the repeated item is
1076 exactly one character wide, and we're not already
1077 collecting backtracking points. for other cases,
1078 use the MIN_REPEAT operator */
1079
1080 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1081
1082 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", pattern, ptr,
1083 pattern[1], pattern[2]));
1084
1085 if (ptr + pattern[1] > end)
1086 return 0; /* cannot match */
1087
1088 state->ptr = ptr;
1089
1090 if (pattern[1] == 0)
1091 count = 0;
1092 else {
1093 /* count using pattern min as the maximum */
1094 count = SRE_COUNT(state, pattern + 3, pattern[1], level + 1);
1095
1096 if (count < 0)
1097 return count; /* exception */
1098 if (count < (int) pattern[1])
1099 return 0; /* did not match minimum number of times */
1100 ptr += count; /* advance past minimum matches of repeat */
1101 }
1102
1103 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
1104 /* tail is empty. we're finished */
1105 state->ptr = ptr;
1106 return 1;
1107
1108 } else {
1109 /* general case */
1110 int matchmax = ((int)pattern[2] == 65535);
1111 int c;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001112 LASTMARK_SAVE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001113 while (matchmax || count <= (int) pattern[2]) {
1114 state->ptr = ptr;
1115 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
1116 if (i)
1117 return i;
1118 state->ptr = ptr;
1119 c = SRE_COUNT(state, pattern+3, 1, level+1);
1120 if (c < 0)
1121 return c;
1122 if (c == 0)
1123 break;
1124 assert(c == 1);
1125 ptr++;
1126 count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001127 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001129 }
1130 return 0;
1131
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001132 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001133 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001134 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001135 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001136 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001137 pattern[1], pattern[2]));
1138
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001139 rep.count = -1;
1140 rep.pattern = pattern;
1141
1142 /* install new repeat context */
1143 rep.prev = state->repeat;
1144 state->repeat = &rep;
1145
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001146 state->ptr = ptr;
1147 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001148
1149 state->repeat = rep.prev;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001150
1151 return i;
1152
1153 case SRE_OP_MAX_UNTIL:
1154 /* maximizing repeat */
1155 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1156
1157 /* FIXME: we probably need to deal with zero-width
1158 matches in here... */
1159
1160 rp = state->repeat;
1161 if (!rp)
1162 return SRE_ERROR_STATE;
1163
1164 state->ptr = ptr;
1165
1166 count = rp->count + 1;
1167
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001168 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001169
1170 if (count < rp->pattern[1]) {
1171 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001172 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001173 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001174 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001175 if (i)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001176 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001177 rp->count = count - 1;
1178 state->ptr = ptr;
1179 return 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001180 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001181
1182 if (count < rp->pattern[2] || rp->pattern[2] == 65535) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001183 /* we may have enough matches, but if we can
1184 match another item, do so */
1185 rp->count = count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001186 LASTMARK_SAVE();
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001187 i = mark_save(state, 0, lastmark, &mark_stack_base);
Fredrik Lundh33accc12000-08-27 20:59:47 +00001188 if (i < 0)
1189 return i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001190 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001191 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001192 if (i)
1193 return i;
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001194 i = mark_restore(state, 0, lastmark, &mark_stack_base);
Fredrik Lundh33accc12000-08-27 20:59:47 +00001195 if (i < 0)
1196 return i;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001197 LASTMARK_RESTORE();
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001198 rp->count = count - 1;
1199 state->ptr = ptr;
1200 }
1201
1202 /* cannot match more repeated items here. make sure the
1203 tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001204 state->repeat = rp->prev;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001205 i = SRE_MATCH(state, pattern, level + 1);
1206 if (i)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001207 return i;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001208 state->repeat = rp;
Fredrik Lundh770617b2001-01-14 15:06:11 +00001209 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001210 return 0;
1211
1212 case SRE_OP_MIN_UNTIL:
1213 /* minimizing repeat */
1214 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1215
1216 rp = state->repeat;
1217 if (!rp)
1218 return SRE_ERROR_STATE;
1219
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001220 state->ptr = ptr;
1221
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001222 count = rp->count + 1;
1223
Fredrik Lundh770617b2001-01-14 15:06:11 +00001224 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count,
1225 rp->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001226
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001227 if (count < rp->pattern[1]) {
1228 /* not enough matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001229 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001230 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001231 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001232 if (i)
1233 return i;
1234 rp->count = count-1;
1235 state->ptr = ptr;
1236 return 0;
1237 }
1238
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001239 LASTMARK_SAVE();
1240
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001241 /* see if the tail matches */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001242 state->repeat = rp->prev;
Fredrik Lundh82b23072001-12-09 16:13:15 +00001243 i = SRE_MATCH(state, pattern, level + 1);
1244 if (i)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001245 return i;
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001246
Fredrik Lundh770617b2001-01-14 15:06:11 +00001247 state->ptr = ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001248 state->repeat = rp;
1249
1250 if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
1251 return 0;
1252
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001253 LASTMARK_RESTORE();
1254
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001255 rp->count = count;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001256 /* RECURSIVE */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001257 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001258 if (i)
1259 return i;
1260 rp->count = count - 1;
Fredrik Lundh770617b2001-01-14 15:06:11 +00001261 state->ptr = ptr;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001262
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001263 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001264
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001265 default:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001266 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1]));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001267 return SRE_ERROR_ILLEGAL;
1268 }
1269 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001270
Sjoerd Mullender89dfe9e2001-08-30 14:37:07 +00001271 /* can't end up here */
Fredrik Lundh21009b92001-09-18 18:47:09 +00001272 /* return SRE_ERROR_ILLEGAL; -- see python-dev discussion */
Guido van Rossumb700df92000-03-31 14:59:30 +00001273}
1274
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001275LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001276SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1277{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001278 SRE_CHAR* ptr = state->start;
1279 SRE_CHAR* end = state->end;
1280 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001281 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001282 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001283 SRE_CODE* prefix = NULL;
1284 SRE_CODE* charset = NULL;
1285 SRE_CODE* overlap = NULL;
1286 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001287
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001288 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001289 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001290 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001291
1292 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001293
1294 if (pattern[3] > 0) {
1295 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001296 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001297 end -= pattern[3]-1;
1298 if (end <= ptr)
1299 end = ptr+1;
1300 }
1301
Fredrik Lundh3562f112000-07-02 12:00:07 +00001302 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001303 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001304 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001305 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001306 prefix_skip = pattern[6];
1307 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001308 overlap = prefix + prefix_len - 1;
1309 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001310 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001311 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001312 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001313
1314 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001315 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001316
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001317 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1318 TRACE(("charset = %p\n", charset));
1319
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001320#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001321 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001322 /* pattern starts with a known prefix. use the overlap
1323 table to skip forward as fast as we possibly can */
1324 int i = 0;
1325 end = state->end;
1326 while (ptr < end) {
1327 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001328 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001329 if (!i)
1330 break;
1331 else
1332 i = overlap[i];
1333 } else {
1334 if (++i == prefix_len) {
1335 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001336 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1337 state->start = ptr + 1 - prefix_len;
1338 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001339 if (flags & SRE_INFO_LITERAL)
1340 return 1; /* we got all of it */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001341 status = SRE_MATCH(state, pattern + 2*prefix_skip, 1);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001342 if (status != 0)
1343 return status;
1344 /* close but no cigar -- try again */
1345 i = overlap[i];
1346 }
1347 break;
1348 }
1349
1350 }
1351 ptr++;
1352 }
1353 return 0;
1354 }
1355#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001356
Fredrik Lundh3562f112000-07-02 12:00:07 +00001357 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001358 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001359 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001360 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001361 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001362 for (;;) {
1363 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1364 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001365 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001366 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001367 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001368 state->start = ptr;
1369 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001370 if (flags & SRE_INFO_LITERAL)
1371 return 1; /* we got all of it */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001372 status = SRE_MATCH(state, pattern + 2, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001373 if (status != 0)
1374 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001375 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001376 } else if (charset) {
1377 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001378 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001379 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001380 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001381 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001382 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001383 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001384 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001385 state->start = ptr;
1386 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001387 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001388 if (status != 0)
1389 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001390 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001391 }
1392 } else
1393 /* general case */
1394 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001395 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001396 state->start = state->ptr = ptr++;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001397 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001398 if (status != 0)
1399 break;
1400 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001401
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001402 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001403}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001404
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001405LOCAL(int)
1406SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1407{
1408 /* check if given string is a literal template (i.e. no escapes) */
1409 while (len-- > 0)
1410 if (*ptr++ == '\\')
1411 return 0;
1412 return 1;
1413}
Guido van Rossumb700df92000-03-31 14:59:30 +00001414
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001415#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001416
1417/* -------------------------------------------------------------------- */
1418/* factories and destructors */
1419
1420/* see sre.h for object declarations */
1421
Jeremy Hylton938ace62002-07-17 16:30:39 +00001422static PyTypeObject Pattern_Type;
1423static PyTypeObject Match_Type;
1424static PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001425
1426static PyObject *
1427_compile(PyObject* self_, PyObject* args)
1428{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001429 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001430
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001431 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001432 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001433
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001434 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001435 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001436 PyObject* code;
1437 int groups = 0;
1438 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001439 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001440 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1441 &PyList_Type, &code, &groups,
1442 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001443 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001444
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001445 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001446
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001447 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001448 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001449 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001450
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001451 self->codesize = n;
1452
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001453 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001454 PyObject *o = PyList_GET_ITEM(code, i);
Martin v. Löwis78e2f062003-04-19 12:56:08 +00001455 if (PyInt_Check(o))
1456 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1457 else
1458 self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001459 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001460
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001461 if (PyErr_Occurred()) {
1462 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001463 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001464 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001465
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001466 Py_INCREF(pattern);
1467 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001468
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001469 self->flags = flags;
1470
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001471 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001472
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001473 Py_XINCREF(groupindex);
1474 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001475
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001476 Py_XINCREF(indexgroup);
1477 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001478
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001479 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001480}
1481
1482static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001483sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001484{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001485 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001486}
1487
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001488static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001489sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001490{
1491 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001492 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001493 return NULL;
1494 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001495 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001496 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001497#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001498 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001499#else
1500 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001501#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001502 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001503}
1504
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001505LOCAL(void)
1506state_reset(SRE_STATE* state)
1507{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001508 state->lastmark = 0;
1509
1510 /* FIXME: dynamic! */
Neal Norwitz35fc7602002-06-13 21:11:11 +00001511 memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001512
1513 state->lastindex = -1;
1514
1515 state->repeat = NULL;
1516
1517 mark_fini(state);
1518}
1519
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001520static void*
1521getstring(PyObject* string, int* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001522{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001523 /* given a python object, return a data pointer, a length (in
1524 characters), and a character size. return NULL if the object
1525 is not a string (or not compatible) */
1526
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001527 PyBufferProcs *buffer;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001528 int size, bytes, charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001529 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001530
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001531#if defined(HAVE_UNICODE)
1532 if (PyUnicode_Check(string)) {
1533 /* unicode strings doesn't always support the buffer interface */
1534 ptr = (void*) PyUnicode_AS_DATA(string);
1535 bytes = PyUnicode_GET_DATA_SIZE(string);
1536 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001537 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001538
1539 } else {
1540#endif
1541
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001542 /* get pointer to string buffer */
1543 buffer = string->ob_type->tp_as_buffer;
1544 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1545 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001546 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001547 return NULL;
1548 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001549
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001550 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001551 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1552 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001553 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1554 return NULL;
1555 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001556
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001557 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001558#if PY_VERSION_HEX >= 0x01060000
1559 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001560#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001561 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001562#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001563
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001564 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001565 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001566#if defined(HAVE_UNICODE)
1567 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001568 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001569#endif
1570 else {
1571 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1572 return NULL;
1573 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001574
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001575#if defined(HAVE_UNICODE)
1576 }
1577#endif
1578
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001579 *p_length = size;
1580 *p_charsize = charsize;
1581
1582 return ptr;
1583}
1584
1585LOCAL(PyObject*)
1586state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1587 int start, int end)
1588{
1589 /* prepare state object */
1590
1591 int length;
1592 int charsize;
1593 void* ptr;
1594
1595 memset(state, 0, sizeof(SRE_STATE));
1596
1597 state->lastindex = -1;
1598
1599 ptr = getstring(string, &length, &charsize);
1600 if (!ptr)
1601 return NULL;
1602
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001603 /* adjust boundaries */
1604 if (start < 0)
1605 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001606 else if (start > length)
1607 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001608
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001609 if (end < 0)
1610 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001611 else if (end > length)
1612 end = length;
1613
1614 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001615
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001616 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001617
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001618 state->start = (void*) ((char*) ptr + start * state->charsize);
1619 state->end = (void*) ((char*) ptr + end * state->charsize);
1620
1621 Py_INCREF(string);
1622 state->string = string;
1623 state->pos = start;
1624 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001625
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001626 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001627 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001628 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001629#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001630 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001631#else
1632 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001633#endif
1634 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001635 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001636
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001637 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001638}
1639
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001640LOCAL(void)
1641state_fini(SRE_STATE* state)
1642{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001643 Py_XDECREF(state->string);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001644 mark_fini(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001645}
1646
/* offset of a pointer into the target string, in characters from the
   start of the string: the byte distance from state->beginning divided
   by the character size.  `member` is a pointer value, and `state` is
   a pointer to the state object (it is evaluated twice) */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1650
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001651LOCAL(PyObject*)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001652state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001653{
Fredrik Lundh58100642000-08-09 09:14:35 +00001654 int i, j;
1655
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001656 index = (index - 1) * 2;
1657
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001658 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001659 if (empty)
1660 /* want empty string */
1661 i = j = 0;
1662 else {
1663 Py_INCREF(Py_None);
1664 return Py_None;
1665 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001666 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001667 i = STATE_OFFSET(state, state->mark[index]);
1668 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001669 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001670
Fredrik Lundh58100642000-08-09 09:14:35 +00001671 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001672}
1673
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001674static void
1675pattern_error(int status)
1676{
1677 switch (status) {
1678 case SRE_ERROR_RECURSION_LIMIT:
1679 PyErr_SetString(
1680 PyExc_RuntimeError,
1681 "maximum recursion limit exceeded"
1682 );
1683 break;
1684 case SRE_ERROR_MEMORY:
1685 PyErr_NoMemory();
1686 break;
1687 default:
1688 /* other error codes indicate compiler/engine bugs */
1689 PyErr_SetString(
1690 PyExc_RuntimeError,
1691 "internal error in regular expression engine"
1692 );
1693 }
1694}
1695
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001696static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001697pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001698{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001699 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001700
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001701 MatchObject* match;
1702 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001703 char* base;
1704 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001705
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001706 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001707
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001708 /* create match object (with room for extra group marks) */
1709 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001710 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001711 if (!match)
1712 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001713
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001714 Py_INCREF(pattern);
1715 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001716
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001717 Py_INCREF(state->string);
1718 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001719
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001720 match->regs = NULL;
1721 match->groups = pattern->groups+1;
1722
1723 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001724
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001725 base = (char*) state->beginning;
1726 n = state->charsize;
1727
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001728 match->mark[0] = ((char*) state->start - base) / n;
1729 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001730
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001731 for (i = j = 0; i < pattern->groups; i++, j+=2)
1732 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1733 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1734 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1735 } else
1736 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1737
1738 match->pos = state->pos;
1739 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001740
Fredrik Lundh6f013982000-07-03 18:44:21 +00001741 match->lastindex = state->lastindex;
1742
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001743 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001744
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001745 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001746
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001747 /* no match */
1748 Py_INCREF(Py_None);
1749 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001750
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001751 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001752
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001753 /* internal error */
1754 pattern_error(status);
1755 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001756}
1757
1758static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001759pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001760{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001761 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001762
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001763 ScannerObject* self;
1764
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001765 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001766 int start = 0;
1767 int end = INT_MAX;
1768 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1769 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001770
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001771 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001772 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001773 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001774 return NULL;
1775
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001776 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001777 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001778 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001779 return NULL;
1780 }
1781
1782 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001783 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001784
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001785 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001786}
1787
Guido van Rossumb700df92000-03-31 14:59:30 +00001788static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001789pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001790{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001791 Py_XDECREF(self->pattern);
1792 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001793 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001794 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001795}
1796
1797static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001798pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001799{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001800 SRE_STATE state;
1801 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001802
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001803 PyObject* string;
1804 int start = 0;
1805 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001806 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1807 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
1808 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001809 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001810
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001811 string = state_init(&state, self, string, start, end);
1812 if (!string)
1813 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001814
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001815 state.ptr = state.start;
1816
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001817 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1818
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001819 if (state.charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001820 status = sre_match(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001821 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001822#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001823 status = sre_umatch(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001824#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001825 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001826
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001827 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1828
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001829 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001830
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001831 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001832}
1833
1834static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001835pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001836{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001837 SRE_STATE state;
1838 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001839
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001840 PyObject* string;
1841 int start = 0;
1842 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001843 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1844 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
1845 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001846 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001847
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001848 string = state_init(&state, self, string, start, end);
1849 if (!string)
1850 return NULL;
1851
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001852 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1853
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001854 if (state.charsize == 1) {
1855 status = sre_search(&state, PatternObject_GetCode(self));
1856 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001857#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001858 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001859#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001860 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001861
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001862 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1863
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001864 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001865
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001866 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001867}
1868
1869static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001870call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001871{
1872 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001873 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001874 PyObject* func;
1875 PyObject* result;
1876
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001877 if (!args)
1878 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001879 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001880 if (!name)
1881 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001882 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001883 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001884 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001885 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001886 func = PyObject_GetAttrString(mod, function);
1887 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001888 if (!func)
1889 return NULL;
1890 result = PyObject_CallObject(func, args);
1891 Py_DECREF(func);
1892 Py_DECREF(args);
1893 return result;
1894}
1895
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001896#ifdef USE_BUILTIN_COPY
1897static int
1898deepcopy(PyObject** object, PyObject* memo)
1899{
1900 PyObject* copy;
1901
1902 copy = call(
1903 "copy", "deepcopy",
1904 Py_BuildValue("OO", *object, memo)
1905 );
1906 if (!copy)
1907 return 0;
1908
1909 Py_DECREF(*object);
1910 *object = copy;
1911
1912 return 1; /* success */
1913}
1914#endif
1915
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001916static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00001917join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001918{
1919 /* join list elements */
1920
1921 PyObject* joiner;
1922#if PY_VERSION_HEX >= 0x01060000
1923 PyObject* function;
1924 PyObject* args;
1925#endif
1926 PyObject* result;
1927
1928 switch (PyList_GET_SIZE(list)) {
1929 case 0:
1930 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00001931 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001932 case 1:
1933 result = PyList_GET_ITEM(list, 0);
1934 Py_INCREF(result);
1935 Py_DECREF(list);
1936 return result;
1937 }
1938
1939 /* two or more elements: slice out a suitable separator from the
1940 first member, and use that to join the entire list */
1941
1942 joiner = PySequence_GetSlice(pattern, 0, 0);
1943 if (!joiner)
1944 return NULL;
1945
1946#if PY_VERSION_HEX >= 0x01060000
1947 function = PyObject_GetAttrString(joiner, "join");
1948 if (!function) {
1949 Py_DECREF(joiner);
1950 return NULL;
1951 }
1952 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001953 if (!args) {
1954 Py_DECREF(function);
1955 Py_DECREF(joiner);
1956 return NULL;
1957 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001958 PyTuple_SET_ITEM(args, 0, list);
1959 result = PyObject_CallObject(function, args);
1960 Py_DECREF(args); /* also removes list */
1961 Py_DECREF(function);
1962#else
1963 result = call(
1964 "string", "join",
1965 Py_BuildValue("OO", list, joiner)
1966 );
1967#endif
1968 Py_DECREF(joiner);
1969
1970 return result;
1971}
1972
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001973static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001974pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001975{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001976 SRE_STATE state;
1977 PyObject* list;
1978 int status;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001979 int i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00001980
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001981 PyObject* string;
1982 int start = 0;
1983 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00001984 static char* kwlist[] = { "source", "pos", "endpos", NULL };
1985 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
1986 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001987 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001988
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001989 string = state_init(&state, self, string, start, end);
1990 if (!string)
1991 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001992
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001993 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001994 if (!list) {
1995 state_fini(&state);
1996 return NULL;
1997 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001998
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001999 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002000
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002001 PyObject* item;
2002
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002003 state_reset(&state);
2004
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002005 state.ptr = state.start;
2006
2007 if (state.charsize == 1) {
2008 status = sre_search(&state, PatternObject_GetCode(self));
2009 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002010#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002011 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002012#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002013 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002014
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002015 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002016 if (status == 0)
2017 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002018 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002019 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002020 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002021
2022 /* don't bother to build a match object */
2023 switch (self->groups) {
2024 case 0:
2025 b = STATE_OFFSET(&state, state.start);
2026 e = STATE_OFFSET(&state, state.ptr);
2027 item = PySequence_GetSlice(string, b, e);
2028 if (!item)
2029 goto error;
2030 break;
2031 case 1:
2032 item = state_getslice(&state, 1, string, 1);
2033 if (!item)
2034 goto error;
2035 break;
2036 default:
2037 item = PyTuple_New(self->groups);
2038 if (!item)
2039 goto error;
2040 for (i = 0; i < self->groups; i++) {
2041 PyObject* o = state_getslice(&state, i+1, string, 1);
2042 if (!o) {
2043 Py_DECREF(item);
2044 goto error;
2045 }
2046 PyTuple_SET_ITEM(item, i, o);
2047 }
2048 break;
2049 }
2050
2051 status = PyList_Append(list, item);
2052 Py_DECREF(item);
2053 if (status < 0)
2054 goto error;
2055
2056 if (state.ptr == state.start)
2057 state.start = (void*) ((char*) state.ptr + state.charsize);
2058 else
2059 state.start = state.ptr;
2060
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002061 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002062
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 state_fini(&state);
2064 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002065
2066error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002067 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 state_fini(&state);
2069 return NULL;
2070
Guido van Rossumb700df92000-03-31 14:59:30 +00002071}
2072
#if PY_VERSION_HEX >= 0x02020000
/* Implement pattern.finditer(): create a scanner from the call arguments
   and return a call-iterator built from the scanner's "search" attribute,
   with Py_None as the sentinel value.  Returns a new reference, or NULL
   if the scanner, the attribute lookup or the iterator creation fails. */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    PyObject* result;
    PyObject* method;
    PyObject* scanner = pattern_scanner(pattern, args);

    if (scanner == NULL)
        return NULL;

    /* our own scanner reference is not needed once "search" is fetched */
    method = PyObject_GetAttrString(scanner, "search");
    Py_DECREF(scanner);
    if (method == NULL)
        return NULL;

    /* may be NULL; the caller sees that as the error return */
    result = PyCallIter_New(method, Py_None);
    Py_DECREF(method);

    return result;
}
#endif
2096
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002097static PyObject*
2098pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2099{
2100 SRE_STATE state;
2101 PyObject* list;
2102 PyObject* item;
2103 int status;
2104 int n;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002105 int i;
2106 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002107
2108 PyObject* string;
2109 int maxsplit = 0;
2110 static char* kwlist[] = { "source", "maxsplit", NULL };
2111 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2112 &string, &maxsplit))
2113 return NULL;
2114
2115 string = state_init(&state, self, string, 0, INT_MAX);
2116 if (!string)
2117 return NULL;
2118
2119 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002120 if (!list) {
2121 state_fini(&state);
2122 return NULL;
2123 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002124
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002125 n = 0;
2126 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002127
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002128 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002129
2130 state_reset(&state);
2131
2132 state.ptr = state.start;
2133
2134 if (state.charsize == 1) {
2135 status = sre_search(&state, PatternObject_GetCode(self));
2136 } else {
2137#if defined(HAVE_UNICODE)
2138 status = sre_usearch(&state, PatternObject_GetCode(self));
2139#endif
2140 }
2141
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002142 if (status <= 0) {
2143 if (status == 0)
2144 break;
2145 pattern_error(status);
2146 goto error;
2147 }
2148
2149 if (state.start == state.ptr) {
2150 if (last == state.end)
2151 break;
2152 /* skip one character */
2153 state.start = (void*) ((char*) state.ptr + state.charsize);
2154 continue;
2155 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002156
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002157 /* get segment before this match */
2158 item = PySequence_GetSlice(
2159 string, STATE_OFFSET(&state, last),
2160 STATE_OFFSET(&state, state.start)
2161 );
2162 if (!item)
2163 goto error;
2164 status = PyList_Append(list, item);
2165 Py_DECREF(item);
2166 if (status < 0)
2167 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002168
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002169 /* add groups (if any) */
2170 for (i = 0; i < self->groups; i++) {
2171 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002172 if (!item)
2173 goto error;
2174 status = PyList_Append(list, item);
2175 Py_DECREF(item);
2176 if (status < 0)
2177 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002178 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002179
2180 n = n + 1;
2181
2182 last = state.start = state.ptr;
2183
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002184 }
2185
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002186 /* get segment following last match (even if empty) */
2187 item = PySequence_GetSlice(
2188 string, STATE_OFFSET(&state, last), state.endpos
2189 );
2190 if (!item)
2191 goto error;
2192 status = PyList_Append(list, item);
2193 Py_DECREF(item);
2194 if (status < 0)
2195 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002196
2197 state_fini(&state);
2198 return list;
2199
2200error:
2201 Py_DECREF(list);
2202 state_fini(&state);
2203 return NULL;
2204
2205}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002206
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002207static PyObject*
2208pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2209 int count, int subn)
2210{
2211 SRE_STATE state;
2212 PyObject* list;
2213 PyObject* item;
2214 PyObject* filter;
2215 PyObject* args;
2216 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002217 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002218 int status;
2219 int n;
2220 int i, b, e;
2221 int filter_is_callable;
2222
Fredrik Lundhdac58492001-10-21 21:48:30 +00002223 if (PyCallable_Check(template)) {
2224 /* sub/subn takes either a function or a template */
2225 filter = template;
2226 Py_INCREF(filter);
2227 filter_is_callable = 1;
2228 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002229 /* if not callable, check if it's a literal string */
2230 int literal;
2231 ptr = getstring(template, &n, &b);
2232 if (ptr) {
2233 if (b == 1) {
2234 literal = sre_literal_template(ptr, n);
2235 } else {
2236#if defined(HAVE_UNICODE)
2237 literal = sre_uliteral_template(ptr, n);
2238#endif
2239 }
2240 } else {
2241 PyErr_Clear();
2242 literal = 0;
2243 }
2244 if (literal) {
2245 filter = template;
2246 Py_INCREF(filter);
2247 filter_is_callable = 0;
2248 } else {
2249 /* not a literal; hand it over to the template compiler */
2250 filter = call(
2251 SRE_MODULE, "_subx",
2252 Py_BuildValue("OO", self, template)
2253 );
2254 if (!filter)
2255 return NULL;
2256 filter_is_callable = PyCallable_Check(filter);
2257 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002258 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002259
2260 string = state_init(&state, self, string, 0, INT_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002261 if (!string) {
2262 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002263 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002264 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002265
2266 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002267 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002268 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002269 state_fini(&state);
2270 return NULL;
2271 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002272
2273 n = i = 0;
2274
2275 while (!count || n < count) {
2276
2277 state_reset(&state);
2278
2279 state.ptr = state.start;
2280
2281 if (state.charsize == 1) {
2282 status = sre_search(&state, PatternObject_GetCode(self));
2283 } else {
2284#if defined(HAVE_UNICODE)
2285 status = sre_usearch(&state, PatternObject_GetCode(self));
2286#endif
2287 }
2288
2289 if (status <= 0) {
2290 if (status == 0)
2291 break;
2292 pattern_error(status);
2293 goto error;
2294 }
2295
2296 b = STATE_OFFSET(&state, state.start);
2297 e = STATE_OFFSET(&state, state.ptr);
2298
2299 if (i < b) {
2300 /* get segment before this match */
2301 item = PySequence_GetSlice(string, i, b);
2302 if (!item)
2303 goto error;
2304 status = PyList_Append(list, item);
2305 Py_DECREF(item);
2306 if (status < 0)
2307 goto error;
2308
2309 } else if (i == b && i == e && n > 0)
2310 /* ignore empty match on latest position */
2311 goto next;
2312
2313 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002314 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002315 match = pattern_new_match(self, &state, 1);
2316 if (!match)
2317 goto error;
2318 args = Py_BuildValue("(O)", match);
2319 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002320 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002321 goto error;
2322 }
2323 item = PyObject_CallObject(filter, args);
2324 Py_DECREF(args);
2325 Py_DECREF(match);
2326 if (!item)
2327 goto error;
2328 } else {
2329 /* filter is literal string */
2330 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002331 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002332 }
2333
2334 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002335 if (item != Py_None) {
2336 status = PyList_Append(list, item);
2337 Py_DECREF(item);
2338 if (status < 0)
2339 goto error;
2340 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002341
2342 i = e;
2343 n = n + 1;
2344
2345next:
2346 /* move on */
2347 if (state.ptr == state.start)
2348 state.start = (void*) ((char*) state.ptr + state.charsize);
2349 else
2350 state.start = state.ptr;
2351
2352 }
2353
2354 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002355 if (i < state.endpos) {
2356 item = PySequence_GetSlice(string, i, state.endpos);
2357 if (!item)
2358 goto error;
2359 status = PyList_Append(list, item);
2360 Py_DECREF(item);
2361 if (status < 0)
2362 goto error;
2363 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002364
2365 state_fini(&state);
2366
Guido van Rossum4e173842001-12-07 04:25:10 +00002367 Py_DECREF(filter);
2368
Fredrik Lundhdac58492001-10-21 21:48:30 +00002369 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002370 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002371
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002372 if (!item)
2373 return NULL;
2374
2375 if (subn)
2376 return Py_BuildValue("Ni", item, n);
2377
2378 return item;
2379
2380error:
2381 Py_DECREF(list);
2382 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002383 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002384 return NULL;
2385
2386}
2387
2388static PyObject*
2389pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2390{
2391 PyObject* template;
2392 PyObject* string;
2393 int count = 0;
2394 static char* kwlist[] = { "repl", "string", "count", NULL };
2395 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2396 &template, &string, &count))
2397 return NULL;
2398
2399 return pattern_subx(self, template, string, count, 0);
2400}
2401
2402static PyObject*
2403pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2404{
2405 PyObject* template;
2406 PyObject* string;
2407 int count = 0;
2408 static char* kwlist[] = { "repl", "string", "count", NULL };
2409 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2410 &template, &string, &count))
2411 return NULL;
2412
2413 return pattern_subx(self, template, string, count, 1);
2414}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002415
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002416static PyObject*
2417pattern_copy(PatternObject* self, PyObject* args)
2418{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002419#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002420 PatternObject* copy;
2421 int offset;
2422
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002423 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2424 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002425
2426 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2427 if (!copy)
2428 return NULL;
2429
2430 offset = offsetof(PatternObject, groups);
2431
2432 Py_XINCREF(self->groupindex);
2433 Py_XINCREF(self->indexgroup);
2434 Py_XINCREF(self->pattern);
2435
2436 memcpy((char*) copy + offset, (char*) self + offset,
2437 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2438
2439 return (PyObject*) copy;
2440#else
2441 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2442 return NULL;
2443#endif
2444}
2445
2446static PyObject*
2447pattern_deepcopy(PatternObject* self, PyObject* args)
2448{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002449#ifdef USE_BUILTIN_COPY
2450 PatternObject* copy;
2451
2452 PyObject* memo;
2453 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2454 return NULL;
2455
2456 copy = (PatternObject*) pattern_copy(self, Py_None);
2457 if (!copy)
2458 return NULL;
2459
2460 if (!deepcopy(&copy->groupindex, memo) ||
2461 !deepcopy(&copy->indexgroup, memo) ||
2462 !deepcopy(&copy->pattern, memo)) {
2463 Py_DECREF(copy);
2464 return NULL;
2465 }
2466
2467#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002468 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2469 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002470#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002471}
2472
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002473static PyMethodDef pattern_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002474 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
2475 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
2476 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
2477 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
2478 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
2479 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002480#if PY_VERSION_HEX >= 0x02020000
2481 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS},
2482#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002483 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002484 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2485 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002486 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002487};
2488
2489static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002490pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002491{
2492 PyObject* res;
2493
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002494 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002495
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002496 if (res)
2497 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002498
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002499 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002500
2501 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002502 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002503 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002504 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002505 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002506
2507 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002508 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002509
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002510 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002511 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002512
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002513 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002514 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002515 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002516 }
2517
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002518 PyErr_SetString(PyExc_AttributeError, name);
2519 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002520}
2521
2522statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002523 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002524 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002525 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002526 (destructor)pattern_dealloc, /*tp_dealloc*/
2527 0, /*tp_print*/
2528 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002529};
2530
2531/* -------------------------------------------------------------------- */
2532/* match methods */
2533
2534static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002535match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002536{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002537 Py_XDECREF(self->regs);
2538 Py_XDECREF(self->string);
2539 Py_DECREF(self->pattern);
2540 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002541}
2542
2543static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002544match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002545{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002546 if (index < 0 || index >= self->groups) {
2547 /* raise IndexError if we were given a bad group number */
2548 PyErr_SetString(
2549 PyExc_IndexError,
2550 "no such group"
2551 );
2552 return NULL;
2553 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002554
Fredrik Lundh6f013982000-07-03 18:44:21 +00002555 index *= 2;
2556
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002557 if (self->string == Py_None || self->mark[index] < 0) {
2558 /* return default value if the string or group is undefined */
2559 Py_INCREF(def);
2560 return def;
2561 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002562
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002563 return PySequence_GetSlice(
2564 self->string, self->mark[index], self->mark[index+1]
2565 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002566}
2567
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002568static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002569match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002570{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002571 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002572
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002573 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002574 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002575
Fredrik Lundh6f013982000-07-03 18:44:21 +00002576 i = -1;
2577
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002578 if (self->pattern->groupindex) {
2579 index = PyObject_GetItem(self->pattern->groupindex, index);
2580 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002581 if (PyInt_Check(index))
2582 i = (int) PyInt_AS_LONG(index);
2583 Py_DECREF(index);
2584 } else
2585 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002586 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002587
2588 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002589}
2590
2591static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002592match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002593{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002594 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002595}
2596
2597static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002598match_expand(MatchObject* self, PyObject* args)
2599{
2600 PyObject* template;
2601 if (!PyArg_ParseTuple(args, "O:expand", &template))
2602 return NULL;
2603
2604 /* delegate to Python code */
2605 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002606 SRE_MODULE, "_expand",
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002607 Py_BuildValue("OOO", self->pattern, self, template)
2608 );
2609}
2610
2611static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002612match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002613{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002614 PyObject* result;
2615 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002616
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002617 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002618
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002619 switch (size) {
2620 case 0:
2621 result = match_getslice(self, Py_False, Py_None);
2622 break;
2623 case 1:
2624 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2625 break;
2626 default:
2627 /* fetch multiple items */
2628 result = PyTuple_New(size);
2629 if (!result)
2630 return NULL;
2631 for (i = 0; i < size; i++) {
2632 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002633 self, PyTuple_GET_ITEM(args, i), Py_None
2634 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002635 if (!item) {
2636 Py_DECREF(result);
2637 return NULL;
2638 }
2639 PyTuple_SET_ITEM(result, i, item);
2640 }
2641 break;
2642 }
2643 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002644}
2645
2646static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002647match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002648{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002649 PyObject* result;
2650 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002651
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002652 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002653 static char* kwlist[] = { "default", NULL };
2654 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002655 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002656
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002657 result = PyTuple_New(self->groups-1);
2658 if (!result)
2659 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002660
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002661 for (index = 1; index < self->groups; index++) {
2662 PyObject* item;
2663 item = match_getslice_by_index(self, index, def);
2664 if (!item) {
2665 Py_DECREF(result);
2666 return NULL;
2667 }
2668 PyTuple_SET_ITEM(result, index-1, item);
2669 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002670
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002671 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002672}
2673
2674static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002675match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002676{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002677 PyObject* result;
2678 PyObject* keys;
2679 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002680
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002681 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002682 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002683 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002684 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002685
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002686 result = PyDict_New();
2687 if (!result || !self->pattern->groupindex)
2688 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002689
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002690 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002691 if (!keys)
2692 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002693
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002694 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002695 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002696 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002697 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002698 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002699 if (!key)
2700 goto failed;
2701 value = match_getslice(self, key, def);
2702 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002703 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002704 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002705 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002706 status = PyDict_SetItem(result, key, value);
2707 Py_DECREF(value);
2708 if (status < 0)
2709 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002710 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002711
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002712 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002713
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002714 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002715
2716failed:
2717 Py_DECREF(keys);
2718 Py_DECREF(result);
2719 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002720}
2721
2722static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002723match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002724{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002725 int index;
2726
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002727 PyObject* index_ = Py_False; /* zero */
2728 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2729 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002730
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002731 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002732
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002733 if (index < 0 || index >= self->groups) {
2734 PyErr_SetString(
2735 PyExc_IndexError,
2736 "no such group"
2737 );
2738 return NULL;
2739 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002740
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002741 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002742 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002743}
2744
2745static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002746match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002747{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002748 int index;
2749
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002750 PyObject* index_ = Py_False; /* zero */
2751 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2752 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002753
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002754 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002755
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002756 if (index < 0 || index >= self->groups) {
2757 PyErr_SetString(
2758 PyExc_IndexError,
2759 "no such group"
2760 );
2761 return NULL;
2762 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002763
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002764 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002765 return Py_BuildValue("i", self->mark[index*2+1]);
2766}
2767
2768LOCAL(PyObject*)
2769_pair(int i1, int i2)
2770{
2771 PyObject* pair;
2772 PyObject* item;
2773
2774 pair = PyTuple_New(2);
2775 if (!pair)
2776 return NULL;
2777
2778 item = PyInt_FromLong(i1);
2779 if (!item)
2780 goto error;
2781 PyTuple_SET_ITEM(pair, 0, item);
2782
2783 item = PyInt_FromLong(i2);
2784 if (!item)
2785 goto error;
2786 PyTuple_SET_ITEM(pair, 1, item);
2787
2788 return pair;
2789
2790 error:
2791 Py_DECREF(pair);
2792 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002793}
2794
2795static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002796match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002797{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002798 int index;
2799
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002800 PyObject* index_ = Py_False; /* zero */
2801 if (!PyArg_ParseTuple(args, "|O:span", &index_))
2802 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002803
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002804 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002805
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002806 if (index < 0 || index >= self->groups) {
2807 PyErr_SetString(
2808 PyExc_IndexError,
2809 "no such group"
2810 );
2811 return NULL;
2812 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002813
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002814 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002815 return _pair(self->mark[index*2], self->mark[index*2+1]);
2816}
2817
2818static PyObject*
2819match_regs(MatchObject* self)
2820{
2821 PyObject* regs;
2822 PyObject* item;
2823 int index;
2824
2825 regs = PyTuple_New(self->groups);
2826 if (!regs)
2827 return NULL;
2828
2829 for (index = 0; index < self->groups; index++) {
2830 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2831 if (!item) {
2832 Py_DECREF(regs);
2833 return NULL;
2834 }
2835 PyTuple_SET_ITEM(regs, index, item);
2836 }
2837
2838 Py_INCREF(regs);
2839 self->regs = regs;
2840
2841 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002842}
2843
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002844static PyObject*
2845match_copy(MatchObject* self, PyObject* args)
2846{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002847#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002848 MatchObject* copy;
2849 int slots, offset;
2850
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002851 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2852 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002853
2854 slots = 2 * (self->pattern->groups+1);
2855
2856 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2857 if (!copy)
2858 return NULL;
2859
2860 /* this value a constant, but any compiler should be able to
2861 figure that out all by itself */
2862 offset = offsetof(MatchObject, string);
2863
2864 Py_XINCREF(self->pattern);
2865 Py_XINCREF(self->string);
2866 Py_XINCREF(self->regs);
2867
2868 memcpy((char*) copy + offset, (char*) self + offset,
2869 sizeof(MatchObject) + slots * sizeof(int) - offset);
2870
2871 return (PyObject*) copy;
2872#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002873 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002874 return NULL;
2875#endif
2876}
2877
2878static PyObject*
2879match_deepcopy(MatchObject* self, PyObject* args)
2880{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002881#ifdef USE_BUILTIN_COPY
2882 MatchObject* copy;
2883
2884 PyObject* memo;
2885 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2886 return NULL;
2887
2888 copy = (MatchObject*) match_copy(self, Py_None);
2889 if (!copy)
2890 return NULL;
2891
2892 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2893 !deepcopy(&copy->string, memo) ||
2894 !deepcopy(&copy->regs, memo)) {
2895 Py_DECREF(copy);
2896 return NULL;
2897 }
2898
2899#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002900 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2901 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002902#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002903}
2904
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002905static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002906 {"group", (PyCFunction) match_group, METH_VARARGS},
2907 {"start", (PyCFunction) match_start, METH_VARARGS},
2908 {"end", (PyCFunction) match_end, METH_VARARGS},
2909 {"span", (PyCFunction) match_span, METH_VARARGS},
2910 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
2911 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
2912 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002913 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
2914 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002915 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002916};
2917
2918static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002919match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002920{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002921 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002922
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002923 res = Py_FindMethod(match_methods, (PyObject*) self, name);
2924 if (res)
2925 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002927 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002928
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002929 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002930 if (self->lastindex >= 0)
2931 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00002932 Py_INCREF(Py_None);
2933 return Py_None;
2934 }
2935
2936 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002937 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00002938 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00002939 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00002940 );
2941 if (result)
2942 return result;
2943 PyErr_Clear();
2944 }
2945 Py_INCREF(Py_None);
2946 return Py_None;
2947 }
2948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002949 if (!strcmp(name, "string")) {
2950 if (self->string) {
2951 Py_INCREF(self->string);
2952 return self->string;
2953 } else {
2954 Py_INCREF(Py_None);
2955 return Py_None;
2956 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002957 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002958
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002959 if (!strcmp(name, "regs")) {
2960 if (self->regs) {
2961 Py_INCREF(self->regs);
2962 return self->regs;
2963 } else
2964 return match_regs(self);
2965 }
2966
2967 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002968 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002969 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002970 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002971
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002972 if (!strcmp(name, "pos"))
2973 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002974
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002975 if (!strcmp(name, "endpos"))
2976 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00002977
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002978 PyErr_SetString(PyExc_AttributeError, name);
2979 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002980}
2981
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2984
2985statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002986 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002987 0, "_" SRE_MODULE ".SRE_Match",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002988 sizeof(MatchObject), sizeof(int),
2989 (destructor)match_dealloc, /*tp_dealloc*/
2990 0, /*tp_print*/
2991 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002992};
2993
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002994/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002995/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002996
2997static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002998scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002999{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003000 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003001 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003002 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003003}
3004
3005static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003006scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003007{
3008 SRE_STATE* state = &self->state;
3009 PyObject* match;
3010 int status;
3011
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003012 state_reset(state);
3013
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003014 state->ptr = state->start;
3015
3016 if (state->charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00003017 status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003018 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003019#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00003020 status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003021#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003022 }
3023
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003024 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003025 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003026
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003027 if ((status == 0 || state->ptr == state->start) &&
3028 state->ptr < state->end)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003029 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003030 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003031 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003032
3033 return match;
3034}
3035
3036
3037static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003038scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003039{
3040 SRE_STATE* state = &self->state;
3041 PyObject* match;
3042 int status;
3043
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003044 state_reset(state);
3045
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003046 state->ptr = state->start;
3047
3048 if (state->charsize == 1) {
3049 status = sre_search(state, PatternObject_GetCode(self->pattern));
3050 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003051#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003052 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003053#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003054 }
3055
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003056 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003057 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003058
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003059 if ((status == 0 || state->ptr == state->start) &&
3060 state->ptr < state->end)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003061 state->start = (void*) ((char*) state->ptr + state->charsize);
3062 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003063 state->start = state->ptr;
3064
3065 return match;
3066}
3067
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003068static PyMethodDef scanner_methods[] = {
Neal Norwitzbb2769f2002-03-31 15:46:00 +00003069 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3070 /* METH_OLDARGS is not in Python 1.5.2 */
3071 {"match", (PyCFunction) scanner_match, 0},
3072 {"search", (PyCFunction) scanner_search, 0},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003073 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003074};
3075
3076static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003077scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003078{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003079 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003080
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003081 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3082 if (res)
3083 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003084
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003085 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003086
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003087 /* attributes */
3088 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003089 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003090 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003091 }
3092
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003093 PyErr_SetString(PyExc_AttributeError, name);
3094 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003095}
3096
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003097statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003098 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003099 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003100 sizeof(ScannerObject), 0,
3101 (destructor)scanner_dealloc, /*tp_dealloc*/
3102 0, /*tp_print*/
3103 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003104};
3105
Guido van Rossumb700df92000-03-31 14:59:30 +00003106static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003107 {"compile", _compile, METH_VARARGS},
3108 {"getcodesize", sre_codesize, METH_VARARGS},
3109 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003110 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003111};
3112
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003113#if PY_VERSION_HEX < 0x02030000
3114DL_EXPORT(void) init_sre(void)
3115#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003116PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003117#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003118{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003119 PyObject* m;
3120 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003121 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003122
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003123 /* Patch object types */
3124 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003125 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00003126
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003127 m = Py_InitModule("_" SRE_MODULE, _functions);
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003128 d = PyModule_GetDict(m);
3129
Fredrik Lundh21009b92001-09-18 18:47:09 +00003130 x = PyInt_FromLong(SRE_MAGIC);
3131 if (x) {
3132 PyDict_SetItemString(d, "MAGIC", x);
3133 Py_DECREF(x);
3134 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003135
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003136 x = PyInt_FromLong(sizeof(SRE_CODE));
3137 if (x) {
3138 PyDict_SetItemString(d, "CODESIZE", x);
3139 Py_DECREF(x);
3140 }
3141
Fredrik Lundh21009b92001-09-18 18:47:09 +00003142 x = PyString_FromString(copyright);
3143 if (x) {
3144 PyDict_SetItemString(d, "copyright", x);
3145 Py_DECREF(x);
3146 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003147}
3148
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003149#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003150
3151/* vim:ts=4:sw=4:et
3152*/