blob: 74e3dbf251209d199f6f370846dd294291f9d5bf [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
42#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000043#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000044
45#include "sre.h"
46
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000047#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000048
Fredrik Lundh436c3d582000-06-29 08:58:44 +000049/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000050#if !defined(SRE_MODULE)
51#define SRE_MODULE "sre"
52#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000053
Guido van Rossumb700df92000-03-31 14:59:30 +000054/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000055#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000056
Fredrik Lundh971e78b2001-10-20 17:48:46 +000057#if PY_VERSION_HEX >= 0x01060000
58#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000059/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000060#define HAVE_UNICODE
61#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000062#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
Fredrik Lundh33accc12000-08-27 20:59:47 +000067/* prevent run-away recursion (bad patterns on long strings) */
68
/* A fixed recursion limit is only selected when USE_STACKCHECK is not
   defined. */
#if !defined(USE_STACKCHECK)

#if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
/* a number of 64-bit platforms require a smaller recursion limit:
   Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
/* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
#define USE_RECURSION_LIMIT 7500

#elif defined(__GNUC__) && defined(WITH_THREAD) && defined(__FreeBSD__)
/* the pthreads library on FreeBSD has a fixed 1MB stack size for the
 * initial (or "primary") thread, which is insufficient for the default
 * recursion limit.  gcc 3.x at the default optimisation
 * level (-O3) uses stack space more aggressively than gcc 2.95.
 */
#if (__GNUC__ > 2)
#define USE_RECURSION_LIMIT 6500
#else
#define USE_RECURSION_LIMIT 7500
#endif

#else
/* default limit for every other platform */
#define USE_RECURSION_LIMIT 10000
#endif

#endif /* !defined(USE_STACKCHECK) */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000094
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000095/* enables usage of recursive scheme */
96#undef USE_RECURSION
97
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000098/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000099#define USE_FAST_SEARCH
100
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000101/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000102#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000103
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000104/* enables copy/deepcopy handling (work in progress) */
105#undef USE_BUILTIN_COPY
106
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000107#if PY_VERSION_HEX < 0x01060000
108#define PyObject_DEL(op) PyMem_DEL((op))
109#endif
110
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000111/* -------------------------------------------------------------------- */
112
Fredrik Lundh80946112000-06-29 18:03:25 +0000113#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +0000114#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +0000115#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +0000116/* fastest possible local call under MSVC */
117#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000118#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000119#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000120#else
121#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +0000122#endif
123
124/* error codes */
125#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000126#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000127#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +0000128#define SRE_ERROR_MEMORY -9 /* out of memory */
129
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000130#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000131#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000132#else
133#define TRACE(v)
134#endif
135
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000136/* -------------------------------------------------------------------- */
137/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000138
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000139/* default character predicates (run sre_chars.py to regenerate tables) */
140
/* bit flags stored for each character in sre_char_info */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* classification flags for character codes 0..127, sixteen per row */
static char sre_char_info[128] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x20 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /* 0x60 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};

/* lower-case mapping for character codes 0..127; only 'A'..'Z' change */
static char sre_char_lower[128] = {
    /* 0x00 */   0,   1,   2,   3,   4,   5,   6,   7,
                 8,   9,  10,  11,  12,  13,  14,  15,
    /* 0x10 */  16,  17,  18,  19,  20,  21,  22,  23,
                24,  25,  26,  27,  28,  29,  30,  31,
    /* 0x20 */  32,  33,  34,  35,  36,  37,  38,  39,
                40,  41,  42,  43,  44,  45,  46,  47,
    /* 0x30 */  48,  49,  50,  51,  52,  53,  54,  55,
                56,  57,  58,  59,  60,  61,  62,  63,
    /* 0x40 */  64,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122,  91,  92,  93,  94,  95,
    /* 0x60 */  96,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122, 123, 124, 125, 126, 127
};

/* table-driven predicates: each yields the matching mask bit (non-zero)
   when ch is below 128 and has the property, and 0 otherwise.  note that
   ch is evaluated more than once. */
#define SRE_IS_DIGIT(ch) \
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch) \
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch) \
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch) \
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch) \
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

/* map ch to lower case using the table above; codes of 128 and up are
   returned unchanged */
static unsigned int sre_lower(unsigned int ch)
{
    if (ch >= 128)
        return ch;
    return sre_char_lower[ch];
}
182
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000183/* locale-specific character predicates */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000184
/* <ctype.h> based predicates; codes of 256 and up never match.  note
   that ch is evaluated more than once. */
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* lower-case ch with tolower(); codes of 256 and up are returned
   unchanged */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256)
        return tolower(ch);
    return ch;
}
195
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000196/* unicode-specific character predicates */
197
#if defined(HAVE_UNICODE)

/* predicates that forward to the Py_UNICODE_* classification macros,
   after converting ch to Py_UNICODE */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

/* lower-case ch through Py_UNICODE_TOLOWER */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    Py_UNICODE uch = (Py_UNICODE)(ch);

    return (unsigned int) Py_UNICODE_TOLOWER(uch);
}

#endif /* HAVE_UNICODE */
212
Guido van Rossumb700df92000-03-31 14:59:30 +0000213LOCAL(int)
214sre_category(SRE_CODE category, unsigned int ch)
215{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000216 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000217
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000218 case SRE_CATEGORY_DIGIT:
219 return SRE_IS_DIGIT(ch);
220 case SRE_CATEGORY_NOT_DIGIT:
221 return !SRE_IS_DIGIT(ch);
222 case SRE_CATEGORY_SPACE:
223 return SRE_IS_SPACE(ch);
224 case SRE_CATEGORY_NOT_SPACE:
225 return !SRE_IS_SPACE(ch);
226 case SRE_CATEGORY_WORD:
227 return SRE_IS_WORD(ch);
228 case SRE_CATEGORY_NOT_WORD:
229 return !SRE_IS_WORD(ch);
230 case SRE_CATEGORY_LINEBREAK:
231 return SRE_IS_LINEBREAK(ch);
232 case SRE_CATEGORY_NOT_LINEBREAK:
233 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000234
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000235 case SRE_CATEGORY_LOC_WORD:
236 return SRE_LOC_IS_WORD(ch);
237 case SRE_CATEGORY_LOC_NOT_WORD:
238 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000239
240#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000241 case SRE_CATEGORY_UNI_DIGIT:
242 return SRE_UNI_IS_DIGIT(ch);
243 case SRE_CATEGORY_UNI_NOT_DIGIT:
244 return !SRE_UNI_IS_DIGIT(ch);
245 case SRE_CATEGORY_UNI_SPACE:
246 return SRE_UNI_IS_SPACE(ch);
247 case SRE_CATEGORY_UNI_NOT_SPACE:
248 return !SRE_UNI_IS_SPACE(ch);
249 case SRE_CATEGORY_UNI_WORD:
250 return SRE_UNI_IS_WORD(ch);
251 case SRE_CATEGORY_UNI_NOT_WORD:
252 return !SRE_UNI_IS_WORD(ch);
253 case SRE_CATEGORY_UNI_LINEBREAK:
254 return SRE_UNI_IS_LINEBREAK(ch);
255 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
256 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000257#else
258 case SRE_CATEGORY_UNI_DIGIT:
259 return SRE_IS_DIGIT(ch);
260 case SRE_CATEGORY_UNI_NOT_DIGIT:
261 return !SRE_IS_DIGIT(ch);
262 case SRE_CATEGORY_UNI_SPACE:
263 return SRE_IS_SPACE(ch);
264 case SRE_CATEGORY_UNI_NOT_SPACE:
265 return !SRE_IS_SPACE(ch);
266 case SRE_CATEGORY_UNI_WORD:
267 return SRE_LOC_IS_WORD(ch);
268 case SRE_CATEGORY_UNI_NOT_WORD:
269 return !SRE_LOC_IS_WORD(ch);
270 case SRE_CATEGORY_UNI_LINEBREAK:
271 return SRE_IS_LINEBREAK(ch);
272 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
273 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000274#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000275 }
276 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000277}
278
279/* helpers */
280
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000281static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000282data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000283{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000284 if (state->data_stack) {
285 free(state->data_stack);
286 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000287 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000288 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000289}
290
291static int
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000292data_stack_grow(SRE_STATE* state, int size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000293{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000294 int minsize, cursize;
295 minsize = state->data_stack_base+size;
296 cursize = state->data_stack_size;
297 if (cursize < minsize) {
298 void* stack;
299 cursize = minsize+minsize/4+1024;
300 TRACE(("allocate/grow stack %d\n", cursize));
301 stack = realloc(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000302 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000303 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000304 return SRE_ERROR_MEMORY;
305 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000306 state->data_stack = stack;
307 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000308 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000309 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000310}
311
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000312/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000313
/* names used by the engine section for 8-bit strings */
#define SRE_CHAR unsigned char
#define SRE_AT sre_at
#define SRE_CHARSET sre_charset
#define SRE_COUNT sre_count
#define SRE_INFO sre_info
#define SRE_LITERAL_TEMPLATE sre_literal_template
#define SRE_MATCH sre_match
#define SRE_MATCH_CONTEXT sre_match_context
#define SRE_SEARCH sre_search

#if defined(HAVE_UNICODE)

/* include this file into itself; with SRE_RECURSIVE defined, the parts
   guarded by "#ifndef SRE_RECURSIVE" are left out, so only the engine
   section is compiled, using the 8-bit names above */
#define SRE_RECURSIVE
#include "_sre.c"
#undef SRE_RECURSIVE

/* forget the 8-bit names */
#undef SRE_SEARCH
#undef SRE_MATCH_CONTEXT
#undef SRE_MATCH
#undef SRE_LITERAL_TEMPLATE
#undef SRE_INFO
#undef SRE_COUNT
#undef SRE_CHARSET
#undef SRE_AT
#undef SRE_CHAR

/* generate unicode version: the rest of this file is compiled with
   these names */
#define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat
#define SRE_CHARSET sre_ucharset
#define SRE_COUNT sre_ucount
#define SRE_INFO sre_uinfo
#define SRE_LITERAL_TEMPLATE sre_uliteral_template
#define SRE_MATCH sre_umatch
#define SRE_MATCH_CONTEXT sre_umatch_context
#define SRE_SEARCH sre_usearch

#endif /* HAVE_UNICODE */
Guido van Rossumb700df92000-03-31 14:59:30 +0000352
353#endif /* SRE_RECURSIVE */
354
355/* -------------------------------------------------------------------- */
356/* String matching engine */
357
358/* the following section is compiled twice, with different character
359 settings */
360
361LOCAL(int)
362SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
363{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000364 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000365
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000366 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000367
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000368 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000369
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000370 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000371 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000372 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000373
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000374 case SRE_AT_BEGINNING_LINE:
375 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000376 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000377
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000378 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000379 return (((void*) (ptr+1) == state->end &&
380 SRE_IS_LINEBREAK((int) ptr[0])) ||
381 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000382
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000383 case SRE_AT_END_LINE:
384 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000385 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000386
Fredrik Lundh770617b2001-01-14 15:06:11 +0000387 case SRE_AT_END_STRING:
388 return ((void*) ptr == state->end);
389
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000390 case SRE_AT_BOUNDARY:
391 if (state->beginning == state->end)
392 return 0;
393 that = ((void*) ptr > state->beginning) ?
394 SRE_IS_WORD((int) ptr[-1]) : 0;
395 this = ((void*) ptr < state->end) ?
396 SRE_IS_WORD((int) ptr[0]) : 0;
397 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000398
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000399 case SRE_AT_NON_BOUNDARY:
400 if (state->beginning == state->end)
401 return 0;
402 that = ((void*) ptr > state->beginning) ?
403 SRE_IS_WORD((int) ptr[-1]) : 0;
404 this = ((void*) ptr < state->end) ?
405 SRE_IS_WORD((int) ptr[0]) : 0;
406 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000407
408 case SRE_AT_LOC_BOUNDARY:
409 if (state->beginning == state->end)
410 return 0;
411 that = ((void*) ptr > state->beginning) ?
412 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
413 this = ((void*) ptr < state->end) ?
414 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
415 return this != that;
416
417 case SRE_AT_LOC_NON_BOUNDARY:
418 if (state->beginning == state->end)
419 return 0;
420 that = ((void*) ptr > state->beginning) ?
421 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
422 this = ((void*) ptr < state->end) ?
423 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
424 return this == that;
425
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000426#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000427 case SRE_AT_UNI_BOUNDARY:
428 if (state->beginning == state->end)
429 return 0;
430 that = ((void*) ptr > state->beginning) ?
431 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
432 this = ((void*) ptr < state->end) ?
433 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
434 return this != that;
435
436 case SRE_AT_UNI_NON_BOUNDARY:
437 if (state->beginning == state->end)
438 return 0;
439 that = ((void*) ptr > state->beginning) ?
440 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
441 this = ((void*) ptr < state->end) ?
442 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
443 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000444#endif
445
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000446 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000447
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000448 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000449}
450
451LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000452SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000453{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000454 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000455
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000456 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000457
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000458 for (;;) {
459 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000460
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000461 case SRE_OP_FAILURE:
462 return !ok;
463
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000464 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000465 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000466 if (ch == set[0])
467 return ok;
468 set++;
469 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000470
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000471 case SRE_OP_CATEGORY:
472 /* <CATEGORY> <code> */
473 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000474 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000475 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000476 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000477
Fredrik Lundh3562f112000-07-02 12:00:07 +0000478 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000479 if (sizeof(SRE_CODE) == 2) {
480 /* <CHARSET> <bitmap> (16 bits per code word) */
481 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
482 return ok;
483 set += 16;
484 }
485 else {
486 /* <CHARSET> <bitmap> (32 bits per code word) */
487 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
488 return ok;
489 set += 8;
490 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000491 break;
492
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000493 case SRE_OP_RANGE:
494 /* <RANGE> <lower> <upper> */
495 if (set[0] <= ch && ch <= set[1])
496 return ok;
497 set += 2;
498 break;
499
500 case SRE_OP_NEGATE:
501 ok = !ok;
502 break;
503
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000504 case SRE_OP_BIGCHARSET:
505 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
506 {
507 int count, block;
508 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000509
510 if (sizeof(SRE_CODE) == 2) {
511 block = ((unsigned char*)set)[ch >> 8];
512 set += 128;
513 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
514 return ok;
515 set += count*16;
516 }
517 else {
518 if (ch < 65536)
519 block = ((unsigned char*)set)[ch >> 8];
520 else
521 block = -1;
522 set += 64;
523 if (block >=0 &&
524 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
525 return ok;
526 set += count*8;
527 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000528 break;
529 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000530
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000531 default:
532 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000533 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000534 return 0;
535 }
536 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000537}
538
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000539LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level);
540
541LOCAL(int)
542SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
543{
544 SRE_CODE chr;
545 SRE_CHAR* ptr = state->ptr;
546 SRE_CHAR* end = state->end;
547 int i;
548
549 /* adjust end */
550 if (maxcount < end - ptr && maxcount != 65535)
551 end = ptr + maxcount;
552
553 switch (pattern[0]) {
554
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000555 case SRE_OP_IN:
556 /* repeated set */
557 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
558 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
559 ptr++;
560 break;
561
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000562 case SRE_OP_ANY:
563 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000564 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000565 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
566 ptr++;
567 break;
568
569 case SRE_OP_ANY_ALL:
570 /* repeated dot wildcare. skip to the end of the target
571 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000572 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000573 ptr = end;
574 break;
575
576 case SRE_OP_LITERAL:
577 /* repeated literal */
578 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000579 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000580 while (ptr < end && (SRE_CODE) *ptr == chr)
581 ptr++;
582 break;
583
584 case SRE_OP_LITERAL_IGNORE:
585 /* repeated literal */
586 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000587 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000588 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
589 ptr++;
590 break;
591
592 case SRE_OP_NOT_LITERAL:
593 /* repeated non-literal */
594 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000595 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000596 while (ptr < end && (SRE_CODE) *ptr != chr)
597 ptr++;
598 break;
599
600 case SRE_OP_NOT_LITERAL_IGNORE:
601 /* repeated non-literal */
602 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000603 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000604 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
605 ptr++;
606 break;
607
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000608 default:
609 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000610 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000611 while ((SRE_CHAR*) state->ptr < end) {
612 i = SRE_MATCH(state, pattern, level);
613 if (i < 0)
614 return i;
615 if (!i)
616 break;
617 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000618 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
619 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000620 return (SRE_CHAR*) state->ptr - ptr;
621 }
622
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000623 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000624 return ptr - (SRE_CHAR*) state->ptr;
625}
626
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    SRE_CHAR* text = state->ptr;
    SRE_CHAR* limit = state->end;
    int k;

    /* check minimal length */
    if (pattern[3] && (limit - text) < pattern[3])
        return 0;

    /* nothing more to verify unless there is a known prefix longer
       than one character */
    if (!(pattern[2] & SRE_INFO_PREFIX) || pattern[5] <= 1)
        return pattern[0];

    /* check known prefix */
    /* <length> <skip> <prefix data> <overlap data> */
    for (k = 0; k < pattern[5]; k++) {
        if ((SRE_CODE) text[k] != pattern[7 + k])
            return 0;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000654
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000655/* The macros below should be used to protect recursive SRE_MATCH()
656 * calls that *failed* and do *not* return immediately (IOW, those
657 * that will backtrack). Explaining:
658 *
659 * - Recursive SRE_MATCH() returned true: that's usually a success
660 * (besides atypical cases like ASSERT_NOT), therefore there's no
661 * reason to restore lastmark;
662 *
663 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
664 * is returning to the caller: If the current SRE_MATCH() is the
665 * top function of the recursion, returning false will be a matching
666 * failure, and it doesn't matter where lastmark is pointing to.
667 * If it's *not* the top function, it will be a recursive SRE_MATCH()
668 * failure by itself, and the calling SRE_MATCH() will have to deal
669 * with the failure by the same rules explained here (it will restore
670 * lastmark by itself if necessary);
671 *
672 * - Recursive SRE_MATCH() returned false, and will continue the
673 * outside 'for' loop: must be protected when breaking, since the next
674 * OP could potentially depend on lastmark;
675 *
676 * - Recursive SRE_MATCH() returned false, and will be called again
677 * inside a local for/while loop: must be protected between each
678 * loop iteration, since the recursive SRE_MATCH() could do anything,
679 * and could potentially depend on lastmark.
680 *
681 * For more information, check the discussion at SF patch #712900.
682 */
/* Record state->lastindex and state->lastmark in the current match
   context so they can be rolled back after a failed sub-match.
   Expects `state` and `ctx` to be in scope. */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastindex = state->lastindex; \
        ctx->lastmark = state->lastmark; \
    } while (0)

/* Roll state->lastindex and state->lastmark back to the values stored
   in the current match context by LASTMARK_SAVE(). */
#define LASTMARK_RESTORE() \
    do { \
        state->lastindex = ctx->lastindex; \
        state->lastmark = ctx->lastmark; \
    } while (0)
693
/* Exit helpers for the matcher.
 *
 * RETURN_ERROR(i) returns i from the enclosing function immediately.
 * RETURN_FAILURE / RETURN_SUCCESS store 0 / 1 in `ret` and jump to the
 * `exit` label; both names must be provided by the enclosing function.
 *
 * The RETURN_ON_* forms act on a result code i: negative means error,
 * zero means failure, positive means success.  i is expanded more than
 * once, so pass an expression without side effects.
 */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

/* The argument is parenthesized at every use so that an expression
   argument (for instance `a | b`) is compared as a whole. */
#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
704
/* Turn a macro argument into a string literal (used for TRACE output). */
#define SFY(x) #x

/* Reserve sizeof(type) units on top of state's data stack and point ptr
 * at them.
 *
 * Uses these names from the enclosing function: alloc_pos (receives the
 * stack offset of the new object), ctx and ctx_pos.  When the stack has
 * to grow, ctx is looked up again from ctx_pos unless ctx_pos is -1
 * (presumably because growing may move the stack storage).  If
 * data_stack_grow() yields a negative value, the enclosing function
 * returns that value.
 */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    /* sizeof yields a size_t; cast so the argument matches %d */ \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), alloc_pos, (int) sizeof(type))); \
    if ((state)->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    (ptr) = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)
721
/* Point ptr at the object of the given type that lives at offset pos of
   state's data stack.  Nothing is allocated or copied.  Every argument
   is parenthesized so that expression arguments expand correctly. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), (pos))); \
    (ptr) = (type*)((state)->data_stack+(pos)); \
} while (0)
727
/* Copy size units starting at data onto the top of state's data stack.
 *
 * Uses ctx and ctx_pos from the enclosing function: when the stack has
 * to grow, ctx is looked up again from ctx_pos unless ctx_pos is -1.  If
 * data_stack_grow() yields a negative value, the enclosing function
 * returns that value.
 *
 * data and size are expanded again after that point.  This matters for
 * callers that pass an expression built on ctx (e.g.
 * &ctx->u.rep->last_ptr): the copy then reads through the refreshed ctx.
 */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    /* size is typically a sizeof expression (size_t); cast for %d, */ \
    /* and %p expects a void pointer */ \
    TRACE(("copy data in %p to %d (%d)\n", \
           (void*) (data), (int) (state)->data_stack_base, (int) (size))); \
    if ((state)->data_stack_size < (state)->data_stack_base+(size)) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)
741
/* Copy the topmost size units of state's data stack into data.  The
   units are removed from the stack only when discard is non-zero;
   otherwise the stack is left untouched.  size is parenthesized at every
   use so that a compound expression such as n + 1 selects the right
   offset. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    /* size is typically a sizeof expression (size_t); cast for %d, */ \
    /* and %p expects a void pointer */ \
    TRACE(("copy data to %p from %d (%d)\n", \
           (void*) (data), (int) ((state)->data_stack_base-(size)), \
           (int) (size))); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), \
           (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)
750
/* Drop the topmost size units of state's data stack without copying
   them anywhere. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    /* size is typically a sizeof expression (size_t); cast for %d */ \
    TRACE(("discard data from %d (%d)\n", \
           (int) ((state)->data_stack_base-(size)), (int) (size))); \
    (state)->data_stack_base -= (size); \
} while(0)
757
/* Shorthands that operate on the data stack of the `state` variable in
   scope.  For the PUSH/POP forms obj is a pointer, and the amount moved
   is sizeof(*obj); DATA_POP always removes what it copies out. */
#define DATA_PUSH(obj)          DATA_STACK_PUSH(state, (obj), sizeof(*(obj)))
#define DATA_POP(obj)           DATA_STACK_POP(state, (obj), sizeof(*(obj)), 1)
#define DATA_POP_DISCARD(obj)   DATA_STACK_POP_DISCARD(state, sizeof(*(obj)))
#define DATA_ALLOC(type, dest)  DATA_STACK_ALLOC(state, type, dest)
#define DATA_LOOKUP_AT(type, dest, offset) \
    DATA_STACK_LOOKUP_AT(state, type, dest, offset)
768
/* Save and restore the state->mark array on the data stack.
 *
 * lastmark+1 entries (indices 0..lastmark) are transferred; all four
 * macros do nothing unless lastmark > 0.  They use `state` from the
 * enclosing scope, and MARK_PUSH additionally uses the enclosing int
 * `i` as scratch.  The argument is parenthesized at every use so that
 * an expression argument is tested and used as a whole.
 */

/* push a copy of the marks */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
/* copy the saved marks back into state->mark and remove them */
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
/* copy the saved marks back into state->mark but keep them stacked */
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
/* remove the saved marks without touching state->mark */
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
786
/* Values stored in SRE_MATCH_CONTEXT.jump.  Each one names the DO_JUMP
   site that created a nested context; JUMP_NONE marks the context set
   up on entry to SRE_MATCH. */
#define JUMP_NONE             0

/* SRE_OP_MAX_UNTIL */
#define JUMP_MAX_UNTIL_1      1
#define JUMP_MAX_UNTIL_2      2
#define JUMP_MAX_UNTIL_3      3

/* SRE_OP_MIN_UNTIL */
#define JUMP_MIN_UNTIL_1      4
#define JUMP_MIN_UNTIL_2      5
#define JUMP_MIN_UNTIL_3      6

/* SRE_OP_REPEAT */
#define JUMP_REPEAT           7

/* SRE_OP_REPEAT_ONE: tail starts with a literal / general case */
#define JUMP_REPEAT_ONE_1     8
#define JUMP_REPEAT_ONE_2     9

/* SRE_OP_MIN_REPEAT_ONE */
#define JUMP_MIN_REPEAT_ONE   10

/* SRE_OP_BRANCH, SRE_OP_ASSERT, SRE_OP_ASSERT_NOT */
#define JUMP_BRANCH           11
#define JUMP_ASSERT           12
#define JUMP_ASSERT_NOT       13
801
/* Start matching nextpattern in a fresh context instead of making a
 * recursive C call.
 *
 * A new SRE_MATCH_CONTEXT is allocated on the data stack and filled with
 * the pattern to run, the jump code and the position of the current
 * context; it then becomes the current context (ctx / ctx_pos) and
 * control goes to the `entrance` label.  jumplabel is defined right
 * after that goto -- presumably the code at the function's exit path
 * (not shown here) uses the stored jump code to resume there.
 *
 * This expands to several statements and a label, so it must be used as
 * a full statement inside braces and be followed by a semicolon.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->pattern = nextpattern; \
    nextctx->jump = jumpvalue; \
    nextctx->last_ctx_pos = ctx_pos; \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    (void) 0 /* a label has to be followed by a statement */
812
813typedef struct {
814 int last_ctx_pos;
815 int jump;
816 SRE_CHAR* ptr;
817 SRE_CODE* pattern;
818 int count;
819 int lastmark;
820 int lastindex;
821 union {
822 SRE_CODE chr;
823 SRE_REPEAT* rep;
824 } u;
825} SRE_MATCH_CONTEXT;
826
827/* check if string matches the given pattern. returns <0 for
828 error, 0 for failure, and 1 for success */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000829LOCAL(int)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000830SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
Guido van Rossumb700df92000-03-31 14:59:30 +0000831{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000832 SRE_CHAR* end = state->end;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000833 int alloc_pos, ctx_pos = -1;
834 int i, ret = 0;
835 int jump;
Guido van Rossumb700df92000-03-31 14:59:30 +0000836
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000837 SRE_MATCH_CONTEXT* ctx;
838 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000839
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000840 TRACE(("|%p|%p|ENTER %d\n", pattern, state->ptr, level));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000841
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000842#if defined(USE_STACKCHECK)
Fredrik Lundh58100642000-08-09 09:14:35 +0000843 if (level % 10 == 0 && PyOS_CheckStack())
Fredrik Lundh18c2aa22000-08-07 17:33:38 +0000844 return SRE_ERROR_RECURSION_LIMIT;
845#endif
846
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000847#if defined(USE_RECURSION_LIMIT)
848 if (level > USE_RECURSION_LIMIT)
849 return SRE_ERROR_RECURSION_LIMIT;
850#endif
851
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000852 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
853 ctx->last_ctx_pos = -1;
854 ctx->jump = JUMP_NONE;
855 ctx->pattern = pattern;
856 ctx_pos = alloc_pos;
857
858entrance:
859
860 ctx->ptr = state->ptr;
861
862 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000863 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000864 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000865 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000866 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000867 (end - ctx->ptr), ctx->pattern[3]));
868 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000869 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000870 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000871 }
872
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000873 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000874
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000875 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000876
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000877 case SRE_OP_MARK:
878 /* set mark */
879 /* <MARK> <gid> */
880 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
881 ctx->ptr, ctx->pattern[0]));
882 i = ctx->pattern[0];
883 if (i & 1)
884 state->lastindex = i/2 + 1;
885 if (i > state->lastmark) {
886 /* state->lastmark is the highest valid index in the
887 state->mark array. If it is increased by more than 1,
888 the intervening marks must be set to NULL to signal
889 that these marks have not been encountered. */
890 int j = state->lastmark + 1;
891 while (j < i)
892 state->mark[j++] = NULL;
893 state->lastmark = i;
894 }
895 state->mark[i] = ctx->ptr;
896 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000897 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000898
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000899 case SRE_OP_LITERAL:
900 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000901 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000902 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
903 ctx->ptr, *ctx->pattern));
904 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
905 RETURN_FAILURE;
906 ctx->pattern++;
907 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000908 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000909
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000910 case SRE_OP_NOT_LITERAL:
911 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000912 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000913 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
914 ctx->ptr, *ctx->pattern));
915 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
916 RETURN_FAILURE;
917 ctx->pattern++;
918 ctx->ptr++;
919 break;
920
921 case SRE_OP_SUCCESS:
922 /* end of pattern */
923 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
924 state->ptr = ctx->ptr;
925 RETURN_SUCCESS;
926
927 case SRE_OP_AT:
928 /* match at given position */
929 /* <AT> <code> */
930 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
931 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
932 RETURN_FAILURE;
933 ctx->pattern++;
934 break;
935
936 case SRE_OP_CATEGORY:
937 /* match at given category */
938 /* <CATEGORY> <code> */
939 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
940 ctx->ptr, *ctx->pattern));
941 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
942 RETURN_FAILURE;
943 ctx->pattern++;
944 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000945 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000946
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000948 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000949 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000950 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
951 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
952 RETURN_FAILURE;
953 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000954 break;
955
956 case SRE_OP_ANY_ALL:
957 /* match anything */
958 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000959 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
960 if (ctx->ptr >= end)
961 RETURN_FAILURE;
962 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000963 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000964
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000965 case SRE_OP_IN:
966 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000967 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000968 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
969 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
970 RETURN_FAILURE;
971 ctx->pattern += ctx->pattern[0];
972 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000973 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000974
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000975 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000976 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
977 ctx->pattern, ctx->ptr, ctx->pattern[0]));
978 if (ctx->ptr >= end ||
979 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
980 RETURN_FAILURE;
981 ctx->pattern++;
982 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000983 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000984
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000985 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000986 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
987 ctx->pattern, ctx->ptr, *ctx->pattern));
988 if (ctx->ptr >= end ||
989 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
990 RETURN_FAILURE;
991 ctx->pattern++;
992 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000993 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000994
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000995 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000996 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
997 if (ctx->ptr >= end
998 || !SRE_CHARSET(ctx->pattern+1,
999 (SRE_CODE)state->lower(*ctx->ptr)))
1000 RETURN_FAILURE;
1001 ctx->pattern += ctx->pattern[0];
1002 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001003 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001004
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001005 case SRE_OP_JUMP:
1006 case SRE_OP_INFO:
1007 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001008 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001009 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
1010 ctx->ptr, ctx->pattern[0]));
1011 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001012 break;
1013
1014 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001015 /* alternation */
1016 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001017 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001018 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 ctx->u.rep = state->repeat;
1020 if (ctx->u.rep)
1021 MARK_PUSH(ctx->lastmark);
1022 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
1023 if (ctx->pattern[1] == SRE_OP_LITERAL &&
1024 (ctx->ptr >= end ||
1025 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001026 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 if (ctx->pattern[1] == SRE_OP_IN &&
1028 (ctx->ptr >= end ||
1029 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001030 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001031 state->ptr = ctx->ptr;
1032#ifdef USE_RECURSION
1033 ret = SRE_MATCH(state, ctx->pattern+1, level+1);
1034#else
1035 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1036#endif
1037 if (ret) {
1038 if (ctx->u.rep)
1039 MARK_POP_DISCARD(ctx->lastmark);
1040 RETURN_ON_ERROR(ret);
1041 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001042 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 if (ctx->u.rep)
1044 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001045 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001046 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001047 if (ctx->u.rep)
1048 MARK_POP_DISCARD(ctx->lastmark);
1049 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001050
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001051 case SRE_OP_REPEAT_ONE:
1052 /* match repeated sequence (maximizing regexp) */
1053
1054 /* this operator only works if the repeated item is
1055 exactly one character wide, and we're not already
1056 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001057 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001058
1059 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1060
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1062 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001063
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 if (ctx->ptr + ctx->pattern[1] > end)
1065 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001066
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001067 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001068
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001069 ctx->count = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2],
1070 level+1);
1071 RETURN_ON_ERROR(ctx->count);
Fredrik Lundhe1869832000-08-01 22:47:49 +00001072
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001074
1075 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001076 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001077 string. check if the rest of the pattern matches,
1078 and backtrack if not. */
1079
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001080 if (ctx->count < (int) ctx->pattern[1])
1081 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001082
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001083 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001084 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001085 state->ptr = ctx->ptr;
1086 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001087 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001088
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001089 LASTMARK_SAVE();
1090
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001091 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001092 /* tail starts with a literal. skip positions where
1093 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001094 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001095 for (;;) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001096 while (ctx->count >= (int) ctx->pattern[1] &&
1097 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1098 ctx->ptr--;
1099 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001100 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001101 if (ctx->count < (int) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001102 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001103 state->ptr = ctx->ptr;
1104#ifdef USE_RECURSION
1105 ret = SRE_MATCH(state, ctx->pattern+ctx->pattern[0],
1106 level+1);
1107#else
1108 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1109 ctx->pattern+ctx->pattern[0]);
1110#endif
1111 if (ret) {
1112 RETURN_ON_ERROR(ret);
1113 RETURN_SUCCESS;
1114 }
1115
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001116 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001117
1118 ctx->ptr--;
1119 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001120 }
1121
1122 } else {
1123 /* general case */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 while (ctx->count >= (int) ctx->pattern[1]) {
1125 state->ptr = ctx->ptr;
1126#ifdef USE_RECURSION
1127 ret = SRE_MATCH(state, ctx->pattern+ctx->pattern[0],
1128 level+1);
1129#else
1130 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1131 ctx->pattern+ctx->pattern[0]);
1132#endif
1133 if (ret) {
1134 RETURN_ON_ERROR(ret);
1135 RETURN_SUCCESS;
1136 }
1137 ctx->ptr--;
1138 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001139 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001140 }
1141 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001142 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001143
Guido van Rossum41c99e72003-04-14 17:59:34 +00001144 case SRE_OP_MIN_REPEAT_ONE:
1145 /* match repeated sequence (minimizing regexp) */
1146
1147 /* this operator only works if the repeated item is
1148 exactly one character wide, and we're not already
1149 collecting backtracking points. for other cases,
1150 use the MIN_REPEAT operator */
1151
1152 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1153
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001154 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1155 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001156
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001157 if (ctx->ptr + ctx->pattern[1] > end)
1158 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001159
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001160 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001161
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001162 if (ctx->pattern[1] == 0)
1163 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001164 else {
1165 /* count using pattern min as the maximum */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001166 ctx->count = SRE_COUNT(state, ctx->pattern+3,
1167 ctx->pattern[1], level+1);
1168 RETURN_ON_ERROR(ctx->count);
1169 if (ctx->count < (int) ctx->pattern[1])
1170 /* didn't match minimum number of times */
1171 RETURN_FAILURE;
1172 /* advance past minimum matches of repeat */
1173 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001174 }
1175
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001176 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001177 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 state->ptr = ctx->ptr;
1179 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001180
1181 } else {
1182 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001183 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001184 while ((int)ctx->pattern[2] == 65535
1185 || ctx->count <= (int)ctx->pattern[2]) {
1186 state->ptr = ctx->ptr;
1187#ifdef USE_RECURSION
1188 ret = SRE_MATCH(state, ctx->pattern+ctx->pattern[0],
1189 level+1);
1190#else
1191 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1192 ctx->pattern+ctx->pattern[0]);
1193#endif
1194 if (ret) {
1195 RETURN_ON_ERROR(ret);
1196 RETURN_SUCCESS;
1197 }
1198 state->ptr = ctx->ptr;
1199 ret = SRE_COUNT(state, ctx->pattern+3, 1, level+1);
1200 RETURN_ON_ERROR(ret);
1201 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001202 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001203 assert(ret == 1);
1204 ctx->ptr++;
1205 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001206 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001207 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001208 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001210
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001211 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001212 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001213 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001214 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001215 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1216 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001217
1218 /* install new repeat context */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001219 ctx->u.rep = (SRE_REPEAT*) malloc(sizeof(*ctx->u.rep));
1220 ctx->u.rep->count = -1;
1221 ctx->u.rep->pattern = ctx->pattern;
1222 ctx->u.rep->prev = state->repeat;
1223 ctx->u.rep->last_ptr = NULL;
1224 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001225
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001226 state->ptr = ctx->ptr;
1227#ifdef USE_RECURSION
1228 ret = SRE_MATCH(state, ctx->pattern+ctx->pattern[0], level+1);
1229#else
1230 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1231#endif
1232 state->repeat = ctx->u.rep->prev;
1233 free(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001234
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001235 if (ret) {
1236 RETURN_ON_ERROR(ret);
1237 RETURN_SUCCESS;
1238 }
1239 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001240
1241 case SRE_OP_MAX_UNTIL:
1242 /* maximizing repeat */
1243 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1244
1245 /* FIXME: we probably need to deal with zero-width
1246 matches in here... */
1247
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 ctx->u.rep = state->repeat;
1249 if (!ctx->u.rep)
1250 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001251
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001252 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001253
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001254 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001255
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001256 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1257 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001258
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001259 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001260 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001261 ctx->u.rep->count = ctx->count;
1262#ifdef USE_RECURSION
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001263 /* RECURSIVE */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001264 ret = SRE_MATCH(state, ctx->u.rep->pattern+3, level+1);
1265#else
1266 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1267 ctx->u.rep->pattern+3);
1268#endif
1269 if (ret) {
1270 RETURN_ON_ERROR(ret);
1271 RETURN_SUCCESS;
1272 }
1273 ctx->u.rep->count = ctx->count-1;
1274 state->ptr = ctx->ptr;
1275 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001276 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001277
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001278 if ((ctx->count < ctx->u.rep->pattern[2] ||
1279 ctx->u.rep->pattern[2] == 65535) &&
1280 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001281 /* we may have enough matches, but if we can
1282 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001283 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001284 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001285 MARK_PUSH(ctx->lastmark);
1286 /* zero-width match protection */
1287 DATA_PUSH(&ctx->u.rep->last_ptr);
1288 ctx->u.rep->last_ptr = state->ptr;
1289#ifdef USE_RECURSION
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001290 /* RECURSIVE */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001291 ret = SRE_MATCH(state, ctx->u.rep->pattern+3, level+1);
1292#else
1293 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1294 ctx->u.rep->pattern+3);
1295#endif
1296 DATA_POP(&ctx->u.rep->last_ptr);
1297 if (ret) {
1298 MARK_POP_DISCARD(ctx->lastmark);
1299 RETURN_ON_ERROR(ret);
1300 RETURN_SUCCESS;
1301 }
1302 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001303 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001304 ctx->u.rep->count = ctx->count-1;
1305 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001306 }
1307
1308 /* cannot match more repeated items here. make sure the
1309 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001310 state->repeat = ctx->u.rep->prev;
1311#ifdef USE_RECURSION
1312 ret = SRE_MATCH(state, ctx->pattern, level+1);
1313#else
1314 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1315#endif
1316 RETURN_ON_SUCCESS(ret);
1317 state->repeat = ctx->u.rep;
1318 state->ptr = ctx->ptr;
1319 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001320
1321 case SRE_OP_MIN_UNTIL:
1322 /* minimizing repeat */
1323 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1324
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001325 ctx->u.rep = state->repeat;
1326 if (!ctx->u.rep)
1327 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001328
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001329 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001330
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001331 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001332
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001333 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1334 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001335
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001336 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001337 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001338 ctx->u.rep->count = ctx->count;
1339#ifdef USE_RECURSION
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001340 /* RECURSIVE */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001341 ret = SRE_MATCH(state, ctx->u.rep->pattern+3, level+1);
1342#else
1343 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1344 ctx->u.rep->pattern+3);
1345#endif
1346 if (ret) {
1347 RETURN_ON_ERROR(ret);
1348 RETURN_SUCCESS;
1349 }
1350 ctx->u.rep->count = ctx->count-1;
1351 state->ptr = ctx->ptr;
1352 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001353 }
1354
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001355 LASTMARK_SAVE();
1356
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001357 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001358 state->repeat = ctx->u.rep->prev;
1359#ifdef USE_RECURSION
1360 ret = SRE_MATCH(state, ctx->pattern, level+1);
1361#else
1362 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1363#endif
1364 if (ret) {
1365 RETURN_ON_ERROR(ret);
1366 RETURN_SUCCESS;
1367 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001368
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001369 state->repeat = ctx->u.rep;
1370 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001371
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001372 LASTMARK_RESTORE();
1373
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001374 if (ctx->count >= ctx->u.rep->pattern[2]
1375 && ctx->u.rep->pattern[2] != 65535)
1376 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001377
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001378 ctx->u.rep->count = ctx->count;
1379#ifdef USE_RECURSION
1380 /* RECURSIVE */
1381 ret = SRE_MATCH(state, ctx->u.rep->pattern+3, level+1);
1382#else
1383 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1384 ctx->u.rep->pattern+3);
1385#endif
1386 if (ret) {
1387 RETURN_ON_ERROR(ret);
1388 RETURN_SUCCESS;
1389 }
1390 ctx->u.rep->count = ctx->count-1;
1391 state->ptr = ctx->ptr;
1392 RETURN_FAILURE;
1393
1394 case SRE_OP_GROUPREF:
1395 /* match backreference */
1396 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1397 ctx->ptr, ctx->pattern[0]));
1398 i = ctx->pattern[0];
1399 {
1400 int groupref = i+i;
1401 if (groupref >= state->lastmark) {
1402 RETURN_FAILURE;
1403 } else {
1404 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1405 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1406 if (!p || !e || e < p)
1407 RETURN_FAILURE;
1408 while (p < e) {
1409 if (ctx->ptr >= end || *ctx->ptr != *p)
1410 RETURN_FAILURE;
1411 p++; ctx->ptr++;
1412 }
1413 }
1414 }
1415 ctx->pattern++;
1416 break;
1417
1418 case SRE_OP_GROUPREF_IGNORE:
1419 /* match backreference */
1420 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1421 ctx->ptr, ctx->pattern[0]));
1422 i = ctx->pattern[0];
1423 {
1424 int groupref = i+i;
1425 if (groupref >= state->lastmark) {
1426 RETURN_FAILURE;
1427 } else {
1428 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1429 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1430 if (!p || !e || e < p)
1431 RETURN_FAILURE;
1432 while (p < e) {
1433 if (ctx->ptr >= end ||
1434 state->lower(*ctx->ptr) != state->lower(*p))
1435 RETURN_FAILURE;
1436 p++; ctx->ptr++;
1437 }
1438 }
1439 }
1440 ctx->pattern++;
1441 break;
1442
1443 case SRE_OP_GROUPREF_EXISTS:
1444 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1445 ctx->ptr, ctx->pattern[0]));
1446 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1447 i = ctx->pattern[0];
1448 {
1449 int groupref = i+i;
1450 if (groupref >= state->lastmark) {
1451 ctx->pattern += ctx->pattern[1];
1452 break;
1453 } else {
1454 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1455 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1456 if (!p || !e || e < p) {
1457 ctx->pattern += ctx->pattern[1];
1458 break;
1459 }
1460 }
1461 }
1462 ctx->pattern += 2;
1463 break;
1464
1465 case SRE_OP_ASSERT:
1466 /* assert subpattern */
1467 /* <ASSERT> <skip> <back> <pattern> */
1468 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1469 ctx->ptr, ctx->pattern[1]));
1470 state->ptr = ctx->ptr - ctx->pattern[1];
1471 if (state->ptr < state->beginning)
1472 RETURN_FAILURE;
1473#ifdef USE_RECURSION
1474 ret = SRE_MATCH(state, ctx->pattern+2, level+1);
1475#else
1476 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1477#endif
1478 RETURN_ON_FAILURE(ret);
1479 ctx->pattern += ctx->pattern[0];
1480 break;
1481
1482 case SRE_OP_ASSERT_NOT:
1483 /* assert not subpattern */
1484 /* <ASSERT_NOT> <skip> <back> <pattern> */
1485 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1486 ctx->ptr, ctx->pattern[1]));
1487 state->ptr = ctx->ptr - ctx->pattern[1];
1488 if (state->ptr >= state->beginning) {
1489#ifdef USE_RECURSION
1490 ret = SRE_MATCH(state, ctx->pattern+2, level+1);
1491#else
1492 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1493#endif
1494 if (ret) {
1495 RETURN_ON_ERROR(ret);
1496 RETURN_FAILURE;
1497 }
1498 }
1499 ctx->pattern += ctx->pattern[0];
1500 break;
1501
1502 case SRE_OP_FAILURE:
1503 /* immediate failure */
1504 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1505 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001506
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001507 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001508 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1509 ctx->pattern[-1]));
1510 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001511 }
1512 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001513
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001514exit:
1515 ctx_pos = ctx->last_ctx_pos;
1516 jump = ctx->jump;
1517 DATA_POP_DISCARD(ctx);
1518 if (ctx_pos == -1)
1519 return ret;
1520 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1521
1522#ifndef USE_RECURSION
1523 switch (jump) {
1524 case JUMP_MAX_UNTIL_2:
1525 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1526 goto jump_max_until_2;
1527 case JUMP_MAX_UNTIL_3:
1528 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1529 goto jump_max_until_3;
1530 case JUMP_MIN_UNTIL_2:
1531 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1532 goto jump_min_until_2;
1533 case JUMP_MIN_UNTIL_3:
1534 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1535 goto jump_min_until_3;
1536 case JUMP_BRANCH:
1537 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1538 goto jump_branch;
1539 case JUMP_MAX_UNTIL_1:
1540 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1541 goto jump_max_until_1;
1542 case JUMP_MIN_UNTIL_1:
1543 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1544 goto jump_min_until_1;
1545 case JUMP_REPEAT:
1546 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1547 goto jump_repeat;
1548 case JUMP_REPEAT_ONE_1:
1549 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1550 goto jump_repeat_one_1;
1551 case JUMP_REPEAT_ONE_2:
1552 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1553 goto jump_repeat_one_2;
1554 case JUMP_MIN_REPEAT_ONE:
1555 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1556 goto jump_min_repeat_one;
1557 case JUMP_ASSERT:
1558 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1559 goto jump_assert;
1560 case JUMP_ASSERT_NOT:
1561 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1562 goto jump_assert_not;
1563 case JUMP_NONE:
1564 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1565 break;
1566 }
1567#endif
1568
1569 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001570}
1571
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001572LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001573SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1574{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001575 SRE_CHAR* ptr = state->start;
1576 SRE_CHAR* end = state->end;
1577 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001578 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001579 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001580 SRE_CODE* prefix = NULL;
1581 SRE_CODE* charset = NULL;
1582 SRE_CODE* overlap = NULL;
1583 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001584
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001585 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001586 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001587 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001588
1589 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001590
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001591 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001592 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001593 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001594 end -= pattern[3]-1;
1595 if (end <= ptr)
1596 end = ptr+1;
1597 }
1598
Fredrik Lundh3562f112000-07-02 12:00:07 +00001599 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001600 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001601 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001602 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001603 prefix_skip = pattern[6];
1604 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001605 overlap = prefix + prefix_len - 1;
1606 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001607 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001608 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001609 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001610
1611 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001612 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001613
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001614 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1615 TRACE(("charset = %p\n", charset));
1616
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001617#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001618 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001619 /* pattern starts with a known prefix. use the overlap
1620 table to skip forward as fast as we possibly can */
1621 int i = 0;
1622 end = state->end;
1623 while (ptr < end) {
1624 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001625 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001626 if (!i)
1627 break;
1628 else
1629 i = overlap[i];
1630 } else {
1631 if (++i == prefix_len) {
1632 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001633 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1634 state->start = ptr + 1 - prefix_len;
1635 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001636 if (flags & SRE_INFO_LITERAL)
1637 return 1; /* we got all of it */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001638 status = SRE_MATCH(state, pattern + 2*prefix_skip, 1);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001639 if (status != 0)
1640 return status;
1641 /* close but no cigar -- try again */
1642 i = overlap[i];
1643 }
1644 break;
1645 }
1646
1647 }
1648 ptr++;
1649 }
1650 return 0;
1651 }
1652#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001653
Fredrik Lundh3562f112000-07-02 12:00:07 +00001654 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001655 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001656 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001657 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001658 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001659 for (;;) {
1660 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1661 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001662 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001663 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001664 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001665 state->start = ptr;
1666 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001667 if (flags & SRE_INFO_LITERAL)
1668 return 1; /* we got all of it */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001669 status = SRE_MATCH(state, pattern + 2, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001670 if (status != 0)
1671 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001672 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001673 } else if (charset) {
1674 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001675 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001677 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001678 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001679 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001680 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001681 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001682 state->start = ptr;
1683 state->ptr = ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001684 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001685 if (status != 0)
1686 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001687 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001688 }
1689 } else
1690 /* general case */
1691 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001692 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001693 state->start = state->ptr = ptr++;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001694 status = SRE_MATCH(state, pattern, 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001695 if (status != 0)
1696 break;
1697 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001698
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001699 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001700}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001701
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001702LOCAL(int)
1703SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1704{
1705 /* check if given string is a literal template (i.e. no escapes) */
1706 while (len-- > 0)
1707 if (*ptr++ == '\\')
1708 return 0;
1709 return 1;
1710}
Guido van Rossumb700df92000-03-31 14:59:30 +00001711
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001712#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001713
1714/* -------------------------------------------------------------------- */
1715/* factories and destructors */
1716
1717/* see sre.h for object declarations */
1718
Jeremy Hylton938ace62002-07-17 16:30:39 +00001719static PyTypeObject Pattern_Type;
1720static PyTypeObject Match_Type;
1721static PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001722
1723static PyObject *
1724_compile(PyObject* self_, PyObject* args)
1725{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001726 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001727
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001728 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001729 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001730
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001731 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001732 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001733 PyObject* code;
1734 int groups = 0;
1735 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001736 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001737 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1738 &PyList_Type, &code, &groups,
1739 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001740 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001741
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001742 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001743
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001744 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001745 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001746 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001747
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001748 self->codesize = n;
1749
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001750 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001751 PyObject *o = PyList_GET_ITEM(code, i);
Martin v. Löwis78e2f062003-04-19 12:56:08 +00001752 if (PyInt_Check(o))
1753 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1754 else
1755 self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001756 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001757
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001758 if (PyErr_Occurred()) {
1759 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001760 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001761 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001762
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001763 Py_INCREF(pattern);
1764 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001765
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001766 self->flags = flags;
1767
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001768 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001769
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001770 Py_XINCREF(groupindex);
1771 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001772
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001773 Py_XINCREF(indexgroup);
1774 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001775
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001776 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001777}
1778
1779static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001780sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001781{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001782 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001783}
1784
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001785static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001786sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001787{
1788 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001789 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001790 return NULL;
1791 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001792 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001793 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001794#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001795 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001796#else
1797 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001798#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001799 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001800}
1801
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001802LOCAL(void)
1803state_reset(SRE_STATE* state)
1804{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001805 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001806 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001807
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001808 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001809 state->lastindex = -1;
1810
1811 state->repeat = NULL;
1812
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001813 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001814}
1815
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001816static void*
1817getstring(PyObject* string, int* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001818{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001819 /* given a python object, return a data pointer, a length (in
1820 characters), and a character size. return NULL if the object
1821 is not a string (or not compatible) */
1822
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001823 PyBufferProcs *buffer;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001824 int size, bytes, charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001825 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001826
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001827#if defined(HAVE_UNICODE)
1828 if (PyUnicode_Check(string)) {
1829 /* unicode strings doesn't always support the buffer interface */
1830 ptr = (void*) PyUnicode_AS_DATA(string);
1831 bytes = PyUnicode_GET_DATA_SIZE(string);
1832 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001833 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001834
1835 } else {
1836#endif
1837
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001838 /* get pointer to string buffer */
1839 buffer = string->ob_type->tp_as_buffer;
1840 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1841 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001842 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001843 return NULL;
1844 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001845
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001846 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001847 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1848 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001849 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1850 return NULL;
1851 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001852
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001853 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001854#if PY_VERSION_HEX >= 0x01060000
1855 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001856#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001857 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001858#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001859
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001860 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001861 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001862#if defined(HAVE_UNICODE)
1863 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001864 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001865#endif
1866 else {
1867 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1868 return NULL;
1869 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001870
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001871#if defined(HAVE_UNICODE)
1872 }
1873#endif
1874
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001875 *p_length = size;
1876 *p_charsize = charsize;
1877
1878 return ptr;
1879}
1880
1881LOCAL(PyObject*)
1882state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1883 int start, int end)
1884{
1885 /* prepare state object */
1886
1887 int length;
1888 int charsize;
1889 void* ptr;
1890
1891 memset(state, 0, sizeof(SRE_STATE));
1892
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001893 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001894 state->lastindex = -1;
1895
1896 ptr = getstring(string, &length, &charsize);
1897 if (!ptr)
1898 return NULL;
1899
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001900 /* adjust boundaries */
1901 if (start < 0)
1902 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001903 else if (start > length)
1904 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001905
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001906 if (end < 0)
1907 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001908 else if (end > length)
1909 end = length;
1910
1911 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001912
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001913 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001914
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001915 state->start = (void*) ((char*) ptr + start * state->charsize);
1916 state->end = (void*) ((char*) ptr + end * state->charsize);
1917
1918 Py_INCREF(string);
1919 state->string = string;
1920 state->pos = start;
1921 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001922
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001923 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001924 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001925 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001926#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001927 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001928#else
1929 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001930#endif
1931 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001932 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001933
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001934 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001935}
1936
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001937LOCAL(void)
1938state_fini(SRE_STATE* state)
1939{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001940 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001941 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001942}
1943
/* convert a pointer into the subject string (member) into a character
   offset from the start of the string: the byte distance from
   state->beginning divided by the character size */
#define STATE_OFFSET(state, member) \
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1947
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001948LOCAL(PyObject*)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001949state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001950{
Fredrik Lundh58100642000-08-09 09:14:35 +00001951 int i, j;
1952
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001953 index = (index - 1) * 2;
1954
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001955 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001956 if (empty)
1957 /* want empty string */
1958 i = j = 0;
1959 else {
1960 Py_INCREF(Py_None);
1961 return Py_None;
1962 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001963 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001964 i = STATE_OFFSET(state, state->mark[index]);
1965 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001966 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001967
Fredrik Lundh58100642000-08-09 09:14:35 +00001968 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001969}
1970
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001971static void
1972pattern_error(int status)
1973{
1974 switch (status) {
1975 case SRE_ERROR_RECURSION_LIMIT:
1976 PyErr_SetString(
1977 PyExc_RuntimeError,
1978 "maximum recursion limit exceeded"
1979 );
1980 break;
1981 case SRE_ERROR_MEMORY:
1982 PyErr_NoMemory();
1983 break;
1984 default:
1985 /* other error codes indicate compiler/engine bugs */
1986 PyErr_SetString(
1987 PyExc_RuntimeError,
1988 "internal error in regular expression engine"
1989 );
1990 }
1991}
1992
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001993static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001994pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001995{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001996 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001997
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001998 MatchObject* match;
1999 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002000 char* base;
2001 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002002
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002003 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002004
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002005 /* create match object (with room for extra group marks) */
2006 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00002007 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002008 if (!match)
2009 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002010
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002011 Py_INCREF(pattern);
2012 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002013
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002014 Py_INCREF(state->string);
2015 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002016
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002017 match->regs = NULL;
2018 match->groups = pattern->groups+1;
2019
2020 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002021
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002022 base = (char*) state->beginning;
2023 n = state->charsize;
2024
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002025 match->mark[0] = ((char*) state->start - base) / n;
2026 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002027
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002028 for (i = j = 0; i < pattern->groups; i++, j+=2)
2029 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2030 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2031 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2032 } else
2033 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2034
2035 match->pos = state->pos;
2036 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002037
Fredrik Lundh6f013982000-07-03 18:44:21 +00002038 match->lastindex = state->lastindex;
2039
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002040 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00002041
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002042 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002043
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002044 /* no match */
2045 Py_INCREF(Py_None);
2046 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002047
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002048 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002049
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002050 /* internal error */
2051 pattern_error(status);
2052 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002053}
2054
2055static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002056pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002057{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002058 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002059
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 ScannerObject* self;
2061
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002062 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 int start = 0;
2064 int end = INT_MAX;
2065 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
2066 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002069 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00002070 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002071 return NULL;
2072
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002074 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00002075 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002076 return NULL;
2077 }
2078
2079 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002080 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002081
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002082 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002083}
2084
Guido van Rossumb700df92000-03-31 14:59:30 +00002085static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002086pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002087{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002088 Py_XDECREF(self->pattern);
2089 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00002090 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002091 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002092}
2093
2094static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002095pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002096{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002097 SRE_STATE state;
2098 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002099
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002100 PyObject* string;
2101 int start = 0;
2102 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002103 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2104 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
2105 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002106 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002107
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002108 string = state_init(&state, self, string, start, end);
2109 if (!string)
2110 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002111
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002112 state.ptr = state.start;
2113
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002114 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
2115
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002116 if (state.charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00002117 status = sre_match(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002118 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002119#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00002120 status = sre_umatch(&state, PatternObject_GetCode(self), 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002121#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002122 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002123
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002124 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2125
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002126 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002127
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002128 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002129}
2130
2131static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002132pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002133{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002134 SRE_STATE state;
2135 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002136
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002137 PyObject* string;
2138 int start = 0;
2139 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002140 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2141 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
2142 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002143 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002144
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002145 string = state_init(&state, self, string, start, end);
2146 if (!string)
2147 return NULL;
2148
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002149 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
2150
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002151 if (state.charsize == 1) {
2152 status = sre_search(&state, PatternObject_GetCode(self));
2153 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002154#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002155 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002156#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002157 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002158
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002159 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2160
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002161 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002162
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002163 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002164}
2165
2166static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002167call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002168{
2169 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002170 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002171 PyObject* func;
2172 PyObject* result;
2173
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002174 if (!args)
2175 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002176 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002177 if (!name)
2178 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002179 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002180 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002181 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002182 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002183 func = PyObject_GetAttrString(mod, function);
2184 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002185 if (!func)
2186 return NULL;
2187 result = PyObject_CallObject(func, args);
2188 Py_DECREF(func);
2189 Py_DECREF(args);
2190 return result;
2191}
2192
#ifdef USE_BUILTIN_COPY
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* replace *object, in place, with copy.deepcopy(*object, memo).
       returns 1 on success and 0 on failure (exception set; *object is
       left untouched in that case). */

    PyObject* copy;

    /* the pattern slots this is applied to are treated as optional
       elsewhere in this file (Py_XDECREF / Py_XINCREF), so an empty
       slot is simply left empty */
    if (!*object)
        return 1;

    copy = call(
        "copy", "deepcopy",
        PyTuple_Pack(2, *object, memo)
        );
    if (!copy)
        return 0;

    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
2212
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002213static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002214join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002215{
2216 /* join list elements */
2217
2218 PyObject* joiner;
2219#if PY_VERSION_HEX >= 0x01060000
2220 PyObject* function;
2221 PyObject* args;
2222#endif
2223 PyObject* result;
2224
2225 switch (PyList_GET_SIZE(list)) {
2226 case 0:
2227 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00002228 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002229 case 1:
2230 result = PyList_GET_ITEM(list, 0);
2231 Py_INCREF(result);
2232 Py_DECREF(list);
2233 return result;
2234 }
2235
2236 /* two or more elements: slice out a suitable separator from the
2237 first member, and use that to join the entire list */
2238
2239 joiner = PySequence_GetSlice(pattern, 0, 0);
2240 if (!joiner)
2241 return NULL;
2242
2243#if PY_VERSION_HEX >= 0x01060000
2244 function = PyObject_GetAttrString(joiner, "join");
2245 if (!function) {
2246 Py_DECREF(joiner);
2247 return NULL;
2248 }
2249 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002250 if (!args) {
2251 Py_DECREF(function);
2252 Py_DECREF(joiner);
2253 return NULL;
2254 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002255 PyTuple_SET_ITEM(args, 0, list);
2256 result = PyObject_CallObject(function, args);
2257 Py_DECREF(args); /* also removes list */
2258 Py_DECREF(function);
2259#else
2260 result = call(
2261 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002262 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002263 );
2264#endif
2265 Py_DECREF(joiner);
2266
2267 return result;
2268}
2269
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002270static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002271pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002272{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002273 SRE_STATE state;
2274 PyObject* list;
2275 int status;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002276 int i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002277
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002278 PyObject* string;
2279 int start = 0;
2280 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002281 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2282 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2283 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002284 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002285
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002286 string = state_init(&state, self, string, start, end);
2287 if (!string)
2288 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002289
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002290 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002291 if (!list) {
2292 state_fini(&state);
2293 return NULL;
2294 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002295
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002296 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002297
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002298 PyObject* item;
2299
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002300 state_reset(&state);
2301
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002302 state.ptr = state.start;
2303
2304 if (state.charsize == 1) {
2305 status = sre_search(&state, PatternObject_GetCode(self));
2306 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002307#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002308 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002309#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002310 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002311
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002312 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002313 if (status == 0)
2314 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002315 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002316 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002317 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002318
2319 /* don't bother to build a match object */
2320 switch (self->groups) {
2321 case 0:
2322 b = STATE_OFFSET(&state, state.start);
2323 e = STATE_OFFSET(&state, state.ptr);
2324 item = PySequence_GetSlice(string, b, e);
2325 if (!item)
2326 goto error;
2327 break;
2328 case 1:
2329 item = state_getslice(&state, 1, string, 1);
2330 if (!item)
2331 goto error;
2332 break;
2333 default:
2334 item = PyTuple_New(self->groups);
2335 if (!item)
2336 goto error;
2337 for (i = 0; i < self->groups; i++) {
2338 PyObject* o = state_getslice(&state, i+1, string, 1);
2339 if (!o) {
2340 Py_DECREF(item);
2341 goto error;
2342 }
2343 PyTuple_SET_ITEM(item, i, o);
2344 }
2345 break;
2346 }
2347
2348 status = PyList_Append(list, item);
2349 Py_DECREF(item);
2350 if (status < 0)
2351 goto error;
2352
2353 if (state.ptr == state.start)
2354 state.start = (void*) ((char*) state.ptr + state.charsize);
2355 else
2356 state.start = state.ptr;
2357
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002358 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002359
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002360 state_fini(&state);
2361 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002362
2363error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002364 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002365 state_fini(&state);
2366 return NULL;
2367
Guido van Rossumb700df92000-03-31 14:59:30 +00002368}
2369
#if PY_VERSION_HEX >= 0x02020000
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    /* finditer: create a scanner for <args>, then wrap its bound
       "search" method in a callable-iterator that stops at None */

    PyObject* scanner;
    PyObject* search;
    PyObject* iterator = NULL;

    scanner = pattern_scanner(pattern, args);
    if (scanner) {
        search = PyObject_GetAttrString(scanner, "search");
        Py_DECREF(scanner);
        if (search) {
            iterator = PyCallIter_New(search, Py_None);
            Py_DECREF(search);
        }
    }

    return iterator;
}
#endif
2393
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002394static PyObject*
2395pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2396{
2397 SRE_STATE state;
2398 PyObject* list;
2399 PyObject* item;
2400 int status;
2401 int n;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002402 int i;
2403 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002404
2405 PyObject* string;
2406 int maxsplit = 0;
2407 static char* kwlist[] = { "source", "maxsplit", NULL };
2408 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2409 &string, &maxsplit))
2410 return NULL;
2411
2412 string = state_init(&state, self, string, 0, INT_MAX);
2413 if (!string)
2414 return NULL;
2415
2416 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002417 if (!list) {
2418 state_fini(&state);
2419 return NULL;
2420 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002421
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002422 n = 0;
2423 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002424
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002425 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002426
2427 state_reset(&state);
2428
2429 state.ptr = state.start;
2430
2431 if (state.charsize == 1) {
2432 status = sre_search(&state, PatternObject_GetCode(self));
2433 } else {
2434#if defined(HAVE_UNICODE)
2435 status = sre_usearch(&state, PatternObject_GetCode(self));
2436#endif
2437 }
2438
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002439 if (status <= 0) {
2440 if (status == 0)
2441 break;
2442 pattern_error(status);
2443 goto error;
2444 }
2445
2446 if (state.start == state.ptr) {
2447 if (last == state.end)
2448 break;
2449 /* skip one character */
2450 state.start = (void*) ((char*) state.ptr + state.charsize);
2451 continue;
2452 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002453
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002454 /* get segment before this match */
2455 item = PySequence_GetSlice(
2456 string, STATE_OFFSET(&state, last),
2457 STATE_OFFSET(&state, state.start)
2458 );
2459 if (!item)
2460 goto error;
2461 status = PyList_Append(list, item);
2462 Py_DECREF(item);
2463 if (status < 0)
2464 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002465
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466 /* add groups (if any) */
2467 for (i = 0; i < self->groups; i++) {
2468 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002469 if (!item)
2470 goto error;
2471 status = PyList_Append(list, item);
2472 Py_DECREF(item);
2473 if (status < 0)
2474 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002475 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002476
2477 n = n + 1;
2478
2479 last = state.start = state.ptr;
2480
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002481 }
2482
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002483 /* get segment following last match (even if empty) */
2484 item = PySequence_GetSlice(
2485 string, STATE_OFFSET(&state, last), state.endpos
2486 );
2487 if (!item)
2488 goto error;
2489 status = PyList_Append(list, item);
2490 Py_DECREF(item);
2491 if (status < 0)
2492 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002493
2494 state_fini(&state);
2495 return list;
2496
2497error:
2498 Py_DECREF(list);
2499 state_fini(&state);
2500 return NULL;
2501
2502}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002503
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002504static PyObject*
2505pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2506 int count, int subn)
2507{
2508 SRE_STATE state;
2509 PyObject* list;
2510 PyObject* item;
2511 PyObject* filter;
2512 PyObject* args;
2513 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002514 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002515 int status;
2516 int n;
2517 int i, b, e;
2518 int filter_is_callable;
2519
Fredrik Lundhdac58492001-10-21 21:48:30 +00002520 if (PyCallable_Check(template)) {
2521 /* sub/subn takes either a function or a template */
2522 filter = template;
2523 Py_INCREF(filter);
2524 filter_is_callable = 1;
2525 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002526 /* if not callable, check if it's a literal string */
2527 int literal;
2528 ptr = getstring(template, &n, &b);
2529 if (ptr) {
2530 if (b == 1) {
2531 literal = sre_literal_template(ptr, n);
2532 } else {
2533#if defined(HAVE_UNICODE)
2534 literal = sre_uliteral_template(ptr, n);
2535#endif
2536 }
2537 } else {
2538 PyErr_Clear();
2539 literal = 0;
2540 }
2541 if (literal) {
2542 filter = template;
2543 Py_INCREF(filter);
2544 filter_is_callable = 0;
2545 } else {
2546 /* not a literal; hand it over to the template compiler */
2547 filter = call(
2548 SRE_MODULE, "_subx",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002549 PyTuple_Pack(2, self, template)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002550 );
2551 if (!filter)
2552 return NULL;
2553 filter_is_callable = PyCallable_Check(filter);
2554 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002555 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002556
2557 string = state_init(&state, self, string, 0, INT_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002558 if (!string) {
2559 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002560 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002561 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002562
2563 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002564 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002565 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002566 state_fini(&state);
2567 return NULL;
2568 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002569
2570 n = i = 0;
2571
2572 while (!count || n < count) {
2573
2574 state_reset(&state);
2575
2576 state.ptr = state.start;
2577
2578 if (state.charsize == 1) {
2579 status = sre_search(&state, PatternObject_GetCode(self));
2580 } else {
2581#if defined(HAVE_UNICODE)
2582 status = sre_usearch(&state, PatternObject_GetCode(self));
2583#endif
2584 }
2585
2586 if (status <= 0) {
2587 if (status == 0)
2588 break;
2589 pattern_error(status);
2590 goto error;
2591 }
2592
2593 b = STATE_OFFSET(&state, state.start);
2594 e = STATE_OFFSET(&state, state.ptr);
2595
2596 if (i < b) {
2597 /* get segment before this match */
2598 item = PySequence_GetSlice(string, i, b);
2599 if (!item)
2600 goto error;
2601 status = PyList_Append(list, item);
2602 Py_DECREF(item);
2603 if (status < 0)
2604 goto error;
2605
2606 } else if (i == b && i == e && n > 0)
2607 /* ignore empty match on latest position */
2608 goto next;
2609
2610 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002611 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002612 match = pattern_new_match(self, &state, 1);
2613 if (!match)
2614 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002615 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002616 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002617 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002618 goto error;
2619 }
2620 item = PyObject_CallObject(filter, args);
2621 Py_DECREF(args);
2622 Py_DECREF(match);
2623 if (!item)
2624 goto error;
2625 } else {
2626 /* filter is literal string */
2627 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002628 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002629 }
2630
2631 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002632 if (item != Py_None) {
2633 status = PyList_Append(list, item);
2634 Py_DECREF(item);
2635 if (status < 0)
2636 goto error;
2637 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002638
2639 i = e;
2640 n = n + 1;
2641
2642next:
2643 /* move on */
2644 if (state.ptr == state.start)
2645 state.start = (void*) ((char*) state.ptr + state.charsize);
2646 else
2647 state.start = state.ptr;
2648
2649 }
2650
2651 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002652 if (i < state.endpos) {
2653 item = PySequence_GetSlice(string, i, state.endpos);
2654 if (!item)
2655 goto error;
2656 status = PyList_Append(list, item);
2657 Py_DECREF(item);
2658 if (status < 0)
2659 goto error;
2660 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002661
2662 state_fini(&state);
2663
Guido van Rossum4e173842001-12-07 04:25:10 +00002664 Py_DECREF(filter);
2665
Fredrik Lundhdac58492001-10-21 21:48:30 +00002666 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002667 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002668
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002669 if (!item)
2670 return NULL;
2671
2672 if (subn)
2673 return Py_BuildValue("Ni", item, n);
2674
2675 return item;
2676
2677error:
2678 Py_DECREF(list);
2679 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002680 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002681 return NULL;
2682
2683}
2684
2685static PyObject*
2686pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2687{
2688 PyObject* template;
2689 PyObject* string;
2690 int count = 0;
2691 static char* kwlist[] = { "repl", "string", "count", NULL };
2692 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2693 &template, &string, &count))
2694 return NULL;
2695
2696 return pattern_subx(self, template, string, count, 0);
2697}
2698
2699static PyObject*
2700pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2701{
2702 PyObject* template;
2703 PyObject* string;
2704 int count = 0;
2705 static char* kwlist[] = { "repl", "string", "count", NULL };
2706 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2707 &template, &string, &count))
2708 return NULL;
2709
2710 return pattern_subx(self, template, string, count, 1);
2711}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002712
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002713static PyObject*
2714pattern_copy(PatternObject* self, PyObject* args)
2715{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002716#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002717 PatternObject* copy;
2718 int offset;
2719
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002720 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2721 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002722
2723 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2724 if (!copy)
2725 return NULL;
2726
2727 offset = offsetof(PatternObject, groups);
2728
2729 Py_XINCREF(self->groupindex);
2730 Py_XINCREF(self->indexgroup);
2731 Py_XINCREF(self->pattern);
2732
2733 memcpy((char*) copy + offset, (char*) self + offset,
2734 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2735
2736 return (PyObject*) copy;
2737#else
2738 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2739 return NULL;
2740#endif
2741}
2742
2743static PyObject*
2744pattern_deepcopy(PatternObject* self, PyObject* args)
2745{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002746#ifdef USE_BUILTIN_COPY
2747 PatternObject* copy;
2748
2749 PyObject* memo;
2750 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2751 return NULL;
2752
2753 copy = (PatternObject*) pattern_copy(self, Py_None);
2754 if (!copy)
2755 return NULL;
2756
2757 if (!deepcopy(&copy->groupindex, memo) ||
2758 !deepcopy(&copy->indexgroup, memo) ||
2759 !deepcopy(&copy->pattern, memo)) {
2760 Py_DECREF(copy);
2761 return NULL;
2762 }
2763
2764#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002765 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2766 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002767#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002768}
2769
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002770static PyMethodDef pattern_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002771 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
2772 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
2773 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
2774 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
2775 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
2776 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002777#if PY_VERSION_HEX >= 0x02020000
2778 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS},
2779#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002780 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002781 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2782 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002783 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002784};
2785
2786static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002787pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002788{
2789 PyObject* res;
2790
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002791 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002792
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002793 if (res)
2794 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002795
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002796 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002797
2798 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002799 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002800 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002801 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002802 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002803
2804 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002805 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002806
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002807 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002808 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002809
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002810 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002811 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002812 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002813 }
2814
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002815 PyErr_SetString(PyExc_AttributeError, name);
2816 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002817}
2818
2819statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002820 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002821 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002822 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002823 (destructor)pattern_dealloc, /*tp_dealloc*/
2824 0, /*tp_print*/
2825 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002826};
2827
2828/* -------------------------------------------------------------------- */
2829/* match methods */
2830
2831static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002832match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002833{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002834 Py_XDECREF(self->regs);
2835 Py_XDECREF(self->string);
2836 Py_DECREF(self->pattern);
2837 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002838}
2839
2840static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002841match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002842{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002843 if (index < 0 || index >= self->groups) {
2844 /* raise IndexError if we were given a bad group number */
2845 PyErr_SetString(
2846 PyExc_IndexError,
2847 "no such group"
2848 );
2849 return NULL;
2850 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002851
Fredrik Lundh6f013982000-07-03 18:44:21 +00002852 index *= 2;
2853
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002854 if (self->string == Py_None || self->mark[index] < 0) {
2855 /* return default value if the string or group is undefined */
2856 Py_INCREF(def);
2857 return def;
2858 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002859
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002860 return PySequence_GetSlice(
2861 self->string, self->mark[index], self->mark[index+1]
2862 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002863}
2864
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002865static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002866match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002867{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002868 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002869
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002870 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002871 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002872
Fredrik Lundh6f013982000-07-03 18:44:21 +00002873 i = -1;
2874
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002875 if (self->pattern->groupindex) {
2876 index = PyObject_GetItem(self->pattern->groupindex, index);
2877 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002878 if (PyInt_Check(index))
2879 i = (int) PyInt_AS_LONG(index);
2880 Py_DECREF(index);
2881 } else
2882 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002883 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002884
2885 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002886}
2887
2888static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002889match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002890{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002891 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002892}
2893
2894static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002895match_expand(MatchObject* self, PyObject* args)
2896{
2897 PyObject* template;
2898 if (!PyArg_ParseTuple(args, "O:expand", &template))
2899 return NULL;
2900
2901 /* delegate to Python code */
2902 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002903 SRE_MODULE, "_expand",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002904 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002905 );
2906}
2907
2908static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002909match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002910{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002911 PyObject* result;
2912 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002913
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002914 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002915
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002916 switch (size) {
2917 case 0:
2918 result = match_getslice(self, Py_False, Py_None);
2919 break;
2920 case 1:
2921 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2922 break;
2923 default:
2924 /* fetch multiple items */
2925 result = PyTuple_New(size);
2926 if (!result)
2927 return NULL;
2928 for (i = 0; i < size; i++) {
2929 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002930 self, PyTuple_GET_ITEM(args, i), Py_None
2931 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002932 if (!item) {
2933 Py_DECREF(result);
2934 return NULL;
2935 }
2936 PyTuple_SET_ITEM(result, i, item);
2937 }
2938 break;
2939 }
2940 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002941}
2942
2943static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002944match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002945{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002946 PyObject* result;
2947 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002949 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002950 static char* kwlist[] = { "default", NULL };
2951 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002952 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002953
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002954 result = PyTuple_New(self->groups-1);
2955 if (!result)
2956 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002957
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002958 for (index = 1; index < self->groups; index++) {
2959 PyObject* item;
2960 item = match_getslice_by_index(self, index, def);
2961 if (!item) {
2962 Py_DECREF(result);
2963 return NULL;
2964 }
2965 PyTuple_SET_ITEM(result, index-1, item);
2966 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002967
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002968 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002969}
2970
2971static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002972match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002973{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002974 PyObject* result;
2975 PyObject* keys;
2976 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002977
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002978 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002979 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002980 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002981 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002982
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002983 result = PyDict_New();
2984 if (!result || !self->pattern->groupindex)
2985 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002986
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002987 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002988 if (!keys)
2989 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002990
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002991 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002992 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002993 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002994 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002995 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002996 if (!key)
2997 goto failed;
2998 value = match_getslice(self, key, def);
2999 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003000 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003001 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003002 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003003 status = PyDict_SetItem(result, key, value);
3004 Py_DECREF(value);
3005 if (status < 0)
3006 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003007 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003008
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003009 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003010
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003011 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003012
3013failed:
3014 Py_DECREF(keys);
3015 Py_DECREF(result);
3016 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003017}
3018
3019static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003020match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003021{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003022 int index;
3023
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003024 PyObject* index_ = Py_False; /* zero */
3025 if (!PyArg_ParseTuple(args, "|O:start", &index_))
3026 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003027
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003028 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003029
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003030 if (index < 0 || index >= self->groups) {
3031 PyErr_SetString(
3032 PyExc_IndexError,
3033 "no such group"
3034 );
3035 return NULL;
3036 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003037
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003038 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003039 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003040}
3041
3042static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003043match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003044{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003045 int index;
3046
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003047 PyObject* index_ = Py_False; /* zero */
3048 if (!PyArg_ParseTuple(args, "|O:end", &index_))
3049 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003050
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003051 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003052
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003053 if (index < 0 || index >= self->groups) {
3054 PyErr_SetString(
3055 PyExc_IndexError,
3056 "no such group"
3057 );
3058 return NULL;
3059 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003060
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003061 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003062 return Py_BuildValue("i", self->mark[index*2+1]);
3063}
3064
3065LOCAL(PyObject*)
3066_pair(int i1, int i2)
3067{
3068 PyObject* pair;
3069 PyObject* item;
3070
3071 pair = PyTuple_New(2);
3072 if (!pair)
3073 return NULL;
3074
3075 item = PyInt_FromLong(i1);
3076 if (!item)
3077 goto error;
3078 PyTuple_SET_ITEM(pair, 0, item);
3079
3080 item = PyInt_FromLong(i2);
3081 if (!item)
3082 goto error;
3083 PyTuple_SET_ITEM(pair, 1, item);
3084
3085 return pair;
3086
3087 error:
3088 Py_DECREF(pair);
3089 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003090}
3091
3092static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003093match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003094{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003095 int index;
3096
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003097 PyObject* index_ = Py_False; /* zero */
3098 if (!PyArg_ParseTuple(args, "|O:span", &index_))
3099 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003100
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003101 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003102
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003103 if (index < 0 || index >= self->groups) {
3104 PyErr_SetString(
3105 PyExc_IndexError,
3106 "no such group"
3107 );
3108 return NULL;
3109 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003110
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003111 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003112 return _pair(self->mark[index*2], self->mark[index*2+1]);
3113}
3114
3115static PyObject*
3116match_regs(MatchObject* self)
3117{
3118 PyObject* regs;
3119 PyObject* item;
3120 int index;
3121
3122 regs = PyTuple_New(self->groups);
3123 if (!regs)
3124 return NULL;
3125
3126 for (index = 0; index < self->groups; index++) {
3127 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3128 if (!item) {
3129 Py_DECREF(regs);
3130 return NULL;
3131 }
3132 PyTuple_SET_ITEM(regs, index, item);
3133 }
3134
3135 Py_INCREF(regs);
3136 self->regs = regs;
3137
3138 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003139}
3140
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003141static PyObject*
3142match_copy(MatchObject* self, PyObject* args)
3143{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003144#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003145 MatchObject* copy;
3146 int slots, offset;
3147
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003148 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
3149 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003150
3151 slots = 2 * (self->pattern->groups+1);
3152
3153 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3154 if (!copy)
3155 return NULL;
3156
3157 /* this value a constant, but any compiler should be able to
3158 figure that out all by itself */
3159 offset = offsetof(MatchObject, string);
3160
3161 Py_XINCREF(self->pattern);
3162 Py_XINCREF(self->string);
3163 Py_XINCREF(self->regs);
3164
3165 memcpy((char*) copy + offset, (char*) self + offset,
3166 sizeof(MatchObject) + slots * sizeof(int) - offset);
3167
3168 return (PyObject*) copy;
3169#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003170 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003171 return NULL;
3172#endif
3173}
3174
3175static PyObject*
3176match_deepcopy(MatchObject* self, PyObject* args)
3177{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003178#ifdef USE_BUILTIN_COPY
3179 MatchObject* copy;
3180
3181 PyObject* memo;
3182 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
3183 return NULL;
3184
3185 copy = (MatchObject*) match_copy(self, Py_None);
3186 if (!copy)
3187 return NULL;
3188
3189 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3190 !deepcopy(&copy->string, memo) ||
3191 !deepcopy(&copy->regs, memo)) {
3192 Py_DECREF(copy);
3193 return NULL;
3194 }
3195
3196#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003197 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3198 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003199#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003200}
3201
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003202static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003203 {"group", (PyCFunction) match_group, METH_VARARGS},
3204 {"start", (PyCFunction) match_start, METH_VARARGS},
3205 {"end", (PyCFunction) match_end, METH_VARARGS},
3206 {"span", (PyCFunction) match_span, METH_VARARGS},
3207 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3208 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3209 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003210 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
3211 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003212 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003213};
3214
3215static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003216match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003217{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003218 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003219
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003220 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3221 if (res)
3222 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003223
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003224 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003225
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003226 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003227 if (self->lastindex >= 0)
3228 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003229 Py_INCREF(Py_None);
3230 return Py_None;
3231 }
3232
3233 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003234 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003235 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003236 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003237 );
3238 if (result)
3239 return result;
3240 PyErr_Clear();
3241 }
3242 Py_INCREF(Py_None);
3243 return Py_None;
3244 }
3245
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003246 if (!strcmp(name, "string")) {
3247 if (self->string) {
3248 Py_INCREF(self->string);
3249 return self->string;
3250 } else {
3251 Py_INCREF(Py_None);
3252 return Py_None;
3253 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003254 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003255
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003256 if (!strcmp(name, "regs")) {
3257 if (self->regs) {
3258 Py_INCREF(self->regs);
3259 return self->regs;
3260 } else
3261 return match_regs(self);
3262 }
3263
3264 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003265 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003266 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003267 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003268
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003269 if (!strcmp(name, "pos"))
3270 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003271
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003272 if (!strcmp(name, "endpos"))
3273 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003274
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003275 PyErr_SetString(PyExc_AttributeError, name);
3276 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003277}
3278
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3281
3282statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003283 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003284 0, "_" SRE_MODULE ".SRE_Match",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003285 sizeof(MatchObject), sizeof(int),
3286 (destructor)match_dealloc, /*tp_dealloc*/
3287 0, /*tp_print*/
3288 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003289};
3290
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003291/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003292/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003293
3294static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003295scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003296{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003297 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003298 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003299 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003300}
3301
3302static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003303scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003304{
3305 SRE_STATE* state = &self->state;
3306 PyObject* match;
3307 int status;
3308
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003309 state_reset(state);
3310
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003311 state->ptr = state->start;
3312
3313 if (state->charsize == 1) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00003314 status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003315 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003316#if defined(HAVE_UNICODE)
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00003317 status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003318#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003319 }
3320
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003321 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003322 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003323
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003324 if ((status == 0 || state->ptr == state->start) &&
3325 state->ptr < state->end)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003326 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003327 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003328 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003329
3330 return match;
3331}
3332
3333
3334static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003335scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003336{
3337 SRE_STATE* state = &self->state;
3338 PyObject* match;
3339 int status;
3340
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003341 state_reset(state);
3342
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003343 state->ptr = state->start;
3344
3345 if (state->charsize == 1) {
3346 status = sre_search(state, PatternObject_GetCode(self->pattern));
3347 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003348#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003349 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003350#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003351 }
3352
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003353 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003354 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003355
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003356 if ((status == 0 || state->ptr == state->start) &&
3357 state->ptr < state->end)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003358 state->start = (void*) ((char*) state->ptr + state->charsize);
3359 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003360 state->start = state->ptr;
3361
3362 return match;
3363}
3364
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003365static PyMethodDef scanner_methods[] = {
Neal Norwitzbb2769f2002-03-31 15:46:00 +00003366 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3367 /* METH_OLDARGS is not in Python 1.5.2 */
3368 {"match", (PyCFunction) scanner_match, 0},
3369 {"search", (PyCFunction) scanner_search, 0},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003370 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003371};
3372
3373static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003374scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003375{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003376 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003377
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003378 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3379 if (res)
3380 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003381
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003382 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003383
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003384 /* attributes */
3385 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003386 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003387 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003388 }
3389
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003390 PyErr_SetString(PyExc_AttributeError, name);
3391 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003392}
3393
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003394statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003395 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003396 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003397 sizeof(ScannerObject), 0,
3398 (destructor)scanner_dealloc, /*tp_dealloc*/
3399 0, /*tp_print*/
3400 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003401};
3402
Guido van Rossumb700df92000-03-31 14:59:30 +00003403static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003404 {"compile", _compile, METH_VARARGS},
3405 {"getcodesize", sre_codesize, METH_VARARGS},
3406 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003407 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003408};
3409
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003410#if PY_VERSION_HEX < 0x02030000
3411DL_EXPORT(void) init_sre(void)
3412#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003413PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003414#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003415{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003416 PyObject* m;
3417 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003418 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003419
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003420 /* Patch object types */
3421 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003422 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00003423
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003424 m = Py_InitModule("_" SRE_MODULE, _functions);
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003425 d = PyModule_GetDict(m);
3426
Fredrik Lundh21009b92001-09-18 18:47:09 +00003427 x = PyInt_FromLong(SRE_MAGIC);
3428 if (x) {
3429 PyDict_SetItemString(d, "MAGIC", x);
3430 Py_DECREF(x);
3431 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003432
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003433 x = PyInt_FromLong(sizeof(SRE_CODE));
3434 if (x) {
3435 PyDict_SetItemString(d, "CODESIZE", x);
3436 Py_DECREF(x);
3437 }
3438
Fredrik Lundh21009b92001-09-18 18:47:09 +00003439 x = PyString_FromString(copyright);
3440 if (x) {
3441 PyDict_SetItemString(d, "copyright", x);
3442 Py_DECREF(x);
3443 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003444}
3445
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003446#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003447
3448/* vim:ts=4:sw=4:et
3449*/