blob: 4d7b4fcc27a004c737e1ce79ddb2a89882e064e8 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
42#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000043#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000044
45#include "sre.h"
46
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000047#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000048
Fredrik Lundh436c3d582000-06-29 08:58:44 +000049/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000050#if !defined(SRE_MODULE)
51#define SRE_MODULE "sre"
52#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000053
Neal Norwitz94a9c092006-03-16 06:30:02 +000054#define SRE_PY_MODULE "re"
55
Guido van Rossumb700df92000-03-31 14:59:30 +000056/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000057#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000058
Fredrik Lundh971e78b2001-10-20 17:48:46 +000059#if PY_VERSION_HEX >= 0x01060000
60#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
63#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000064#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000065
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000067/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000068
69/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000070#define USE_FAST_SEARCH
71
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000072/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000073#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000074
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000075/* enables copy/deepcopy handling (work in progress) */
76#undef USE_BUILTIN_COPY
77
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000078#if PY_VERSION_HEX < 0x01060000
79#define PyObject_DEL(op) PyMem_DEL((op))
80#endif
81
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000082/* -------------------------------------------------------------------- */
83
Fredrik Lundh80946112000-06-29 18:03:25 +000084#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000085#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000086#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000087/* fastest possible local call under MSVC */
88#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000090#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000091#else
92#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000093#endif
94
95/* error codes */
96#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000097#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000098#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000099#define SRE_ERROR_MEMORY -9 /* out of memory */
100
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000101#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000102#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000103#else
104#define TRACE(v)
105#endif
106
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000107/* -------------------------------------------------------------------- */
108/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000109
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000110/* default character predicates (run sre_chars.py to regenerate tables) */
111
/* Bit flags stored in sre_char_info[]; one entry may carry several. */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* Character class flags for code points 0..127, indexed by code point.
   25 = DIGIT|ALNUM|WORD, 24 = ALNUM|WORD, 16 = WORD ('_'),
   6 = SPACE|LINEBREAK ('\n'), 2 = SPACE.  The table is never written,
   so it is declared const. */
static const char sre_char_info[128] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /* 0x60 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};

/* Lower-case mapping for code points 0..127: 'A'..'Z' (65..90) map to
   'a'..'z' (97..122), every other entry maps to itself. */
static const char sre_char_lower[128] = {
    /* 0x00 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /* 0x10 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /* 0x20 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /* 0x30 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /* 0x40 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
               93, 94, 95,
    /* 0x60 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
               124, 125, 126, 127
};

/* ASCII-only predicates.  Each yields the matching mask bit (non-zero)
   when ch is in 0..127 and belongs to the class, otherwise 0.
   The range test is written as !(ch & ~127) rather than ch < 128 so that
   a negative signed argument is rejected instead of being used as a
   (negative) table index.  Note that ch is evaluated more than once. */
#define SRE_IS_DIGIT(ch)\
    (!((ch) & ~127) ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    (!((ch) & ~127) ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    (!((ch) & ~127) ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    (!((ch) & ~127) ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    (!((ch) & ~127) ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

/* Return the ASCII lower-case form of ch; codes >= 128 are returned
   unchanged. */
static unsigned int sre_lower(unsigned int ch)
{
    return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
}
153
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000154/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000155/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
156 * warnings when c's type supports only numbers < N+1 */
/* Locale-aware predicates.  Codes with any bit above the low 8 set yield 0;
   everything else is delegated to the <ctype.h> classifier.  ch is
   evaluated more than once. */
#define SRE_LOC_IS_DIGIT(ch) (((ch) & ~255) ? 0 : isdigit((ch)))
#define SRE_LOC_IS_SPACE(ch) (((ch) & ~255) ? 0 : isspace((ch)))
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (((ch) & ~255) ? 0 : isalnum((ch)))
#define SRE_LOC_IS_WORD(ch) ((ch) == '_' || SRE_LOC_IS_ALNUM((ch)))

/* Lower-case ch through tolower() when it is below 256; larger codes
   are handed back untouched. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int)tolower((ch));
}
167
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000168/* unicode-specific character predicates */
169
170#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000171
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000172#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
173#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
174#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000175#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000176#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000177
178static unsigned int sre_lower_unicode(unsigned int ch)
179{
180 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
181}
182
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000183#endif
184
Guido van Rossumb700df92000-03-31 14:59:30 +0000185LOCAL(int)
186sre_category(SRE_CODE category, unsigned int ch)
187{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000188 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000189
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000190 case SRE_CATEGORY_DIGIT:
191 return SRE_IS_DIGIT(ch);
192 case SRE_CATEGORY_NOT_DIGIT:
193 return !SRE_IS_DIGIT(ch);
194 case SRE_CATEGORY_SPACE:
195 return SRE_IS_SPACE(ch);
196 case SRE_CATEGORY_NOT_SPACE:
197 return !SRE_IS_SPACE(ch);
198 case SRE_CATEGORY_WORD:
199 return SRE_IS_WORD(ch);
200 case SRE_CATEGORY_NOT_WORD:
201 return !SRE_IS_WORD(ch);
202 case SRE_CATEGORY_LINEBREAK:
203 return SRE_IS_LINEBREAK(ch);
204 case SRE_CATEGORY_NOT_LINEBREAK:
205 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000206
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000207 case SRE_CATEGORY_LOC_WORD:
208 return SRE_LOC_IS_WORD(ch);
209 case SRE_CATEGORY_LOC_NOT_WORD:
210 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000211
212#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000213 case SRE_CATEGORY_UNI_DIGIT:
214 return SRE_UNI_IS_DIGIT(ch);
215 case SRE_CATEGORY_UNI_NOT_DIGIT:
216 return !SRE_UNI_IS_DIGIT(ch);
217 case SRE_CATEGORY_UNI_SPACE:
218 return SRE_UNI_IS_SPACE(ch);
219 case SRE_CATEGORY_UNI_NOT_SPACE:
220 return !SRE_UNI_IS_SPACE(ch);
221 case SRE_CATEGORY_UNI_WORD:
222 return SRE_UNI_IS_WORD(ch);
223 case SRE_CATEGORY_UNI_NOT_WORD:
224 return !SRE_UNI_IS_WORD(ch);
225 case SRE_CATEGORY_UNI_LINEBREAK:
226 return SRE_UNI_IS_LINEBREAK(ch);
227 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
228 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000229#else
230 case SRE_CATEGORY_UNI_DIGIT:
231 return SRE_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_NOT_DIGIT:
233 return !SRE_IS_DIGIT(ch);
234 case SRE_CATEGORY_UNI_SPACE:
235 return SRE_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_NOT_SPACE:
237 return !SRE_IS_SPACE(ch);
238 case SRE_CATEGORY_UNI_WORD:
239 return SRE_LOC_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_NOT_WORD:
241 return !SRE_LOC_IS_WORD(ch);
242 case SRE_CATEGORY_UNI_LINEBREAK:
243 return SRE_IS_LINEBREAK(ch);
244 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
245 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000246#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000247 }
248 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000249}
250
251/* helpers */
252
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000253static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000254data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000255{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000256 if (state->data_stack) {
Jack Diederich2d400772006-05-27 15:44:34 +0000257 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000258 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000259 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000260 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000261}
262
263static int
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000264data_stack_grow(SRE_STATE* state, int size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000265{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000266 int minsize, cursize;
267 minsize = state->data_stack_base+size;
268 cursize = state->data_stack_size;
269 if (cursize < minsize) {
270 void* stack;
271 cursize = minsize+minsize/4+1024;
272 TRACE(("allocate/grow stack %d\n", cursize));
Jack Diederich2d400772006-05-27 15:44:34 +0000273 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000274 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000275 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000276 return SRE_ERROR_MEMORY;
277 }
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000278 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000279 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000280 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000281 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000282}
283
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000284/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000285
286#define SRE_CHAR unsigned char
287#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000288#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000289#define SRE_CHARSET sre_charset
290#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000291#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000292#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000293#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000294#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000295
296#if defined(HAVE_UNICODE)
297
Guido van Rossumb700df92000-03-31 14:59:30 +0000298#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000299#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000300#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000301
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000302#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000303#undef SRE_SEARCH
304#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000305#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000306#undef SRE_INFO
307#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000308#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000309#undef SRE_AT
310#undef SRE_CHAR
311
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000312/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000313
314#define SRE_CHAR Py_UNICODE
315#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000316#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000317#define SRE_CHARSET sre_ucharset
318#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000319#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000320#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000321#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000322#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000323#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000324
325#endif /* SRE_RECURSIVE */
326
327/* -------------------------------------------------------------------- */
328/* String matching engine */
329
330/* the following section is compiled twice, with different character
331 settings */
332
333LOCAL(int)
334SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
335{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000336 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000337
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000338 int thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000341
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000343 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000345
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000346 case SRE_AT_BEGINNING_LINE:
347 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000348 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000349
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000350 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000351 return (((void*) (ptr+1) == state->end &&
352 SRE_IS_LINEBREAK((int) ptr[0])) ||
353 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000354
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000355 case SRE_AT_END_LINE:
356 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000357 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000358
Fredrik Lundh770617b2001-01-14 15:06:11 +0000359 case SRE_AT_END_STRING:
360 return ((void*) ptr == state->end);
361
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000362 case SRE_AT_BOUNDARY:
363 if (state->beginning == state->end)
364 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000365 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000366 SRE_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000367 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000368 SRE_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000369 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000370
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000371 case SRE_AT_NON_BOUNDARY:
372 if (state->beginning == state->end)
373 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000374 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000375 SRE_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000376 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000377 SRE_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000378 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000379
380 case SRE_AT_LOC_BOUNDARY:
381 if (state->beginning == state->end)
382 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000383 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000384 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000385 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000386 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000387 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000388
389 case SRE_AT_LOC_NON_BOUNDARY:
390 if (state->beginning == state->end)
391 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000392 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000393 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000394 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000395 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000396 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000397
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000398#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000399 case SRE_AT_UNI_BOUNDARY:
400 if (state->beginning == state->end)
401 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000402 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000403 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000404 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000405 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000406 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000407
408 case SRE_AT_UNI_NON_BOUNDARY:
409 if (state->beginning == state->end)
410 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000411 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000412 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000413 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000414 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000415 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000416#endif
417
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000418 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000419
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000420 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000421}
422
423LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000424SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000425{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000427
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000428 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000429
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000430 for (;;) {
431 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000432
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000433 case SRE_OP_FAILURE:
434 return !ok;
435
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000436 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000437 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000438 if (ch == set[0])
439 return ok;
440 set++;
441 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000442
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000443 case SRE_OP_CATEGORY:
444 /* <CATEGORY> <code> */
445 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000446 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000447 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000448 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000449
Fredrik Lundh3562f112000-07-02 12:00:07 +0000450 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000451 if (sizeof(SRE_CODE) == 2) {
452 /* <CHARSET> <bitmap> (16 bits per code word) */
453 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
454 return ok;
455 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000456 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000457 else {
458 /* <CHARSET> <bitmap> (32 bits per code word) */
459 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
460 return ok;
461 set += 8;
462 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000463 break;
464
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000465 case SRE_OP_RANGE:
466 /* <RANGE> <lower> <upper> */
467 if (set[0] <= ch && ch <= set[1])
468 return ok;
469 set += 2;
470 break;
471
472 case SRE_OP_NEGATE:
473 ok = !ok;
474 break;
475
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000476 case SRE_OP_BIGCHARSET:
477 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
478 {
479 int count, block;
480 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000481
482 if (sizeof(SRE_CODE) == 2) {
483 block = ((unsigned char*)set)[ch >> 8];
484 set += 128;
485 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
486 return ok;
487 set += count*16;
488 }
489 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000490 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
491 * warnings when c's type supports only numbers < N+1 */
492 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000493 block = ((unsigned char*)set)[ch >> 8];
494 else
495 block = -1;
496 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000497 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000498 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
499 return ok;
500 set += count*8;
501 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000502 break;
503 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000504
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000505 default:
506 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000507 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000508 return 0;
509 }
510 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000511}
512
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000513LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000514
515LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000516SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000517{
518 SRE_CODE chr;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000519 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
520 SRE_CHAR* end = (SRE_CHAR *)state->end;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000521 int i;
522
523 /* adjust end */
524 if (maxcount < end - ptr && maxcount != 65535)
525 end = ptr + maxcount;
526
527 switch (pattern[0]) {
528
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000529 case SRE_OP_IN:
530 /* repeated set */
531 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
532 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
533 ptr++;
534 break;
535
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000536 case SRE_OP_ANY:
537 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000538 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000539 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
540 ptr++;
541 break;
542
543 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000544 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000545 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000546 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000547 ptr = end;
548 break;
549
550 case SRE_OP_LITERAL:
551 /* repeated literal */
552 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000553 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000554 while (ptr < end && (SRE_CODE) *ptr == chr)
555 ptr++;
556 break;
557
558 case SRE_OP_LITERAL_IGNORE:
559 /* repeated literal */
560 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000561 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000562 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
563 ptr++;
564 break;
565
566 case SRE_OP_NOT_LITERAL:
567 /* repeated non-literal */
568 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000569 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000570 while (ptr < end && (SRE_CODE) *ptr != chr)
571 ptr++;
572 break;
Tim Peters3d563502006-01-21 02:47:53 +0000573
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000574 case SRE_OP_NOT_LITERAL_IGNORE:
575 /* repeated non-literal */
576 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000577 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000578 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
579 ptr++;
580 break;
581
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000582 default:
583 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000584 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000586 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000587 if (i < 0)
588 return i;
589 if (!i)
590 break;
591 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000592 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
593 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000594 return (SRE_CHAR*) state->ptr - ptr;
595 }
596
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000597 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000598 return ptr - (SRE_CHAR*) state->ptr;
599}
600
#if 0 /* not used in this release */
/* Check whether the SRE_OP_INFO block at `pattern` is compatible with the
   text starting at state->ptr.  Returns the number of SRE_CODE units the
   caller should skip when it is, and 0 when the block rules out a match. */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    SRE_CHAR* text = state->ptr;
    SRE_CHAR* limit = state->end;
    int k;

    /* minimal length: a zero minimum imposes no constraint */
    if (pattern[3] != 0 && (limit - text) < pattern[3])
        return 0;

    /* without a known prefix longer than one code, only the block
       itself is skipped */
    if (!(pattern[2] & SRE_INFO_PREFIX) || pattern[5] <= 1)
        return pattern[0];

    /* <length> <skip> <prefix data> <overlap data> */
    k = 0;
    while (k < pattern[5]) {
        if ((SRE_CODE) text[k] != pattern[7 + k])
            return 0;
        k++;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000628
/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (in other words,
 * those that will backtrack). In detail:
 *
 * - Recursive SRE_MATCH() returned true: that is usually a success
 *   (except for atypical cases like ASSERT_NOT), so there is no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: if the current SRE_MATCH() is the
 *   top function of the recursion, returning false is a matching
 *   failure, and it doesn't matter where lastmark is pointing.
 *   If it is *not* the top function, the return is itself a recursive
 *   SRE_MATCH() failure, and the calling SRE_MATCH() will have to deal
 *   with it by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and execution will continue
 *   in the outer 'for' loop: lastmark must be restored when breaking,
 *   since the next OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: lastmark must be restored between
 *   loop iterations, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, see the discussion at SF patch #712900.
 */
/* Copy state->lastmark and state->lastindex into the current match
   context (ctx) so they can be rolled back later. */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastindex = state->lastindex; \
        ctx->lastmark = state->lastmark; \
    } while (0)

/* Put back the state->lastmark and state->lastindex values that are
   stored in the current match context (ctx). */
#define LASTMARK_RESTORE() \
    do { \
        state->lastindex = ctx->lastindex; \
        state->lastmark = ctx->lastmark; \
    } while (0)
667
/* Exit helpers.  They expect an int variable `ret` and a label `exit` in
 * the enclosing function.
 *
 *   RETURN_ERROR(i)      return i from the enclosing function at once
 *   RETURN_FAILURE       set ret = 0 and jump to `exit`
 *   RETURN_SUCCESS       set ret = 1 and jump to `exit`
 *   RETURN_ON_ERROR(i)   RETURN_ERROR(i) when i is negative
 *   RETURN_ON_SUCCESS(i) as RETURN_ON_ERROR, then RETURN_SUCCESS when i > 0
 *   RETURN_ON_FAILURE(i) as RETURN_ON_ERROR, then RETURN_FAILURE when i == 0
 *
 * The argument is parenthesized in every expansion so that an arbitrary
 * expression (for example a conditional expression) is compared as a
 * whole.  It may still be evaluated more than once, so it must not have
 * side effects.
 */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
678
/* Turn a macro argument into a string literal (used to name types in
   TRACE output). */
#define SFY(x) #x

/* Data stack helpers.
 *
 * These macros operate on state->data_stack, using data_stack_base as the
 * offset of the first free byte and data_stack_size as the capacity.  They
 * rely on the locals `alloc_pos`, `ctx_pos` and `ctx` of the enclosing
 * function.  When more room is needed data_stack_grow() is called; a
 * negative result is returned from the enclosing function immediately.
 * After growing, `ctx` is looked up again from `ctx_pos` (presumably
 * because growing may relocate the buffer).
 *
 * Macro arguments are parenthesized so compound expressions such as
 * `a + b` can be passed as `size`.  `size` and `data` are evaluated more
 * than once and must not have side effects.  Values handed to TRACE are
 * cast to match the %d / %p conversions in the format strings.
 */

/* Reserve sizeof(type) bytes on top of the stack and point `ptr` at them;
   the offset of the reservation is left in `alloc_pos`. */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), alloc_pos, (int) sizeof(type))); \
    if ((state)->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    ptr = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)

/* Point `ptr` at the object of the given type stored at byte offset `pos`. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), (pos))); \
    ptr = (type*)((state)->data_stack+(pos)); \
} while (0)

/* Copy `size` bytes from `data` onto the top of the stack.  `data` is
   read after any growth, so it may legitimately refer to `ctx`. */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           (void*)(data), (int)(state)->data_stack_base, (int)(size))); \
    if ((state)->data_stack_size < (state)->data_stack_base+(size)) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)

/* Copy the top `size` bytes of the stack into `data`; the bytes are
   removed from the stack only when `discard` is non-zero. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           (void*)(data), (int)((state)->data_stack_base-(size)), \
           (int)(size))); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), \
           (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* Drop the top `size` bytes of the stack without copying them. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           (int)((state)->data_stack_base-(size)), (int)(size))); \
    (state)->data_stack_base -= (size); \
} while(0)

/* Shorthands that operate on the local `state`; for PUSH/POP/POP_DISCARD
   the size is taken from the pointed-to object. */
#define DATA_PUSH(x) \
    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
    DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
    DATA_STACK_LOOKUP_AT(state,t,p,pos)
742
/* Save / restore the state->mark array on the data stack.
 *
 * Each macro does nothing unless `lastmark` is positive; otherwise it
 * moves (lastmark+1) pointer-sized entries:
 *
 *   MARK_PUSH          copy state->mark onto the stack
 *   MARK_POP           copy the saved entries back and remove them
 *   MARK_POP_KEEP      copy the saved entries back but leave them stacked
 *   MARK_POP_DISCARD   remove the saved entries without copying
 *
 * MARK_PUSH uses the local `i` of the enclosing function.  `lastmark` is
 * parenthesized so that any expression can be passed.
 */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
760
/* Jump codes stored in SRE_MATCH_CONTEXT.jump by DO_JUMP().  Each code
   names the place in SRE_MATCH() that started a nested match. */

/* outermost context: not started by a jump */
#define JUMP_NONE            0

/* SRE_OP_MAX_UNTIL */
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3

/* SRE_OP_MIN_UNTIL */
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6

/* SRE_OP_REPEAT */
#define JUMP_REPEAT          7

/* SRE_OP_REPEAT_ONE: literal-tail loop (1) and general loop (2) */
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9

/* SRE_OP_MIN_REPEAT_ONE */
#define JUMP_MIN_REPEAT_ONE  10

/* SRE_OP_BRANCH */
#define JUMP_BRANCH          11

/* assertion opcodes */
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
775
/* Start matching `nextpattern` in a fresh SRE_MATCH_CONTEXT.
 *
 * A new context is allocated on the data stack (DATA_ALLOC leaves its
 * offset in alloc_pos).  It records the offset of the current context in
 * last_ctx_pos, the code `jumpvalue` and the pattern to run; it then
 * becomes the current context and control transfers to the `entrance`
 * label.
 *
 * `jumplabel` is defined right after the goto.  Presumably the code that
 * finishes a context (not visible in this part of the file) uses the saved
 * jump code to resume at this label with the result in `ret` -- confirm
 * against the `exit` handling.
 *
 * The expansion is a sequence of statements, not a single one, so it must
 * not be used as the unbraced body of an if/else/loop.  It ends in
 * `while (0)` so that the label is always followed by a statement; the
 * caller supplies the terminating semicolon.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = jumpvalue; \
    nextctx->pattern = nextpattern; \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */ \
786
/* Working state of one (possibly nested) run of SRE_MATCH().  Contexts are
   allocated on state->data_stack by DATA_ALLOC / DO_JUMP. */
typedef struct {
    /* data-stack offset of the context that started this one (copied from
       ctx_pos by DO_JUMP); -1 for the outermost context */
    int last_ctx_pos;
    /* JUMP_* code recorded by DO_JUMP; JUMP_NONE for the outermost context */
    int jump;
    /* current position in the text; initialised from state->ptr on entry */
    SRE_CHAR* ptr;
    /* next pattern code to execute */
    SRE_CODE* pattern;
    /* repetition counter used by the repeat operators */
    int count;
    /* copies of state->lastmark / state->lastindex made by LASTMARK_SAVE() */
    int lastmark;
    int lastindex;
    /* per-operator scratch value; only one member is in use at a time */
    union {
        /* REPEAT_ONE: literal character that the tail starts with */
        SRE_CODE chr;
        /* BRANCH, REPEAT, MAX_UNTIL, MIN_UNTIL: repeat context in use */
        SRE_REPEAT* rep;
    } u;
} SRE_MATCH_CONTEXT;
800
801/* check if string matches the given pattern. returns <0 for
802 error, 0 for failure, and 1 for success */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000803LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000804SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000805{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000806 SRE_CHAR* end = (SRE_CHAR *)state->end;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000807 int alloc_pos, ctx_pos = -1;
808 int i, ret = 0;
809 int jump;
Guido van Rossumb700df92000-03-31 14:59:30 +0000810
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000811 SRE_MATCH_CONTEXT* ctx;
812 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000813
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000814 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000815
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000816 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
817 ctx->last_ctx_pos = -1;
818 ctx->jump = JUMP_NONE;
819 ctx->pattern = pattern;
820 ctx_pos = alloc_pos;
821
822entrance:
823
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000824 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000825
826 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000827 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000828 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000830 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 (end - ctx->ptr), ctx->pattern[3]));
832 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000833 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000834 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000835 }
836
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000837 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000839 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000840
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000841 case SRE_OP_MARK:
842 /* set mark */
843 /* <MARK> <gid> */
844 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
845 ctx->ptr, ctx->pattern[0]));
846 i = ctx->pattern[0];
847 if (i & 1)
848 state->lastindex = i/2 + 1;
849 if (i > state->lastmark) {
850 /* state->lastmark is the highest valid index in the
851 state->mark array. If it is increased by more than 1,
852 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000853 that these marks have not been encountered. */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000854 int j = state->lastmark + 1;
855 while (j < i)
856 state->mark[j++] = NULL;
857 state->lastmark = i;
858 }
859 state->mark[i] = ctx->ptr;
860 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000862
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000863 case SRE_OP_LITERAL:
864 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000865 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000866 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
867 ctx->ptr, *ctx->pattern));
868 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
869 RETURN_FAILURE;
870 ctx->pattern++;
871 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000873
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000874 case SRE_OP_NOT_LITERAL:
875 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000876 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000877 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
878 ctx->ptr, *ctx->pattern));
879 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
880 RETURN_FAILURE;
881 ctx->pattern++;
882 ctx->ptr++;
883 break;
884
885 case SRE_OP_SUCCESS:
886 /* end of pattern */
887 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
888 state->ptr = ctx->ptr;
889 RETURN_SUCCESS;
890
891 case SRE_OP_AT:
892 /* match at given position */
893 /* <AT> <code> */
894 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
895 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
896 RETURN_FAILURE;
897 ctx->pattern++;
898 break;
899
900 case SRE_OP_CATEGORY:
901 /* match at given category */
902 /* <CATEGORY> <code> */
903 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
904 ctx->ptr, *ctx->pattern));
905 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
906 RETURN_FAILURE;
907 ctx->pattern++;
908 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000911 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000912 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000913 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000914 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
915 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
916 RETURN_FAILURE;
917 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000918 break;
919
920 case SRE_OP_ANY_ALL:
921 /* match anything */
922 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000923 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
924 if (ctx->ptr >= end)
925 RETURN_FAILURE;
926 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000927 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000928
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000929 case SRE_OP_IN:
930 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000931 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000932 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
933 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
934 RETURN_FAILURE;
935 ctx->pattern += ctx->pattern[0];
936 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000939 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000940 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
941 ctx->pattern, ctx->ptr, ctx->pattern[0]));
942 if (ctx->ptr >= end ||
943 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
944 RETURN_FAILURE;
945 ctx->pattern++;
946 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000949 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000950 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
951 ctx->pattern, ctx->ptr, *ctx->pattern));
952 if (ctx->ptr >= end ||
953 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
954 RETURN_FAILURE;
955 ctx->pattern++;
956 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000957 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000958
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000959 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000960 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
961 if (ctx->ptr >= end
962 || !SRE_CHARSET(ctx->pattern+1,
963 (SRE_CODE)state->lower(*ctx->ptr)))
964 RETURN_FAILURE;
965 ctx->pattern += ctx->pattern[0];
966 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000967 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000968
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000969 case SRE_OP_JUMP:
970 case SRE_OP_INFO:
971 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000972 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000973 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
974 ctx->ptr, ctx->pattern[0]));
975 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000976 break;
977
978 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000979 /* alternation */
980 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000981 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000982 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000983 ctx->u.rep = state->repeat;
984 if (ctx->u.rep)
985 MARK_PUSH(ctx->lastmark);
986 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
987 if (ctx->pattern[1] == SRE_OP_LITERAL &&
988 (ctx->ptr >= end ||
989 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000990 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000991 if (ctx->pattern[1] == SRE_OP_IN &&
992 (ctx->ptr >= end ||
993 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000994 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000995 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000996 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000997 if (ret) {
998 if (ctx->u.rep)
999 MARK_POP_DISCARD(ctx->lastmark);
1000 RETURN_ON_ERROR(ret);
1001 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001002 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001003 if (ctx->u.rep)
1004 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001005 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001006 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001007 if (ctx->u.rep)
1008 MARK_POP_DISCARD(ctx->lastmark);
1009 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001010
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001011 case SRE_OP_REPEAT_ONE:
1012 /* match repeated sequence (maximizing regexp) */
1013
1014 /* this operator only works if the repeated item is
1015 exactly one character wide, and we're not already
1016 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001017 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018
1019 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1020
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001021 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1022 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001023
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 if (ctx->ptr + ctx->pattern[1] > end)
1025 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001028
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001029 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1030 RETURN_ON_ERROR(ret);
1031 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1032 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001033 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001034
1035 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037 string. check if the rest of the pattern matches,
1038 and backtrack if not. */
1039
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001040 if (ctx->count < (int) ctx->pattern[1])
1041 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001045 state->ptr = ctx->ptr;
1046 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001047 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001048
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001049 LASTMARK_SAVE();
1050
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001051 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001052 /* tail starts with a literal. skip positions where
1053 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055 for (;;) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001056 while (ctx->count >= (int) ctx->pattern[1] &&
1057 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1058 ctx->ptr--;
1059 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 if (ctx->count < (int) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001062 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1065 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001066 if (ret) {
1067 RETURN_ON_ERROR(ret);
1068 RETURN_SUCCESS;
1069 }
Tim Peters3d563502006-01-21 02:47:53 +00001070
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001071 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001072
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073 ctx->ptr--;
1074 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001075 }
1076
1077 } else {
1078 /* general case */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001079 while (ctx->count >= (int) ctx->pattern[1]) {
1080 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1082 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001083 if (ret) {
1084 RETURN_ON_ERROR(ret);
1085 RETURN_SUCCESS;
1086 }
1087 ctx->ptr--;
1088 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001089 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001090 }
1091 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001092 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001093
Guido van Rossum41c99e72003-04-14 17:59:34 +00001094 case SRE_OP_MIN_REPEAT_ONE:
1095 /* match repeated sequence (minimizing regexp) */
1096
1097 /* this operator only works if the repeated item is
1098 exactly one character wide, and we're not already
1099 collecting backtracking points. for other cases,
1100 use the MIN_REPEAT operator */
1101
1102 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1103
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001104 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1105 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001106
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001107 if (ctx->ptr + ctx->pattern[1] > end)
1108 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001111
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001112 if (ctx->pattern[1] == 0)
1113 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001114 else {
1115 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001116 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1117 RETURN_ON_ERROR(ret);
1118 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1119 if (ret < (int) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001120 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001121 RETURN_FAILURE;
1122 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001123 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001125 }
1126
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001129 state->ptr = ctx->ptr;
1130 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131
1132 } else {
1133 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001134 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001135 while ((int)ctx->pattern[2] == 65535
1136 || ctx->count <= (int)ctx->pattern[2]) {
1137 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001138 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1139 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 if (ret) {
1141 RETURN_ON_ERROR(ret);
1142 RETURN_SUCCESS;
1143 }
1144 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001145 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001147 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001149 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001150 assert(ret == 1);
1151 ctx->ptr++;
1152 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001153 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001154 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001155 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001156 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001157
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001158 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001159 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001160 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001161 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001162 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1163 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164
1165 /* install new repeat context */
Jack Diederich2d400772006-05-27 15:44:34 +00001166 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001167 ctx->u.rep->count = -1;
1168 ctx->u.rep->pattern = ctx->pattern;
1169 ctx->u.rep->prev = state->repeat;
1170 ctx->u.rep->last_ptr = NULL;
1171 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001172
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001173 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001174 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001175 state->repeat = ctx->u.rep->prev;
Jack Diederich2d400772006-05-27 15:44:34 +00001176 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001177
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 if (ret) {
1179 RETURN_ON_ERROR(ret);
1180 RETURN_SUCCESS;
1181 }
1182 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001183
1184 case SRE_OP_MAX_UNTIL:
1185 /* maximizing repeat */
1186 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1187
1188 /* FIXME: we probably need to deal with zero-width
1189 matches in here... */
1190
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001191 ctx->u.rep = state->repeat;
1192 if (!ctx->u.rep)
1193 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001194
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001196
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001198
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001199 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1200 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001201
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001205 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1206 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001207 if (ret) {
1208 RETURN_ON_ERROR(ret);
1209 RETURN_SUCCESS;
1210 }
1211 ctx->u.rep->count = ctx->count-1;
1212 state->ptr = ctx->ptr;
1213 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001214 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001215
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001216 if ((ctx->count < ctx->u.rep->pattern[2] ||
1217 ctx->u.rep->pattern[2] == 65535) &&
1218 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001219 /* we may have enough matches, but if we can
1220 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001221 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001222 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001223 MARK_PUSH(ctx->lastmark);
1224 /* zero-width match protection */
1225 DATA_PUSH(&ctx->u.rep->last_ptr);
1226 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1228 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001229 DATA_POP(&ctx->u.rep->last_ptr);
1230 if (ret) {
1231 MARK_POP_DISCARD(ctx->lastmark);
1232 RETURN_ON_ERROR(ret);
1233 RETURN_SUCCESS;
1234 }
1235 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001236 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001237 ctx->u.rep->count = ctx->count-1;
1238 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001239 }
1240
1241 /* cannot match more repeated items here. make sure the
1242 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001243 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001244 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001245 RETURN_ON_SUCCESS(ret);
1246 state->repeat = ctx->u.rep;
1247 state->ptr = ctx->ptr;
1248 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001249
1250 case SRE_OP_MIN_UNTIL:
1251 /* minimizing repeat */
1252 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1253
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001254 ctx->u.rep = state->repeat;
1255 if (!ctx->u.rep)
1256 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001257
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001258 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001259
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001260 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001261
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001262 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1263 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001264
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001266 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001267 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001268 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1269 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001270 if (ret) {
1271 RETURN_ON_ERROR(ret);
1272 RETURN_SUCCESS;
1273 }
1274 ctx->u.rep->count = ctx->count-1;
1275 state->ptr = ctx->ptr;
1276 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001277 }
1278
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001279 LASTMARK_SAVE();
1280
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001281 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001282 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001283 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001284 if (ret) {
1285 RETURN_ON_ERROR(ret);
1286 RETURN_SUCCESS;
1287 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001288
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001289 state->repeat = ctx->u.rep;
1290 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001291
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001292 LASTMARK_RESTORE();
1293
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001294 if (ctx->count >= ctx->u.rep->pattern[2]
1295 && ctx->u.rep->pattern[2] != 65535)
1296 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001297
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001298 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001299 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1300 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001301 if (ret) {
1302 RETURN_ON_ERROR(ret);
1303 RETURN_SUCCESS;
1304 }
1305 ctx->u.rep->count = ctx->count-1;
1306 state->ptr = ctx->ptr;
1307 RETURN_FAILURE;
1308
1309 case SRE_OP_GROUPREF:
1310 /* match backreference */
1311 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1312 ctx->ptr, ctx->pattern[0]));
1313 i = ctx->pattern[0];
1314 {
1315 int groupref = i+i;
1316 if (groupref >= state->lastmark) {
1317 RETURN_FAILURE;
1318 } else {
1319 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1320 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1321 if (!p || !e || e < p)
1322 RETURN_FAILURE;
1323 while (p < e) {
1324 if (ctx->ptr >= end || *ctx->ptr != *p)
1325 RETURN_FAILURE;
1326 p++; ctx->ptr++;
1327 }
1328 }
1329 }
1330 ctx->pattern++;
1331 break;
1332
1333 case SRE_OP_GROUPREF_IGNORE:
1334 /* match backreference */
1335 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1336 ctx->ptr, ctx->pattern[0]));
1337 i = ctx->pattern[0];
1338 {
1339 int groupref = i+i;
1340 if (groupref >= state->lastmark) {
1341 RETURN_FAILURE;
1342 } else {
1343 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1344 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1345 if (!p || !e || e < p)
1346 RETURN_FAILURE;
1347 while (p < e) {
1348 if (ctx->ptr >= end ||
1349 state->lower(*ctx->ptr) != state->lower(*p))
1350 RETURN_FAILURE;
1351 p++; ctx->ptr++;
1352 }
1353 }
1354 }
1355 ctx->pattern++;
1356 break;
1357
1358 case SRE_OP_GROUPREF_EXISTS:
1359 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1360 ctx->ptr, ctx->pattern[0]));
1361 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1362 i = ctx->pattern[0];
1363 {
1364 int groupref = i+i;
1365 if (groupref >= state->lastmark) {
1366 ctx->pattern += ctx->pattern[1];
1367 break;
1368 } else {
1369 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1370 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1371 if (!p || !e || e < p) {
1372 ctx->pattern += ctx->pattern[1];
1373 break;
1374 }
1375 }
1376 }
1377 ctx->pattern += 2;
1378 break;
1379
1380 case SRE_OP_ASSERT:
1381 /* assert subpattern */
1382 /* <ASSERT> <skip> <back> <pattern> */
1383 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1384 ctx->ptr, ctx->pattern[1]));
1385 state->ptr = ctx->ptr - ctx->pattern[1];
1386 if (state->ptr < state->beginning)
1387 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001388 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001389 RETURN_ON_FAILURE(ret);
1390 ctx->pattern += ctx->pattern[0];
1391 break;
1392
1393 case SRE_OP_ASSERT_NOT:
1394 /* assert not subpattern */
1395 /* <ASSERT_NOT> <skip> <back> <pattern> */
1396 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1397 ctx->ptr, ctx->pattern[1]));
1398 state->ptr = ctx->ptr - ctx->pattern[1];
1399 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001400 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001401 if (ret) {
1402 RETURN_ON_ERROR(ret);
1403 RETURN_FAILURE;
1404 }
1405 }
1406 ctx->pattern += ctx->pattern[0];
1407 break;
1408
1409 case SRE_OP_FAILURE:
1410 /* immediate failure */
1411 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1412 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001413
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001414 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001415 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1416 ctx->pattern[-1]));
1417 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001418 }
1419 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001420
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001421exit:
1422 ctx_pos = ctx->last_ctx_pos;
1423 jump = ctx->jump;
1424 DATA_POP_DISCARD(ctx);
1425 if (ctx_pos == -1)
1426 return ret;
1427 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1428
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001429 switch (jump) {
1430 case JUMP_MAX_UNTIL_2:
1431 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1432 goto jump_max_until_2;
1433 case JUMP_MAX_UNTIL_3:
1434 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1435 goto jump_max_until_3;
1436 case JUMP_MIN_UNTIL_2:
1437 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1438 goto jump_min_until_2;
1439 case JUMP_MIN_UNTIL_3:
1440 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1441 goto jump_min_until_3;
1442 case JUMP_BRANCH:
1443 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1444 goto jump_branch;
1445 case JUMP_MAX_UNTIL_1:
1446 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1447 goto jump_max_until_1;
1448 case JUMP_MIN_UNTIL_1:
1449 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1450 goto jump_min_until_1;
1451 case JUMP_REPEAT:
1452 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1453 goto jump_repeat;
1454 case JUMP_REPEAT_ONE_1:
1455 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1456 goto jump_repeat_one_1;
1457 case JUMP_REPEAT_ONE_2:
1458 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1459 goto jump_repeat_one_2;
1460 case JUMP_MIN_REPEAT_ONE:
1461 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1462 goto jump_min_repeat_one;
1463 case JUMP_ASSERT:
1464 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1465 goto jump_assert;
1466 case JUMP_ASSERT_NOT:
1467 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1468 goto jump_assert_not;
1469 case JUMP_NONE:
1470 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1471 break;
1472 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001473
1474 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001475}
1476
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001477LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001478SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1479{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001480 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1481 SRE_CHAR* end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001482 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001483 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001484 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001485 SRE_CODE* prefix = NULL;
1486 SRE_CODE* charset = NULL;
1487 SRE_CODE* overlap = NULL;
1488 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001489
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001490 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001491 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001492 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001493
1494 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001495
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001496 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001497 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001498 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001499 end -= pattern[3]-1;
1500 if (end <= ptr)
1501 end = ptr+1;
1502 }
1503
Fredrik Lundh3562f112000-07-02 12:00:07 +00001504 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001505 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001506 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001507 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001508 prefix_skip = pattern[6];
1509 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001510 overlap = prefix + prefix_len - 1;
1511 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001512 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001513 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001514 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001515
1516 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001517 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001518
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001519 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1520 TRACE(("charset = %p\n", charset));
1521
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001522#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001523 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001524 /* pattern starts with a known prefix. use the overlap
1525 table to skip forward as fast as we possibly can */
1526 int i = 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001527 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001528 while (ptr < end) {
1529 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001530 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001531 if (!i)
1532 break;
1533 else
1534 i = overlap[i];
1535 } else {
1536 if (++i == prefix_len) {
1537 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001538 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1539 state->start = ptr + 1 - prefix_len;
1540 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001541 if (flags & SRE_INFO_LITERAL)
1542 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001543 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001544 if (status != 0)
1545 return status;
1546 /* close but no cigar -- try again */
1547 i = overlap[i];
1548 }
1549 break;
1550 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001551 }
1552 ptr++;
1553 }
1554 return 0;
1555 }
1556#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001557
Fredrik Lundh3562f112000-07-02 12:00:07 +00001558 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001559 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001560 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001561 SRE_CODE chr = pattern[1];
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001562 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 for (;;) {
1564 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1565 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001566 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001567 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001568 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001569 state->start = ptr;
1570 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001571 if (flags & SRE_INFO_LITERAL)
1572 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001573 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001574 if (status != 0)
1575 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001576 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001577 } else if (charset) {
1578 /* pattern starts with a character from a known set */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001579 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001581 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001582 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001583 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001585 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 state->start = ptr;
1587 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001588 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001589 if (status != 0)
1590 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001591 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001592 }
1593 } else
1594 /* general case */
1595 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001596 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001597 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001598 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 if (status != 0)
1600 break;
1601 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001602
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001603 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001604}
Tim Peters3d563502006-01-21 02:47:53 +00001605
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001606LOCAL(int)
1607SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1608{
1609 /* check if given string is a literal template (i.e. no escapes) */
1610 while (len-- > 0)
1611 if (*ptr++ == '\\')
1612 return 0;
1613 return 1;
1614}
Guido van Rossumb700df92000-03-31 14:59:30 +00001615
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001616#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001617
1618/* -------------------------------------------------------------------- */
1619/* factories and destructors */
1620
1621/* see sre.h for object declarations */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001622static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1623static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001624
1625static PyObject *
Georg Brandl964f5972006-05-28 22:38:57 +00001626sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001627{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001628 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001629}
1630
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001631static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001632sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001633{
1634 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001635 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001636 return NULL;
1637 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001638 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001639 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001640#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001641 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001642#else
1643 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001644#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001645 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001646}
1647
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001648LOCAL(void)
1649state_reset(SRE_STATE* state)
1650{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001651 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001652 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001653
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001654 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655 state->lastindex = -1;
1656
1657 state->repeat = NULL;
1658
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001659 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001660}
1661
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001662static void*
1663getstring(PyObject* string, int* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001664{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001665 /* given a python object, return a data pointer, a length (in
1666 characters), and a character size. return NULL if the object
1667 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001668
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001669 PyBufferProcs *buffer;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001670 int size, bytes, charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001671 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001672
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001673#if defined(HAVE_UNICODE)
1674 if (PyUnicode_Check(string)) {
1675 /* unicode strings doesn't always support the buffer interface */
1676 ptr = (void*) PyUnicode_AS_DATA(string);
1677 bytes = PyUnicode_GET_DATA_SIZE(string);
1678 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001679 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001680
1681 } else {
1682#endif
1683
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001684 /* get pointer to string buffer */
1685 buffer = string->ob_type->tp_as_buffer;
1686 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1687 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001688 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001689 return NULL;
1690 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001691
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001692 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001693 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1694 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001695 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1696 return NULL;
1697 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001698
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001699 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001700#if PY_VERSION_HEX >= 0x01060000
1701 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001702#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001703 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001704#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001705
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001706 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001707 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001708#if defined(HAVE_UNICODE)
1709 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001710 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001711#endif
1712 else {
1713 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1714 return NULL;
1715 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001716
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001717#if defined(HAVE_UNICODE)
1718 }
1719#endif
1720
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001721 *p_length = size;
1722 *p_charsize = charsize;
1723
1724 return ptr;
1725}
1726
1727LOCAL(PyObject*)
1728state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1729 int start, int end)
1730{
1731 /* prepare state object */
1732
1733 int length;
1734 int charsize;
1735 void* ptr;
1736
1737 memset(state, 0, sizeof(SRE_STATE));
1738
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001739 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001740 state->lastindex = -1;
1741
1742 ptr = getstring(string, &length, &charsize);
1743 if (!ptr)
1744 return NULL;
1745
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001746 /* adjust boundaries */
1747 if (start < 0)
1748 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001749 else if (start > length)
1750 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001751
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001752 if (end < 0)
1753 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001754 else if (end > length)
1755 end = length;
1756
1757 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001758
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001759 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001760
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001761 state->start = (void*) ((char*) ptr + start * state->charsize);
1762 state->end = (void*) ((char*) ptr + end * state->charsize);
1763
1764 Py_INCREF(string);
1765 state->string = string;
1766 state->pos = start;
1767 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001768
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001769 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001770 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001771 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001772#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001773 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001774#else
1775 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001776#endif
1777 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001778 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001779
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001780 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001781}
1782
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001783LOCAL(void)
1784state_fini(SRE_STATE* state)
1785{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001786 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001787 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001788}
1789
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001790/* calculate offset from start of string */
1791#define STATE_OFFSET(state, member)\
1792 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1793
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001794LOCAL(PyObject*)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001795state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001796{
Fredrik Lundh58100642000-08-09 09:14:35 +00001797 int i, j;
1798
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001799 index = (index - 1) * 2;
1800
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001801 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001802 if (empty)
1803 /* want empty string */
1804 i = j = 0;
1805 else {
1806 Py_INCREF(Py_None);
1807 return Py_None;
1808 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001809 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001810 i = STATE_OFFSET(state, state->mark[index]);
1811 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001812 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001813
Fredrik Lundh58100642000-08-09 09:14:35 +00001814 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001815}
1816
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001817static void
1818pattern_error(int status)
1819{
1820 switch (status) {
1821 case SRE_ERROR_RECURSION_LIMIT:
1822 PyErr_SetString(
1823 PyExc_RuntimeError,
1824 "maximum recursion limit exceeded"
1825 );
1826 break;
1827 case SRE_ERROR_MEMORY:
1828 PyErr_NoMemory();
1829 break;
1830 default:
1831 /* other error codes indicate compiler/engine bugs */
1832 PyErr_SetString(
1833 PyExc_RuntimeError,
1834 "internal error in regular expression engine"
1835 );
1836 }
1837}
1838
Guido van Rossumb700df92000-03-31 14:59:30 +00001839static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001840pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001841{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001842 if (self->weakreflist != NULL)
1843 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001844 Py_XDECREF(self->pattern);
1845 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001846 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001847 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001848}
1849
1850static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001851pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001852{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001853 SRE_STATE state;
1854 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001855
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001856 PyObject* string;
1857 int start = 0;
1858 int end = INT_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001859 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00001860 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
1861 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001862 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001863
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001864 string = state_init(&state, self, string, start, end);
1865 if (!string)
1866 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001867
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001868 state.ptr = state.start;
1869
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001870 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1871
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001872 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001873 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001874 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001875#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001876 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001877#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001878 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001879
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001880 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1881
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001882 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001883
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001884 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001885}
1886
1887static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001888pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001889{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001890 SRE_STATE state;
1891 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001892
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001893 PyObject* string;
1894 int start = 0;
1895 int end = INT_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001896 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00001897 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
1898 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001899 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001900
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001901 string = state_init(&state, self, string, start, end);
1902 if (!string)
1903 return NULL;
1904
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001905 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1906
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001907 if (state.charsize == 1) {
1908 status = sre_search(&state, PatternObject_GetCode(self));
1909 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001910#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001911 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001912#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001913 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001914
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001915 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1916
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001917 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001918
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001919 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001920}
1921
1922static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001923call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001924{
1925 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001926 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001927 PyObject* func;
1928 PyObject* result;
1929
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001930 if (!args)
1931 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001932 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001933 if (!name)
1934 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001935 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001936 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001937 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001938 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001939 func = PyObject_GetAttrString(mod, function);
1940 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001941 if (!func)
1942 return NULL;
1943 result = PyObject_CallObject(func, args);
1944 Py_DECREF(func);
1945 Py_DECREF(args);
1946 return result;
1947}
1948
#ifdef USE_BUILTIN_COPY
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* replace *object with copy.deepcopy(*object, memo).  returns 1
       on success; on failure returns 0 and leaves *object unchanged */

    PyObject* packed = PyTuple_Pack(2, *object, memo);
    PyObject* duplicate = call("copy", "deepcopy", packed);

    if (duplicate == NULL)
        return 0;

    /* drop the reference to the old object and install the copy */
    Py_DECREF(*object);
    *object = duplicate;

    return 1; /* success */
}
#endif
1968
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001969static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00001970join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001971{
1972 /* join list elements */
1973
1974 PyObject* joiner;
1975#if PY_VERSION_HEX >= 0x01060000
1976 PyObject* function;
1977 PyObject* args;
1978#endif
1979 PyObject* result;
1980
1981 switch (PyList_GET_SIZE(list)) {
1982 case 0:
1983 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00001984 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001985 case 1:
1986 result = PyList_GET_ITEM(list, 0);
1987 Py_INCREF(result);
1988 Py_DECREF(list);
1989 return result;
1990 }
1991
1992 /* two or more elements: slice out a suitable separator from the
1993 first member, and use that to join the entire list */
1994
1995 joiner = PySequence_GetSlice(pattern, 0, 0);
1996 if (!joiner)
1997 return NULL;
1998
1999#if PY_VERSION_HEX >= 0x01060000
2000 function = PyObject_GetAttrString(joiner, "join");
2001 if (!function) {
2002 Py_DECREF(joiner);
2003 return NULL;
2004 }
2005 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002006 if (!args) {
2007 Py_DECREF(function);
2008 Py_DECREF(joiner);
2009 return NULL;
2010 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002011 PyTuple_SET_ITEM(args, 0, list);
2012 result = PyObject_CallObject(function, args);
2013 Py_DECREF(args); /* also removes list */
2014 Py_DECREF(function);
2015#else
2016 result = call(
2017 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002018 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002019 );
2020#endif
2021 Py_DECREF(joiner);
2022
2023 return result;
2024}
2025
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002026static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002027pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002028{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002029 SRE_STATE state;
2030 PyObject* list;
2031 int status;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002032 int i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002033
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002034 PyObject* string;
2035 int start = 0;
2036 int end = INT_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002037 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002038 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2039 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002040 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002041
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002042 string = state_init(&state, self, string, start, end);
2043 if (!string)
2044 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002045
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002046 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002047 if (!list) {
2048 state_fini(&state);
2049 return NULL;
2050 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002051
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002052 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002053
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002054 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002055
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002056 state_reset(&state);
2057
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002058 state.ptr = state.start;
2059
2060 if (state.charsize == 1) {
2061 status = sre_search(&state, PatternObject_GetCode(self));
2062 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002063#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002064 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002065#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002066 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002067
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002068 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002069 if (status == 0)
2070 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002071 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002072 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 }
Tim Peters3d563502006-01-21 02:47:53 +00002074
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002075 /* don't bother to build a match object */
2076 switch (self->groups) {
2077 case 0:
2078 b = STATE_OFFSET(&state, state.start);
2079 e = STATE_OFFSET(&state, state.ptr);
2080 item = PySequence_GetSlice(string, b, e);
2081 if (!item)
2082 goto error;
2083 break;
2084 case 1:
2085 item = state_getslice(&state, 1, string, 1);
2086 if (!item)
2087 goto error;
2088 break;
2089 default:
2090 item = PyTuple_New(self->groups);
2091 if (!item)
2092 goto error;
2093 for (i = 0; i < self->groups; i++) {
2094 PyObject* o = state_getslice(&state, i+1, string, 1);
2095 if (!o) {
2096 Py_DECREF(item);
2097 goto error;
2098 }
2099 PyTuple_SET_ITEM(item, i, o);
2100 }
2101 break;
2102 }
2103
2104 status = PyList_Append(list, item);
2105 Py_DECREF(item);
2106 if (status < 0)
2107 goto error;
2108
2109 if (state.ptr == state.start)
2110 state.start = (void*) ((char*) state.ptr + state.charsize);
2111 else
2112 state.start = state.ptr;
2113
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002114 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002115
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002116 state_fini(&state);
2117 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002118
2119error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002120 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002121 state_fini(&state);
2122 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002123
Guido van Rossumb700df92000-03-31 14:59:30 +00002124}
2125
#if PY_VERSION_HEX >= 0x02020000
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    /* Build a scanner for the given arguments and wrap its bound
       "search" method in a call-iterator that stops when the method
       returns None. */

    PyObject* iterator;
    PyObject* search;
    PyObject* scanner = pattern_scanner(pattern, args);

    if (!scanner)
        return NULL;

    /* the bound method is all we need from the scanner, so our own
       scanner reference can be dropped right away */
    search = PyObject_GetAttrString(scanner, "search");
    Py_DECREF(scanner);
    if (!search)
        return NULL;

    iterator = PyCallIter_New(search, Py_None);
    Py_DECREF(search);

    return iterator;
}
#endif
2149
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002150static PyObject*
2151pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2152{
2153 SRE_STATE state;
2154 PyObject* list;
2155 PyObject* item;
2156 int status;
2157 int n;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002158 int i;
2159 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002160
2161 PyObject* string;
2162 int maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002163 static char* kwlist[] = { "source", "maxsplit", NULL };
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002164 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2165 &string, &maxsplit))
2166 return NULL;
2167
2168 string = state_init(&state, self, string, 0, INT_MAX);
2169 if (!string)
2170 return NULL;
2171
2172 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002173 if (!list) {
2174 state_fini(&state);
2175 return NULL;
2176 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002177
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002178 n = 0;
2179 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002180
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002181 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002182
2183 state_reset(&state);
2184
2185 state.ptr = state.start;
2186
2187 if (state.charsize == 1) {
2188 status = sre_search(&state, PatternObject_GetCode(self));
2189 } else {
2190#if defined(HAVE_UNICODE)
2191 status = sre_usearch(&state, PatternObject_GetCode(self));
2192#endif
2193 }
2194
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002195 if (status <= 0) {
2196 if (status == 0)
2197 break;
2198 pattern_error(status);
2199 goto error;
2200 }
Tim Peters3d563502006-01-21 02:47:53 +00002201
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002202 if (state.start == state.ptr) {
2203 if (last == state.end)
2204 break;
2205 /* skip one character */
2206 state.start = (void*) ((char*) state.ptr + state.charsize);
2207 continue;
2208 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002209
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002210 /* get segment before this match */
2211 item = PySequence_GetSlice(
2212 string, STATE_OFFSET(&state, last),
2213 STATE_OFFSET(&state, state.start)
2214 );
2215 if (!item)
2216 goto error;
2217 status = PyList_Append(list, item);
2218 Py_DECREF(item);
2219 if (status < 0)
2220 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002221
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002222 /* add groups (if any) */
2223 for (i = 0; i < self->groups; i++) {
2224 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002225 if (!item)
2226 goto error;
2227 status = PyList_Append(list, item);
2228 Py_DECREF(item);
2229 if (status < 0)
2230 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002231 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002232
2233 n = n + 1;
2234
2235 last = state.start = state.ptr;
2236
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002237 }
2238
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002239 /* get segment following last match (even if empty) */
2240 item = PySequence_GetSlice(
2241 string, STATE_OFFSET(&state, last), state.endpos
2242 );
2243 if (!item)
2244 goto error;
2245 status = PyList_Append(list, item);
2246 Py_DECREF(item);
2247 if (status < 0)
2248 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002249
2250 state_fini(&state);
2251 return list;
2252
2253error:
2254 Py_DECREF(list);
2255 state_fini(&state);
2256 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002257
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002258}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002259
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002260static PyObject*
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002261pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002262 int count, int subn)
2263{
2264 SRE_STATE state;
2265 PyObject* list;
2266 PyObject* item;
2267 PyObject* filter;
2268 PyObject* args;
2269 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002270 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002271 int status;
2272 int n;
2273 int i, b, e;
2274 int filter_is_callable;
2275
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002276 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002277 /* sub/subn takes either a function or a template */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002278 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002279 Py_INCREF(filter);
2280 filter_is_callable = 1;
2281 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002282 /* if not callable, check if it's a literal string */
2283 int literal;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002284 ptr = getstring(ptemplate, &n, &b);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002285 if (ptr) {
2286 if (b == 1) {
Skip Montanaro816a1622006-04-18 11:53:09 +00002287 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002288 } else {
2289#if defined(HAVE_UNICODE)
Skip Montanaro816a1622006-04-18 11:53:09 +00002290 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002291#endif
2292 }
2293 } else {
2294 PyErr_Clear();
2295 literal = 0;
2296 }
2297 if (literal) {
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002298 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002299 Py_INCREF(filter);
2300 filter_is_callable = 0;
2301 } else {
2302 /* not a literal; hand it over to the template compiler */
2303 filter = call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00002304 SRE_PY_MODULE, "_subx",
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002305 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002306 );
2307 if (!filter)
2308 return NULL;
2309 filter_is_callable = PyCallable_Check(filter);
2310 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002311 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002312
2313 string = state_init(&state, self, string, 0, INT_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002314 if (!string) {
2315 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002316 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002317 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002318
2319 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002320 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002321 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002322 state_fini(&state);
2323 return NULL;
2324 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002325
2326 n = i = 0;
2327
2328 while (!count || n < count) {
2329
2330 state_reset(&state);
2331
2332 state.ptr = state.start;
2333
2334 if (state.charsize == 1) {
2335 status = sre_search(&state, PatternObject_GetCode(self));
2336 } else {
2337#if defined(HAVE_UNICODE)
2338 status = sre_usearch(&state, PatternObject_GetCode(self));
2339#endif
2340 }
2341
2342 if (status <= 0) {
2343 if (status == 0)
2344 break;
2345 pattern_error(status);
2346 goto error;
2347 }
Tim Peters3d563502006-01-21 02:47:53 +00002348
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002349 b = STATE_OFFSET(&state, state.start);
2350 e = STATE_OFFSET(&state, state.ptr);
2351
2352 if (i < b) {
2353 /* get segment before this match */
2354 item = PySequence_GetSlice(string, i, b);
2355 if (!item)
2356 goto error;
2357 status = PyList_Append(list, item);
2358 Py_DECREF(item);
2359 if (status < 0)
2360 goto error;
2361
2362 } else if (i == b && i == e && n > 0)
2363 /* ignore empty match on latest position */
2364 goto next;
2365
2366 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002367 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002368 match = pattern_new_match(self, &state, 1);
2369 if (!match)
2370 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002371 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002372 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002373 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002374 goto error;
2375 }
2376 item = PyObject_CallObject(filter, args);
2377 Py_DECREF(args);
2378 Py_DECREF(match);
2379 if (!item)
2380 goto error;
2381 } else {
2382 /* filter is literal string */
2383 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002384 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002385 }
2386
2387 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002388 if (item != Py_None) {
2389 status = PyList_Append(list, item);
2390 Py_DECREF(item);
2391 if (status < 0)
2392 goto error;
2393 }
Tim Peters3d563502006-01-21 02:47:53 +00002394
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002395 i = e;
2396 n = n + 1;
2397
2398next:
2399 /* move on */
2400 if (state.ptr == state.start)
2401 state.start = (void*) ((char*) state.ptr + state.charsize);
2402 else
2403 state.start = state.ptr;
2404
2405 }
2406
2407 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002408 if (i < state.endpos) {
2409 item = PySequence_GetSlice(string, i, state.endpos);
2410 if (!item)
2411 goto error;
2412 status = PyList_Append(list, item);
2413 Py_DECREF(item);
2414 if (status < 0)
2415 goto error;
2416 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002417
2418 state_fini(&state);
2419
Guido van Rossum4e173842001-12-07 04:25:10 +00002420 Py_DECREF(filter);
2421
Fredrik Lundhdac58492001-10-21 21:48:30 +00002422 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002423 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002424
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002425 if (!item)
2426 return NULL;
2427
2428 if (subn)
2429 return Py_BuildValue("Ni", item, n);
2430
2431 return item;
2432
2433error:
2434 Py_DECREF(list);
2435 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002436 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002437 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002438
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002439}
2440
2441static PyObject*
2442pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2443{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002444 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002445 PyObject* string;
2446 int count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002447 static char* kwlist[] = { "repl", "string", "count", NULL };
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002448 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002449 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002450 return NULL;
2451
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002452 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002453}
2454
2455static PyObject*
2456pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2457{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002458 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002459 PyObject* string;
2460 int count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002461 static char* kwlist[] = { "repl", "string", "count", NULL };
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002462 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002463 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002464 return NULL;
2465
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002466 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002467}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002468
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002469static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00002470pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002471{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002472#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002473 PatternObject* copy;
2474 int offset;
2475
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002476 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2477 if (!copy)
2478 return NULL;
2479
2480 offset = offsetof(PatternObject, groups);
2481
2482 Py_XINCREF(self->groupindex);
2483 Py_XINCREF(self->indexgroup);
2484 Py_XINCREF(self->pattern);
2485
2486 memcpy((char*) copy + offset, (char*) self + offset,
2487 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002488 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002489
2490 return (PyObject*) copy;
2491#else
2492 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2493 return NULL;
2494#endif
2495}
2496
2497static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00002498pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002499{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002500#ifdef USE_BUILTIN_COPY
2501 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002502
Georg Brandlfbef5882006-05-28 22:14:04 +00002503 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002504 if (!copy)
2505 return NULL;
2506
2507 if (!deepcopy(&copy->groupindex, memo) ||
2508 !deepcopy(&copy->indexgroup, memo) ||
2509 !deepcopy(&copy->pattern, memo)) {
2510 Py_DECREF(copy);
2511 return NULL;
2512 }
2513
2514#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002515 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2516 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002517#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002518}
2519
Raymond Hettinger94478742004-09-24 04:31:19 +00002520PyDoc_STRVAR(pattern_match_doc,
2521"match(string[, pos[, endpos]]) --> match object or None.\n\
2522 Matches zero or more characters at the beginning of the string");
2523
2524PyDoc_STRVAR(pattern_search_doc,
2525"search(string[, pos[, endpos]]) --> match object or None.\n\
2526 Scan through string looking for a match, and return a corresponding\n\
2527 MatchObject instance. Return None if no position in the string matches.");
2528
2529PyDoc_STRVAR(pattern_split_doc,
2530"split(string[, maxsplit = 0]) --> list.\n\
2531 Split string by the occurrences of pattern.");
2532
2533PyDoc_STRVAR(pattern_findall_doc,
2534"findall(string[, pos[, endpos]]) --> list.\n\
2535 Return a list of all non-overlapping matches of pattern in string.");
2536
2537PyDoc_STRVAR(pattern_finditer_doc,
2538"finditer(string[, pos[, endpos]]) --> iterator.\n\
2539 Return an iterator over all non-overlapping matches for the \n\
2540 RE pattern in string. For each match, the iterator returns a\n\
2541 match object.");
2542
2543PyDoc_STRVAR(pattern_sub_doc,
2544"sub(repl, string[, count = 0]) --> newstring\n\
2545 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002546 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002547
2548PyDoc_STRVAR(pattern_subn_doc,
2549"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2550 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2551 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002552 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002553
2554PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2555
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002556static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002557 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002558 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002559 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002560 pattern_search_doc},
2561 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2562 pattern_sub_doc},
2563 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2564 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002565 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002566 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002567 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002568 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002569#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002570 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2571 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002572#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002573 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Georg Brandlfbef5882006-05-28 22:14:04 +00002574 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2575 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002576 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002577};
2578
Tim Peters3d563502006-01-21 02:47:53 +00002579static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002580pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002581{
2582 PyObject* res;
2583
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002584 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002585
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002586 if (res)
2587 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002588
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002589 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002590
2591 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002592 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002593 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002594 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002595 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002596
2597 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002598 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002599
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002600 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002601 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002602
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002603 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002604 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002605 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002606 }
2607
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002608 PyErr_SetString(PyExc_AttributeError, name);
2609 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002610}
2611
2612statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002613 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002614 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002615 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002616 (destructor)pattern_dealloc, /*tp_dealloc*/
2617 0, /*tp_print*/
Raymond Hettinger027bb632004-05-31 03:09:25 +00002618 (getattrfunc)pattern_getattr, /*tp_getattr*/
2619 0, /* tp_setattr */
2620 0, /* tp_compare */
2621 0, /* tp_repr */
2622 0, /* tp_as_number */
2623 0, /* tp_as_sequence */
2624 0, /* tp_as_mapping */
2625 0, /* tp_hash */
2626 0, /* tp_call */
2627 0, /* tp_str */
2628 0, /* tp_getattro */
2629 0, /* tp_setattro */
2630 0, /* tp_as_buffer */
2631 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002632 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002633 0, /* tp_traverse */
2634 0, /* tp_clear */
2635 0, /* tp_richcompare */
2636 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002637};
2638
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002639static PyObject *
2640_compile(PyObject* self_, PyObject* args)
2641{
2642 /* "compile" pattern descriptor to pattern object */
2643
2644 PatternObject* self;
2645 int i, n;
2646
2647 PyObject* pattern;
2648 int flags = 0;
2649 PyObject* code;
2650 int groups = 0;
2651 PyObject* groupindex = NULL;
2652 PyObject* indexgroup = NULL;
2653 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
2654 &PyList_Type, &code, &groups,
2655 &groupindex, &indexgroup))
2656 return NULL;
2657
2658 n = PyList_GET_SIZE(code);
2659
2660 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2661 if (!self)
2662 return NULL;
2663
2664 self->codesize = n;
2665
2666 for (i = 0; i < n; i++) {
2667 PyObject *o = PyList_GET_ITEM(code, i);
2668 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2669 : PyLong_AsUnsignedLong(o);
2670 self->code[i] = (SRE_CODE) value;
2671 if ((unsigned long) self->code[i] != value) {
2672 PyErr_SetString(PyExc_OverflowError,
2673 "regular expression code size limit exceeded");
2674 break;
2675 }
2676 }
2677
2678 if (PyErr_Occurred()) {
2679 PyObject_DEL(self);
2680 return NULL;
2681 }
2682
2683 Py_INCREF(pattern);
2684 self->pattern = pattern;
2685
2686 self->flags = flags;
2687
2688 self->groups = groups;
2689
2690 Py_XINCREF(groupindex);
2691 self->groupindex = groupindex;
2692
2693 Py_XINCREF(indexgroup);
2694 self->indexgroup = indexgroup;
2695
2696 self->weakreflist = NULL;
2697
2698 return (PyObject*) self;
2699}
2700
Guido van Rossumb700df92000-03-31 14:59:30 +00002701/* -------------------------------------------------------------------- */
2702/* match methods */
2703
2704static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002705match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002706{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002707 Py_XDECREF(self->regs);
2708 Py_XDECREF(self->string);
2709 Py_DECREF(self->pattern);
2710 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002711}
2712
2713static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002714match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002715{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002716 if (index < 0 || index >= self->groups) {
2717 /* raise IndexError if we were given a bad group number */
2718 PyErr_SetString(
2719 PyExc_IndexError,
2720 "no such group"
2721 );
2722 return NULL;
2723 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002724
Fredrik Lundh6f013982000-07-03 18:44:21 +00002725 index *= 2;
2726
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002727 if (self->string == Py_None || self->mark[index] < 0) {
2728 /* return default value if the string or group is undefined */
2729 Py_INCREF(def);
2730 return def;
2731 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002732
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002733 return PySequence_GetSlice(
2734 self->string, self->mark[index], self->mark[index+1]
2735 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002736}
2737
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002738static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002739match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002740{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002741 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002742
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002743 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002744 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002745
Fredrik Lundh6f013982000-07-03 18:44:21 +00002746 i = -1;
2747
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002748 if (self->pattern->groupindex) {
2749 index = PyObject_GetItem(self->pattern->groupindex, index);
2750 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002751 if (PyInt_Check(index))
2752 i = (int) PyInt_AS_LONG(index);
2753 Py_DECREF(index);
2754 } else
2755 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002756 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002757
2758 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002759}
2760
2761static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002762match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002763{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002764 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002765}
2766
2767static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00002768match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002769{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002770 /* delegate to Python code */
2771 return call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00002772 SRE_PY_MODULE, "_expand",
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002773 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002774 );
2775}
2776
2777static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002778match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002779{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002780 PyObject* result;
2781 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002782
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002783 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002784
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002785 switch (size) {
2786 case 0:
2787 result = match_getslice(self, Py_False, Py_None);
2788 break;
2789 case 1:
2790 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2791 break;
2792 default:
2793 /* fetch multiple items */
2794 result = PyTuple_New(size);
2795 if (!result)
2796 return NULL;
2797 for (i = 0; i < size; i++) {
2798 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002799 self, PyTuple_GET_ITEM(args, i), Py_None
2800 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002801 if (!item) {
2802 Py_DECREF(result);
2803 return NULL;
2804 }
2805 PyTuple_SET_ITEM(result, i, item);
2806 }
2807 break;
2808 }
2809 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002810}
2811
2812static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002813match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002814{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002815 PyObject* result;
2816 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002817
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002818 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002819 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002820 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002821 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002822
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002823 result = PyTuple_New(self->groups-1);
2824 if (!result)
2825 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002826
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002827 for (index = 1; index < self->groups; index++) {
2828 PyObject* item;
2829 item = match_getslice_by_index(self, index, def);
2830 if (!item) {
2831 Py_DECREF(result);
2832 return NULL;
2833 }
2834 PyTuple_SET_ITEM(result, index-1, item);
2835 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002836
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002837 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002838}
2839
2840static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002841match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002842{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002843 PyObject* result;
2844 PyObject* keys;
2845 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002846
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002847 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002848 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002849 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002850 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002851
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002852 result = PyDict_New();
2853 if (!result || !self->pattern->groupindex)
2854 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002855
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002856 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002857 if (!keys)
2858 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002859
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002860 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002861 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002862 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002863 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002864 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002865 if (!key)
2866 goto failed;
2867 value = match_getslice(self, key, def);
2868 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002869 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002870 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002871 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002872 status = PyDict_SetItem(result, key, value);
2873 Py_DECREF(value);
2874 if (status < 0)
2875 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002876 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002877
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002878 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002879
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002880 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002881
2882failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00002883 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002884 Py_DECREF(result);
2885 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002886}
2887
2888static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002889match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002890{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002891 int index;
2892
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002893 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00002894 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002895 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002896
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002897 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002898
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002899 if (index < 0 || index >= self->groups) {
2900 PyErr_SetString(
2901 PyExc_IndexError,
2902 "no such group"
2903 );
2904 return NULL;
2905 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002906
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002907 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002908 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002909}
2910
2911static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002912match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002913{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002914 int index;
2915
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002916 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00002917 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002918 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002919
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002920 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002921
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002922 if (index < 0 || index >= self->groups) {
2923 PyErr_SetString(
2924 PyExc_IndexError,
2925 "no such group"
2926 );
2927 return NULL;
2928 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002929
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002930 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002931 return Py_BuildValue("i", self->mark[index*2+1]);
2932}
2933
2934LOCAL(PyObject*)
2935_pair(int i1, int i2)
2936{
2937 PyObject* pair;
2938 PyObject* item;
2939
2940 pair = PyTuple_New(2);
2941 if (!pair)
2942 return NULL;
2943
2944 item = PyInt_FromLong(i1);
2945 if (!item)
2946 goto error;
2947 PyTuple_SET_ITEM(pair, 0, item);
2948
2949 item = PyInt_FromLong(i2);
2950 if (!item)
2951 goto error;
2952 PyTuple_SET_ITEM(pair, 1, item);
2953
2954 return pair;
2955
2956 error:
2957 Py_DECREF(pair);
2958 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002959}
2960
2961static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002962match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002963{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002964 int index;
2965
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002966 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00002967 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002968 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002969
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002970 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002971
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002972 if (index < 0 || index >= self->groups) {
2973 PyErr_SetString(
2974 PyExc_IndexError,
2975 "no such group"
2976 );
2977 return NULL;
2978 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002979
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002980 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002981 return _pair(self->mark[index*2], self->mark[index*2+1]);
2982}
2983
2984static PyObject*
2985match_regs(MatchObject* self)
2986{
2987 PyObject* regs;
2988 PyObject* item;
2989 int index;
2990
2991 regs = PyTuple_New(self->groups);
2992 if (!regs)
2993 return NULL;
2994
2995 for (index = 0; index < self->groups; index++) {
2996 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2997 if (!item) {
2998 Py_DECREF(regs);
2999 return NULL;
3000 }
3001 PyTuple_SET_ITEM(regs, index, item);
3002 }
3003
3004 Py_INCREF(regs);
3005 self->regs = regs;
3006
3007 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003008}
3009
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003010static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003011match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003012{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003013#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003014 MatchObject* copy;
3015 int slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003016
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003017 slots = 2 * (self->pattern->groups+1);
3018
3019 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3020 if (!copy)
3021 return NULL;
3022
3023 /* this value a constant, but any compiler should be able to
3024 figure that out all by itself */
3025 offset = offsetof(MatchObject, string);
3026
3027 Py_XINCREF(self->pattern);
3028 Py_XINCREF(self->string);
3029 Py_XINCREF(self->regs);
3030
3031 memcpy((char*) copy + offset, (char*) self + offset,
3032 sizeof(MatchObject) + slots * sizeof(int) - offset);
3033
3034 return (PyObject*) copy;
3035#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003036 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003037 return NULL;
3038#endif
3039}
3040
3041static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00003042match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003043{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003044#ifdef USE_BUILTIN_COPY
3045 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003046
Georg Brandlfbef5882006-05-28 22:14:04 +00003047 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003048 if (!copy)
3049 return NULL;
3050
3051 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3052 !deepcopy(&copy->string, memo) ||
3053 !deepcopy(&copy->regs, memo)) {
3054 Py_DECREF(copy);
3055 return NULL;
3056 }
3057
3058#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003059 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3060 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003061#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003062}
3063
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003064static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003065 {"group", (PyCFunction) match_group, METH_VARARGS},
3066 {"start", (PyCFunction) match_start, METH_VARARGS},
3067 {"end", (PyCFunction) match_end, METH_VARARGS},
3068 {"span", (PyCFunction) match_span, METH_VARARGS},
3069 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3070 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
Georg Brandlfbef5882006-05-28 22:14:04 +00003071 {"expand", (PyCFunction) match_expand, METH_O},
3072 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3073 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003074 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003075};
3076
Tim Peters3d563502006-01-21 02:47:53 +00003077static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003078match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003079{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003080 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003081
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003082 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3083 if (res)
3084 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003085
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003086 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003087
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003088 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003089 if (self->lastindex >= 0)
3090 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003091 Py_INCREF(Py_None);
3092 return Py_None;
3093 }
3094
3095 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003096 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003097 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003098 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003099 );
3100 if (result)
3101 return result;
3102 PyErr_Clear();
3103 }
3104 Py_INCREF(Py_None);
3105 return Py_None;
3106 }
3107
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003108 if (!strcmp(name, "string")) {
3109 if (self->string) {
3110 Py_INCREF(self->string);
3111 return self->string;
3112 } else {
3113 Py_INCREF(Py_None);
3114 return Py_None;
3115 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003116 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003117
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003118 if (!strcmp(name, "regs")) {
3119 if (self->regs) {
3120 Py_INCREF(self->regs);
3121 return self->regs;
3122 } else
3123 return match_regs(self);
3124 }
3125
3126 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003127 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003128 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003129 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003130
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003131 if (!strcmp(name, "pos"))
3132 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003133
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003134 if (!strcmp(name, "endpos"))
3135 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003136
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003137 PyErr_SetString(PyExc_AttributeError, name);
3138 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003139}
3140
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3143
3144statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003145 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003146 0, "_" SRE_MODULE ".SRE_Match",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003147 sizeof(MatchObject), sizeof(int),
3148 (destructor)match_dealloc, /*tp_dealloc*/
3149 0, /*tp_print*/
3150 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003151};
3152
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003153static PyObject*
3154pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3155{
3156 /* create match object (from state object) */
3157
3158 MatchObject* match;
3159 int i, j;
3160 char* base;
3161 int n;
3162
3163 if (status > 0) {
3164
3165 /* create match object (with room for extra group marks) */
3166 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3167 2*(pattern->groups+1));
3168 if (!match)
3169 return NULL;
3170
3171 Py_INCREF(pattern);
3172 match->pattern = pattern;
3173
3174 Py_INCREF(state->string);
3175 match->string = state->string;
3176
3177 match->regs = NULL;
3178 match->groups = pattern->groups+1;
3179
3180 /* fill in group slices */
3181
3182 base = (char*) state->beginning;
3183 n = state->charsize;
3184
3185 match->mark[0] = ((char*) state->start - base) / n;
3186 match->mark[1] = ((char*) state->ptr - base) / n;
3187
3188 for (i = j = 0; i < pattern->groups; i++, j+=2)
3189 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3190 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3191 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3192 } else
3193 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3194
3195 match->pos = state->pos;
3196 match->endpos = state->endpos;
3197
3198 match->lastindex = state->lastindex;
3199
3200 return (PyObject*) match;
3201
3202 } else if (status == 0) {
3203
3204 /* no match */
3205 Py_INCREF(Py_None);
3206 return Py_None;
3207
3208 }
3209
3210 /* internal error */
3211 pattern_error(status);
3212 return NULL;
3213}
3214
3215
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003216/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003217/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003218
3219static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003220scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003221{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003222 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003223 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003224 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003225}
3226
3227static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003228scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003229{
3230 SRE_STATE* state = &self->state;
3231 PyObject* match;
3232 int status;
3233
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003234 state_reset(state);
3235
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003236 state->ptr = state->start;
3237
3238 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003239 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003240 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003241#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003242 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003243#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003244 }
3245
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003246 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003247 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003248
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003249 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003250 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003251 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003252 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003253
3254 return match;
3255}
3256
3257
3258static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003259scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003260{
3261 SRE_STATE* state = &self->state;
3262 PyObject* match;
3263 int status;
3264
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003265 state_reset(state);
3266
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003267 state->ptr = state->start;
3268
3269 if (state->charsize == 1) {
3270 status = sre_search(state, PatternObject_GetCode(self->pattern));
3271 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003272#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003273 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003274#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003275 }
3276
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003277 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003278 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003279
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003280 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003281 state->start = (void*) ((char*) state->ptr + state->charsize);
3282 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003283 state->start = state->ptr;
3284
3285 return match;
3286}
3287
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003288static PyMethodDef scanner_methods[] = {
Georg Brandlfbef5882006-05-28 22:14:04 +00003289 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3290 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003291 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003292};
3293
Tim Peters3d563502006-01-21 02:47:53 +00003294static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003295scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003296{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003297 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003298
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003299 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3300 if (res)
3301 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003302
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003303 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003304
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003305 /* attributes */
3306 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003307 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003308 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003309 }
3310
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003311 PyErr_SetString(PyExc_AttributeError, name);
3312 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003313}
3314
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003315statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003316 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003317 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003318 sizeof(ScannerObject), 0,
3319 (destructor)scanner_dealloc, /*tp_dealloc*/
3320 0, /*tp_print*/
3321 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003322};
3323
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003324static PyObject*
3325pattern_scanner(PatternObject* pattern, PyObject* args)
3326{
3327 /* create search state object */
3328
3329 ScannerObject* self;
3330
3331 PyObject* string;
3332 int start = 0;
3333 int end = INT_MAX;
3334 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
3335 return NULL;
3336
3337 /* create scanner object */
3338 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3339 if (!self)
3340 return NULL;
3341
3342 string = state_init(&self->state, pattern, string, start, end);
3343 if (!string) {
3344 PyObject_DEL(self);
3345 return NULL;
3346 }
3347
3348 Py_INCREF(pattern);
3349 self->pattern = (PyObject*) pattern;
3350
3351 return (PyObject*) self;
3352}
3353
Guido van Rossumb700df92000-03-31 14:59:30 +00003354static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003355 {"compile", _compile, METH_VARARGS},
Georg Brandlfbef5882006-05-28 22:14:04 +00003356 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003357 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003358 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003359};
3360
Tim Peters3d563502006-01-21 02:47:53 +00003361#if PY_VERSION_HEX < 0x02030000
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003362DL_EXPORT(void) init_sre(void)
3363#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003364PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003365#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003366{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003367 PyObject* m;
3368 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003369 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003370
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003371 /* Patch object types */
3372 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003373 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00003374
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003375 m = Py_InitModule("_" SRE_MODULE, _functions);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003376 if (m == NULL)
3377 return;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003378 d = PyModule_GetDict(m);
3379
Fredrik Lundh21009b92001-09-18 18:47:09 +00003380 x = PyInt_FromLong(SRE_MAGIC);
3381 if (x) {
3382 PyDict_SetItemString(d, "MAGIC", x);
3383 Py_DECREF(x);
3384 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003385
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003386 x = PyInt_FromLong(sizeof(SRE_CODE));
3387 if (x) {
3388 PyDict_SetItemString(d, "CODESIZE", x);
3389 Py_DECREF(x);
3390 }
3391
Fredrik Lundh21009b92001-09-18 18:47:09 +00003392 x = PyString_FromString(copyright);
3393 if (x) {
3394 PyDict_SetItemString(d, "copyright", x);
3395 Py_DECREF(x);
3396 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003397}
3398
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003399#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003400
3401/* vim:ts=4:sw=4:et
3402*/