blob: bd8f8908ba76f3e6f4b6915806acd2981d15d3da [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl  created (based on existing template matcher code)
 * 2000-03-06 fl  first alpha, sort of
 * 2000-08-01 fl  fixes for 1.6b1
 * 2000-08-07 fl  use PyOS_CheckStack() if available
 * 2000-09-20 fl  added expand method
 * 2001-03-20 fl  lots of fixes for 2.1b2
 * 2001-04-15 fl  export copyright as Python attribute, not global
 * 2001-04-28 fl  added __copy__ methods (work in progress)
 * 2001-05-14 fl  fixes for 1.5.2 compatibility
 * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl  added sub/subn primitive
 * 2001-10-24 fl  added finditer primitive (for 2.2 only)
 * 2001-12-07 fl  fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl  fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 * 2003-10-17 gn  implemented non recursive scheme
 *
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license.  For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
42#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000043#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000044
45#include "sre.h"
46
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000047#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000048
Fredrik Lundh436c3d52000-06-29 08:58:44 +000049/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000050#if !defined(SRE_MODULE)
51#define SRE_MODULE "sre"
52#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000053
Neal Norwitz94a9c092006-03-16 06:30:02 +000054#define SRE_PY_MODULE "re"
55
Guido van Rossumb700df92000-03-31 14:59:30 +000056/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000057#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000058
Fredrik Lundh971e78b2001-10-20 17:48:46 +000059#if PY_VERSION_HEX >= 0x01060000
60#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000062#define HAVE_UNICODE
63#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000064#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000065
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000067/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000068
69/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000070#define USE_FAST_SEARCH
71
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000072/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000073#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000074
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000075/* enables copy/deepcopy handling (work in progress) */
76#undef USE_BUILTIN_COPY
77
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000078#if PY_VERSION_HEX < 0x01060000
79#define PyObject_DEL(op) PyMem_DEL((op))
80#endif
81
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000082/* -------------------------------------------------------------------- */
83
Fredrik Lundh80946112000-06-29 18:03:25 +000084#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000085#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000086#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000087/* fastest possible local call under MSVC */
88#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000090#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000091#else
92#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000093#endif
94
95/* error codes */
96#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000097#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000098#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000099#define SRE_ERROR_MEMORY -9 /* out of memory */
100
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000101#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000102#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000103#else
104#define TRACE(v)
105#endif
106
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000107/* -------------------------------------------------------------------- */
108/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000109
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000110/* default character predicates (run sre_chars.py to regenerate tables) */
111
112#define SRE_DIGIT_MASK 1
113#define SRE_SPACE_MASK 2
114#define SRE_LINEBREAK_MASK 4
115#define SRE_ALNUM_MASK 8
116#define SRE_WORD_MASK 16
117
Fredrik Lundh21009b92001-09-18 18:47:09 +0000118/* FIXME: this assumes ASCII. create tables in init_sre() instead */
119
/* Character-class bitmask for each of the 128 ASCII code points.  An
   entry is the OR of the SRE_*_MASK bits that apply to the character
   (digit = 1, space = 2, linebreak = 4, alnum = 8, word = 16). */
static char sre_char_info[128] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x20 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /* 0x60 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};
127
/* ASCII lower-casing table: 'A'..'Z' (65..90) map to 'a'..'z'
   (97..122); every other code point maps to itself. */
static char sre_char_lower[128] = {
    /* 0x00 */   0,   1,   2,   3,   4,   5,   6,   7,
                 8,   9,  10,  11,  12,  13,  14,  15,
    /* 0x10 */  16,  17,  18,  19,  20,  21,  22,  23,
                24,  25,  26,  27,  28,  29,  30,  31,
    /* 0x20 */  32,  33,  34,  35,  36,  37,  38,  39,
                40,  41,  42,  43,  44,  45,  46,  47,
    /* 0x30 */  48,  49,  50,  51,  52,  53,  54,  55,
                56,  57,  58,  59,  60,  61,  62,  63,
    /* 0x40 */  64,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122,  91,  92,  93,  94,  95,
    /* 0x60 */  96,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122, 123, 124, 125, 126, 127
};
137
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000138#define SRE_IS_DIGIT(ch)\
139 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
140#define SRE_IS_SPACE(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
142#define SRE_IS_LINEBREAK(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
144#define SRE_IS_ALNUM(ch)\
145 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
146#define SRE_IS_WORD(ch)\
147 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000148
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000149static unsigned int sre_lower(unsigned int ch)
150{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000151 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000152}
153
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000154/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000155/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
156 * warnings when c's type supports only numbers < N+1 */
157#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
158#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000159#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000160#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000161#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
162
/* Lower-case a code point with the C library's tolower(); values of
   256 and above are outside its domain and are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int)tolower((ch));
}
167
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000168/* unicode-specific character predicates */
169
170#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000171
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000172#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
173#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
174#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000175#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000176#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000177
178static unsigned int sre_lower_unicode(unsigned int ch)
179{
180 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
181}
182
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000183#endif
184
Guido van Rossumb700df92000-03-31 14:59:30 +0000185LOCAL(int)
186sre_category(SRE_CODE category, unsigned int ch)
187{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000188 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000189
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000190 case SRE_CATEGORY_DIGIT:
191 return SRE_IS_DIGIT(ch);
192 case SRE_CATEGORY_NOT_DIGIT:
193 return !SRE_IS_DIGIT(ch);
194 case SRE_CATEGORY_SPACE:
195 return SRE_IS_SPACE(ch);
196 case SRE_CATEGORY_NOT_SPACE:
197 return !SRE_IS_SPACE(ch);
198 case SRE_CATEGORY_WORD:
199 return SRE_IS_WORD(ch);
200 case SRE_CATEGORY_NOT_WORD:
201 return !SRE_IS_WORD(ch);
202 case SRE_CATEGORY_LINEBREAK:
203 return SRE_IS_LINEBREAK(ch);
204 case SRE_CATEGORY_NOT_LINEBREAK:
205 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000206
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000207 case SRE_CATEGORY_LOC_WORD:
208 return SRE_LOC_IS_WORD(ch);
209 case SRE_CATEGORY_LOC_NOT_WORD:
210 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000211
212#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000213 case SRE_CATEGORY_UNI_DIGIT:
214 return SRE_UNI_IS_DIGIT(ch);
215 case SRE_CATEGORY_UNI_NOT_DIGIT:
216 return !SRE_UNI_IS_DIGIT(ch);
217 case SRE_CATEGORY_UNI_SPACE:
218 return SRE_UNI_IS_SPACE(ch);
219 case SRE_CATEGORY_UNI_NOT_SPACE:
220 return !SRE_UNI_IS_SPACE(ch);
221 case SRE_CATEGORY_UNI_WORD:
222 return SRE_UNI_IS_WORD(ch);
223 case SRE_CATEGORY_UNI_NOT_WORD:
224 return !SRE_UNI_IS_WORD(ch);
225 case SRE_CATEGORY_UNI_LINEBREAK:
226 return SRE_UNI_IS_LINEBREAK(ch);
227 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
228 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000229#else
230 case SRE_CATEGORY_UNI_DIGIT:
231 return SRE_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_NOT_DIGIT:
233 return !SRE_IS_DIGIT(ch);
234 case SRE_CATEGORY_UNI_SPACE:
235 return SRE_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_NOT_SPACE:
237 return !SRE_IS_SPACE(ch);
238 case SRE_CATEGORY_UNI_WORD:
239 return SRE_LOC_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_NOT_WORD:
241 return !SRE_LOC_IS_WORD(ch);
242 case SRE_CATEGORY_UNI_LINEBREAK:
243 return SRE_IS_LINEBREAK(ch);
244 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
245 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000246#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000247 }
248 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000249}
250
251/* helpers */
252
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000253static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000254data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000255{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000256 if (state->data_stack) {
257 free(state->data_stack);
258 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000259 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000260 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000261}
262
263static int
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000264data_stack_grow(SRE_STATE* state, int size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000265{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000266 int minsize, cursize;
267 minsize = state->data_stack_base+size;
268 cursize = state->data_stack_size;
269 if (cursize < minsize) {
270 void* stack;
271 cursize = minsize+minsize/4+1024;
272 TRACE(("allocate/grow stack %d\n", cursize));
273 stack = realloc(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000274 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000275 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000276 return SRE_ERROR_MEMORY;
277 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000278 state->data_stack = stack;
279 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000280 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000281 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000282}
283
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000284/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000285
286#define SRE_CHAR unsigned char
287#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000288#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000289#define SRE_CHARSET sre_charset
290#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000291#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000292#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000293#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000294#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000295
296#if defined(HAVE_UNICODE)
297
Guido van Rossumb700df92000-03-31 14:59:30 +0000298#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000299#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000300#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000301
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000302#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000303#undef SRE_SEARCH
304#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000305#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000306#undef SRE_INFO
307#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000308#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000309#undef SRE_AT
310#undef SRE_CHAR
311
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000312/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000313
314#define SRE_CHAR Py_UNICODE
315#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000316#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000317#define SRE_CHARSET sre_ucharset
318#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000319#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000320#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000321#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000322#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000323#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000324
325#endif /* SRE_RECURSIVE */
326
327/* -------------------------------------------------------------------- */
328/* String matching engine */
329
330/* the following section is compiled twice, with different character
331 settings */
332
333LOCAL(int)
334SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
335{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000336 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000338 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000341
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000343 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000345
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000346 case SRE_AT_BEGINNING_LINE:
347 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000348 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000349
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000350 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000351 return (((void*) (ptr+1) == state->end &&
352 SRE_IS_LINEBREAK((int) ptr[0])) ||
353 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000354
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000355 case SRE_AT_END_LINE:
356 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000357 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000358
Fredrik Lundh770617b2001-01-14 15:06:11 +0000359 case SRE_AT_END_STRING:
360 return ((void*) ptr == state->end);
361
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000362 case SRE_AT_BOUNDARY:
363 if (state->beginning == state->end)
364 return 0;
365 that = ((void*) ptr > state->beginning) ?
366 SRE_IS_WORD((int) ptr[-1]) : 0;
367 this = ((void*) ptr < state->end) ?
368 SRE_IS_WORD((int) ptr[0]) : 0;
369 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000370
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000371 case SRE_AT_NON_BOUNDARY:
372 if (state->beginning == state->end)
373 return 0;
374 that = ((void*) ptr > state->beginning) ?
375 SRE_IS_WORD((int) ptr[-1]) : 0;
376 this = ((void*) ptr < state->end) ?
377 SRE_IS_WORD((int) ptr[0]) : 0;
378 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000379
380 case SRE_AT_LOC_BOUNDARY:
381 if (state->beginning == state->end)
382 return 0;
383 that = ((void*) ptr > state->beginning) ?
384 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
385 this = ((void*) ptr < state->end) ?
386 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
387 return this != that;
388
389 case SRE_AT_LOC_NON_BOUNDARY:
390 if (state->beginning == state->end)
391 return 0;
392 that = ((void*) ptr > state->beginning) ?
393 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
394 this = ((void*) ptr < state->end) ?
395 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
396 return this == that;
397
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000398#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000399 case SRE_AT_UNI_BOUNDARY:
400 if (state->beginning == state->end)
401 return 0;
402 that = ((void*) ptr > state->beginning) ?
403 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
404 this = ((void*) ptr < state->end) ?
405 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
406 return this != that;
407
408 case SRE_AT_UNI_NON_BOUNDARY:
409 if (state->beginning == state->end)
410 return 0;
411 that = ((void*) ptr > state->beginning) ?
412 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
413 this = ((void*) ptr < state->end) ?
414 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
415 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000416#endif
417
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000418 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000419
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000420 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000421}
422
423LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000424SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000425{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000427
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000428 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000429
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000430 for (;;) {
431 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000432
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000433 case SRE_OP_FAILURE:
434 return !ok;
435
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000436 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000437 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000438 if (ch == set[0])
439 return ok;
440 set++;
441 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000442
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000443 case SRE_OP_CATEGORY:
444 /* <CATEGORY> <code> */
445 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000446 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000447 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000448 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000449
Fredrik Lundh3562f112000-07-02 12:00:07 +0000450 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000451 if (sizeof(SRE_CODE) == 2) {
452 /* <CHARSET> <bitmap> (16 bits per code word) */
453 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
454 return ok;
455 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000456 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000457 else {
458 /* <CHARSET> <bitmap> (32 bits per code word) */
459 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
460 return ok;
461 set += 8;
462 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000463 break;
464
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000465 case SRE_OP_RANGE:
466 /* <RANGE> <lower> <upper> */
467 if (set[0] <= ch && ch <= set[1])
468 return ok;
469 set += 2;
470 break;
471
472 case SRE_OP_NEGATE:
473 ok = !ok;
474 break;
475
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000476 case SRE_OP_BIGCHARSET:
477 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
478 {
479 int count, block;
480 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000481
482 if (sizeof(SRE_CODE) == 2) {
483 block = ((unsigned char*)set)[ch >> 8];
484 set += 128;
485 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
486 return ok;
487 set += count*16;
488 }
489 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000490 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
491 * warnings when c's type supports only numbers < N+1 */
492 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000493 block = ((unsigned char*)set)[ch >> 8];
494 else
495 block = -1;
496 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000497 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000498 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
499 return ok;
500 set += count*8;
501 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000502 break;
503 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000504
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000505 default:
506 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000507 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000508 return 0;
509 }
510 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000511}
512
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000513LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000514
515LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000516SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000517{
518 SRE_CODE chr;
519 SRE_CHAR* ptr = state->ptr;
520 SRE_CHAR* end = state->end;
521 int i;
522
523 /* adjust end */
524 if (maxcount < end - ptr && maxcount != 65535)
525 end = ptr + maxcount;
526
527 switch (pattern[0]) {
528
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000529 case SRE_OP_IN:
530 /* repeated set */
531 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
532 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
533 ptr++;
534 break;
535
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000536 case SRE_OP_ANY:
537 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000538 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000539 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
540 ptr++;
541 break;
542
543 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000544 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000545 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000546 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000547 ptr = end;
548 break;
549
550 case SRE_OP_LITERAL:
551 /* repeated literal */
552 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000553 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000554 while (ptr < end && (SRE_CODE) *ptr == chr)
555 ptr++;
556 break;
557
558 case SRE_OP_LITERAL_IGNORE:
559 /* repeated literal */
560 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000561 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000562 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
563 ptr++;
564 break;
565
566 case SRE_OP_NOT_LITERAL:
567 /* repeated non-literal */
568 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000569 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000570 while (ptr < end && (SRE_CODE) *ptr != chr)
571 ptr++;
572 break;
Tim Peters3d563502006-01-21 02:47:53 +0000573
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000574 case SRE_OP_NOT_LITERAL_IGNORE:
575 /* repeated non-literal */
576 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000577 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000578 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
579 ptr++;
580 break;
581
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000582 default:
583 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000584 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000586 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000587 if (i < 0)
588 return i;
589 if (!i)
590 break;
591 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000592 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
593 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000594 return (SRE_CHAR*) state->ptr - ptr;
595 }
596
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000597 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000598 return ptr - (SRE_CHAR*) state->ptr;
599}
600
#if 0 /* not used in this release */
/* Check whether an SRE_OP_INFO block matches at the current position.

   Returns the number of SRE_CODE objects to skip if successful, and 0
   if there is no match.  pattern[3] is the minimal length; when
   pattern[2] has SRE_INFO_PREFIX set and pattern[5] > 1, the
   pattern[5] prefix codes starting at pattern[7] are compared against
   the input. */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    SRE_CHAR* cur = state->ptr;
    SRE_CHAR* limit = state->end;
    int k;

    /* check minimal length */
    if (pattern[3] && (limit - cur) < pattern[3])
        return 0;

    /* no usable prefix: only the info block itself is skipped */
    if (!(pattern[2] & SRE_INFO_PREFIX) || pattern[5] <= 1)
        return pattern[0];

    /* check known prefix */
    /* <length> <skip> <prefix data> <overlap data> */
    for (k = 0; k < pattern[5]; k++) {
        if ((SRE_CODE) cur[k] != pattern[7 + k])
            return 0;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000628
/* LASTMARK_SAVE() and LASTMARK_RESTORE() should be used to protect
 * recursive SRE_MATCH() calls that *failed* and do *not* return
 * immediately (in other words, those that will backtrack).  Explaining:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (besides atypical cases like ASSERT_NOT), therefore there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: if the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark is pointing to.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 *
 * Both macros operate on the variables `state` and `ctx` of the
 * enclosing scope: SAVE copies lastmark/lastindex from `state` into
 * `ctx`, RESTORE copies them back.
 */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastmark = state->lastmark; \
        ctx->lastindex = state->lastindex; \
    } while (0)
#define LASTMARK_RESTORE() \
    do { \
        state->lastmark = ctx->lastmark; \
        state->lastindex = ctx->lastindex; \
    } while (0)
667
/* Exit helpers for the matcher.
 *
 * RETURN_ERROR(i) returns i from the enclosing function directly, without
 * passing through the `exit` label.  RETURN_FAILURE / RETURN_SUCCESS store
 * 0 / 1 in the local `ret` and jump to the local `exit` label.
 *
 * RETURN_ON_ERROR(i)   - bail out with i when i is negative.
 * RETURN_ON_SUCCESS(i) - as above, and succeed when i is positive.
 * RETURN_ON_FAILURE(i) - as above, and fail when i is zero.
 *
 * The argument is evaluated more than once, so it must be free of side
 * effects; it is parenthesized so that compound expressions are compared
 * as a whole.
 */
#define RETURN_ERROR(i) do { return (i); } while (0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while (0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while (0)

#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
678
/* Stringify a macro argument (used for the type name in TRACE output). */
#define SFY(x) #x

/* Reserve sizeof(type) bytes on top of the data stack and make `ptr`
 * point at them.  The offset of the reservation is left in the local
 * `alloc_pos`.  If the stack has to grow, data_stack_grow() is called; a
 * negative result is returned from the enclosing function.  After a grow,
 * the local `ctx` is looked up again from `ctx_pos` (unless ctx_pos is -1),
 * since it points into the stack storage.
 */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), alloc_pos, (int) sizeof(type))); \
    if ((state)->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    ptr = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)

/* Make `ptr` point at the object of the given type stored at byte offset
 * `pos` of the data stack.
 */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), (pos))); \
    ptr = (type*)((state)->data_stack+(pos)); \
} while (0)

/* Copy `size` bytes from `data` onto the top of the data stack, growing
 * the stack first when needed (same grow/re-lookup/early-return behaviour
 * as DATA_STACK_ALLOC).  `data` is evaluated again for the copy after a
 * possible grow, so an expression going through `ctx` sees the refreshed
 * pointer.
 */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           (void*)(data), (state)->data_stack_base, (int)(size))); \
    if ((state)->data_stack_size < (state)->data_stack_base+(size)) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)

/* Copy the topmost `size` bytes of the data stack into `data`.  The bytes
 * are removed from the stack only when `discard` is non-zero.
 */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           (void*)(data), (int)((state)->data_stack_base-(size)), \
           (int)(size))); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), \
           (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* Drop the topmost `size` bytes of the data stack without copying them. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           (int)((state)->data_stack_base-(size)), (int)(size))); \
    (state)->data_stack_base -= (size); \
} while (0)
731
/* Shorthands for the DATA_STACK_* primitives that always operate on the
 * local variable `state`.  For PUSH/POP/POP_DISCARD, x is a pointer and
 * the size transferred is sizeof(*x); POP always removes the data from
 * the stack.
 */
#define DATA_PUSH(x) \
    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
    DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
    DATA_STACK_LOOKUP_AT(state,t,p,pos)
742
/* Save / restore the first lastmark+1 entries of state->mark on the data
 * stack.  All four macros do nothing unless lastmark is greater than 0.
 *
 * MARK_PUSH         - push the marks (uses the local `i` as scratch).
 * MARK_POP          - copy the marks back and remove them from the stack.
 * MARK_POP_KEEP     - copy the marks back but leave them on the stack.
 * MARK_POP_DISCARD  - remove the marks from the stack without copying.
 */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
760
/* Resume codes stored in SRE_MATCH_CONTEXT.jump.  DO_JUMP() records one of
 * these in the context it creates; JUMP_NONE marks the outermost context,
 * which was not entered through DO_JUMP().
 */
#define JUMP_NONE 0
#define JUMP_MAX_UNTIL_1 1
#define JUMP_MAX_UNTIL_2 2
#define JUMP_MAX_UNTIL_3 3
#define JUMP_MIN_UNTIL_1 4
#define JUMP_MIN_UNTIL_2 5
#define JUMP_MIN_UNTIL_3 6
#define JUMP_REPEAT 7
#define JUMP_REPEAT_ONE_1 8
#define JUMP_REPEAT_ONE_2 9
#define JUMP_MIN_REPEAT_ONE 10
#define JUMP_BRANCH 11
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
775
/* DO_JUMP(jumpvalue, jumplabel, nextpattern)
 *
 * Start matching `nextpattern` in a fresh context instead of making a
 * recursive C call: allocate a new SRE_MATCH_CONTEXT on the data stack,
 * record the current context position and `jumpvalue` in it, make it the
 * current context and jump to the `entrance` label.  `jumplabel` is
 * defined right after the jump so that execution can later continue
 * there (presumably from the dispatch on ctx->jump after `exit:` --
 * confirm against the end of the function).
 *
 * Notes:
 * - This expands to several statements and a label, so it must be used
 *   as a full statement inside a braced block; the caller's trailing
 *   `;` terminates the final `while (0)`.
 * - `nextpattern` is evaluated after DATA_ALLOC, i.e. after `ctx` has
 *   been refreshed if the data stack had to be reallocated.
 * - Uses the locals ctx, nextctx, ctx_pos and alloc_pos.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = (jumpvalue); \
    nextctx->pattern = (nextpattern); \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */

787typedef struct {
788 int last_ctx_pos;
789 int jump;
790 SRE_CHAR* ptr;
791 SRE_CODE* pattern;
792 int count;
793 int lastmark;
794 int lastindex;
795 union {
796 SRE_CODE chr;
797 SRE_REPEAT* rep;
798 } u;
799} SRE_MATCH_CONTEXT;
800
801/* check if string matches the given pattern. returns <0 for
802 error, 0 for failure, and 1 for success */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000803LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000804SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000805{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000806 SRE_CHAR* end = state->end;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000807 int alloc_pos, ctx_pos = -1;
808 int i, ret = 0;
809 int jump;
Guido van Rossumb700df92000-03-31 14:59:30 +0000810
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000811 SRE_MATCH_CONTEXT* ctx;
812 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000813
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000814 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000815
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000816 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
817 ctx->last_ctx_pos = -1;
818 ctx->jump = JUMP_NONE;
819 ctx->pattern = pattern;
820 ctx_pos = alloc_pos;
821
822entrance:
823
824 ctx->ptr = state->ptr;
825
826 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000827 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000828 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000830 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 (end - ctx->ptr), ctx->pattern[3]));
832 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000833 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000834 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000835 }
836
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000837 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000839 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000840
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000841 case SRE_OP_MARK:
842 /* set mark */
843 /* <MARK> <gid> */
844 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
845 ctx->ptr, ctx->pattern[0]));
846 i = ctx->pattern[0];
847 if (i & 1)
848 state->lastindex = i/2 + 1;
849 if (i > state->lastmark) {
850 /* state->lastmark is the highest valid index in the
851 state->mark array. If it is increased by more than 1,
852 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000853 that these marks have not been encountered. */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000854 int j = state->lastmark + 1;
855 while (j < i)
856 state->mark[j++] = NULL;
857 state->lastmark = i;
858 }
859 state->mark[i] = ctx->ptr;
860 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000862
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000863 case SRE_OP_LITERAL:
864 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000865 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000866 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
867 ctx->ptr, *ctx->pattern));
868 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
869 RETURN_FAILURE;
870 ctx->pattern++;
871 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000873
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000874 case SRE_OP_NOT_LITERAL:
875 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000876 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000877 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
878 ctx->ptr, *ctx->pattern));
879 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
880 RETURN_FAILURE;
881 ctx->pattern++;
882 ctx->ptr++;
883 break;
884
885 case SRE_OP_SUCCESS:
886 /* end of pattern */
887 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
888 state->ptr = ctx->ptr;
889 RETURN_SUCCESS;
890
891 case SRE_OP_AT:
892 /* match at given position */
893 /* <AT> <code> */
894 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
895 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
896 RETURN_FAILURE;
897 ctx->pattern++;
898 break;
899
900 case SRE_OP_CATEGORY:
901 /* match at given category */
902 /* <CATEGORY> <code> */
903 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
904 ctx->ptr, *ctx->pattern));
905 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
906 RETURN_FAILURE;
907 ctx->pattern++;
908 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000911 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000912 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000913 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000914 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
915 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
916 RETURN_FAILURE;
917 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000918 break;
919
920 case SRE_OP_ANY_ALL:
921 /* match anything */
922 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000923 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
924 if (ctx->ptr >= end)
925 RETURN_FAILURE;
926 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000927 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000928
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000929 case SRE_OP_IN:
930 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000931 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000932 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
933 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
934 RETURN_FAILURE;
935 ctx->pattern += ctx->pattern[0];
936 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000939 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000940 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
941 ctx->pattern, ctx->ptr, ctx->pattern[0]));
942 if (ctx->ptr >= end ||
943 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
944 RETURN_FAILURE;
945 ctx->pattern++;
946 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000949 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000950 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
951 ctx->pattern, ctx->ptr, *ctx->pattern));
952 if (ctx->ptr >= end ||
953 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
954 RETURN_FAILURE;
955 ctx->pattern++;
956 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000957 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000958
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000959 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000960 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
961 if (ctx->ptr >= end
962 || !SRE_CHARSET(ctx->pattern+1,
963 (SRE_CODE)state->lower(*ctx->ptr)))
964 RETURN_FAILURE;
965 ctx->pattern += ctx->pattern[0];
966 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000967 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000968
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000969 case SRE_OP_JUMP:
970 case SRE_OP_INFO:
971 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000972 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000973 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
974 ctx->ptr, ctx->pattern[0]));
975 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000976 break;
977
978 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000979 /* alternation */
980 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000981 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000982 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000983 ctx->u.rep = state->repeat;
984 if (ctx->u.rep)
985 MARK_PUSH(ctx->lastmark);
986 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
987 if (ctx->pattern[1] == SRE_OP_LITERAL &&
988 (ctx->ptr >= end ||
989 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000990 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000991 if (ctx->pattern[1] == SRE_OP_IN &&
992 (ctx->ptr >= end ||
993 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000994 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000995 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000996 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000997 if (ret) {
998 if (ctx->u.rep)
999 MARK_POP_DISCARD(ctx->lastmark);
1000 RETURN_ON_ERROR(ret);
1001 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001002 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001003 if (ctx->u.rep)
1004 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001005 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001006 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001007 if (ctx->u.rep)
1008 MARK_POP_DISCARD(ctx->lastmark);
1009 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001010
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001011 case SRE_OP_REPEAT_ONE:
1012 /* match repeated sequence (maximizing regexp) */
1013
1014 /* this operator only works if the repeated item is
1015 exactly one character wide, and we're not already
1016 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001017 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018
1019 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1020
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001021 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1022 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001023
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 if (ctx->ptr + ctx->pattern[1] > end)
1025 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001028
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001029 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1030 RETURN_ON_ERROR(ret);
1031 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1032 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001033 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001034
1035 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037 string. check if the rest of the pattern matches,
1038 and backtrack if not. */
1039
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001040 if (ctx->count < (int) ctx->pattern[1])
1041 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001045 state->ptr = ctx->ptr;
1046 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001047 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001048
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001049 LASTMARK_SAVE();
1050
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001051 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001052 /* tail starts with a literal. skip positions where
1053 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055 for (;;) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001056 while (ctx->count >= (int) ctx->pattern[1] &&
1057 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1058 ctx->ptr--;
1059 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 if (ctx->count < (int) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001062 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1065 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001066 if (ret) {
1067 RETURN_ON_ERROR(ret);
1068 RETURN_SUCCESS;
1069 }
Tim Peters3d563502006-01-21 02:47:53 +00001070
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001071 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001072
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073 ctx->ptr--;
1074 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001075 }
1076
1077 } else {
1078 /* general case */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001079 while (ctx->count >= (int) ctx->pattern[1]) {
1080 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1082 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001083 if (ret) {
1084 RETURN_ON_ERROR(ret);
1085 RETURN_SUCCESS;
1086 }
1087 ctx->ptr--;
1088 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001089 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001090 }
1091 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001092 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001093
Guido van Rossum41c99e72003-04-14 17:59:34 +00001094 case SRE_OP_MIN_REPEAT_ONE:
1095 /* match repeated sequence (minimizing regexp) */
1096
1097 /* this operator only works if the repeated item is
1098 exactly one character wide, and we're not already
1099 collecting backtracking points. for other cases,
1100 use the MIN_REPEAT operator */
1101
1102 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1103
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001104 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1105 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001106
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001107 if (ctx->ptr + ctx->pattern[1] > end)
1108 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001111
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001112 if (ctx->pattern[1] == 0)
1113 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001114 else {
1115 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001116 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1117 RETURN_ON_ERROR(ret);
1118 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1119 if (ret < (int) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001120 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001121 RETURN_FAILURE;
1122 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001123 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001125 }
1126
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001129 state->ptr = ctx->ptr;
1130 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131
1132 } else {
1133 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001134 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001135 while ((int)ctx->pattern[2] == 65535
1136 || ctx->count <= (int)ctx->pattern[2]) {
1137 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001138 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1139 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 if (ret) {
1141 RETURN_ON_ERROR(ret);
1142 RETURN_SUCCESS;
1143 }
1144 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001145 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001147 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001149 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001150 assert(ret == 1);
1151 ctx->ptr++;
1152 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001153 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001154 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001155 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001156 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001157
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001158 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001159 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001160 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001161 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001162 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1163 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164
1165 /* install new repeat context */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001166 ctx->u.rep = (SRE_REPEAT*) malloc(sizeof(*ctx->u.rep));
1167 ctx->u.rep->count = -1;
1168 ctx->u.rep->pattern = ctx->pattern;
1169 ctx->u.rep->prev = state->repeat;
1170 ctx->u.rep->last_ptr = NULL;
1171 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001172
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001173 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001174 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001175 state->repeat = ctx->u.rep->prev;
1176 free(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001177
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 if (ret) {
1179 RETURN_ON_ERROR(ret);
1180 RETURN_SUCCESS;
1181 }
1182 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001183
1184 case SRE_OP_MAX_UNTIL:
1185 /* maximizing repeat */
1186 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1187
1188 /* FIXME: we probably need to deal with zero-width
1189 matches in here... */
1190
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001191 ctx->u.rep = state->repeat;
1192 if (!ctx->u.rep)
1193 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001194
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001196
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001198
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001199 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1200 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001201
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001205 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1206 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001207 if (ret) {
1208 RETURN_ON_ERROR(ret);
1209 RETURN_SUCCESS;
1210 }
1211 ctx->u.rep->count = ctx->count-1;
1212 state->ptr = ctx->ptr;
1213 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001214 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001215
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001216 if ((ctx->count < ctx->u.rep->pattern[2] ||
1217 ctx->u.rep->pattern[2] == 65535) &&
1218 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001219 /* we may have enough matches, but if we can
1220 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001221 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001222 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001223 MARK_PUSH(ctx->lastmark);
1224 /* zero-width match protection */
1225 DATA_PUSH(&ctx->u.rep->last_ptr);
1226 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1228 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001229 DATA_POP(&ctx->u.rep->last_ptr);
1230 if (ret) {
1231 MARK_POP_DISCARD(ctx->lastmark);
1232 RETURN_ON_ERROR(ret);
1233 RETURN_SUCCESS;
1234 }
1235 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001236 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001237 ctx->u.rep->count = ctx->count-1;
1238 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001239 }
1240
1241 /* cannot match more repeated items here. make sure the
1242 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001243 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001244 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001245 RETURN_ON_SUCCESS(ret);
1246 state->repeat = ctx->u.rep;
1247 state->ptr = ctx->ptr;
1248 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001249
1250 case SRE_OP_MIN_UNTIL:
1251 /* minimizing repeat */
1252 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1253
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001254 ctx->u.rep = state->repeat;
1255 if (!ctx->u.rep)
1256 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001257
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001258 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001259
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001260 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001261
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001262 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1263 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001264
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001266 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001267 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001268 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1269 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001270 if (ret) {
1271 RETURN_ON_ERROR(ret);
1272 RETURN_SUCCESS;
1273 }
1274 ctx->u.rep->count = ctx->count-1;
1275 state->ptr = ctx->ptr;
1276 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001277 }
1278
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001279 LASTMARK_SAVE();
1280
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001281 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001282 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001283 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001284 if (ret) {
1285 RETURN_ON_ERROR(ret);
1286 RETURN_SUCCESS;
1287 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001288
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001289 state->repeat = ctx->u.rep;
1290 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001291
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001292 LASTMARK_RESTORE();
1293
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001294 if (ctx->count >= ctx->u.rep->pattern[2]
1295 && ctx->u.rep->pattern[2] != 65535)
1296 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001297
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001298 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001299 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1300 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001301 if (ret) {
1302 RETURN_ON_ERROR(ret);
1303 RETURN_SUCCESS;
1304 }
1305 ctx->u.rep->count = ctx->count-1;
1306 state->ptr = ctx->ptr;
1307 RETURN_FAILURE;
1308
1309 case SRE_OP_GROUPREF:
1310 /* match backreference */
1311 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1312 ctx->ptr, ctx->pattern[0]));
1313 i = ctx->pattern[0];
1314 {
1315 int groupref = i+i;
1316 if (groupref >= state->lastmark) {
1317 RETURN_FAILURE;
1318 } else {
1319 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1320 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1321 if (!p || !e || e < p)
1322 RETURN_FAILURE;
1323 while (p < e) {
1324 if (ctx->ptr >= end || *ctx->ptr != *p)
1325 RETURN_FAILURE;
1326 p++; ctx->ptr++;
1327 }
1328 }
1329 }
1330 ctx->pattern++;
1331 break;
1332
1333 case SRE_OP_GROUPREF_IGNORE:
1334 /* match backreference */
1335 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1336 ctx->ptr, ctx->pattern[0]));
1337 i = ctx->pattern[0];
1338 {
1339 int groupref = i+i;
1340 if (groupref >= state->lastmark) {
1341 RETURN_FAILURE;
1342 } else {
1343 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1344 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1345 if (!p || !e || e < p)
1346 RETURN_FAILURE;
1347 while (p < e) {
1348 if (ctx->ptr >= end ||
1349 state->lower(*ctx->ptr) != state->lower(*p))
1350 RETURN_FAILURE;
1351 p++; ctx->ptr++;
1352 }
1353 }
1354 }
1355 ctx->pattern++;
1356 break;
1357
1358 case SRE_OP_GROUPREF_EXISTS:
1359 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1360 ctx->ptr, ctx->pattern[0]));
1361 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1362 i = ctx->pattern[0];
1363 {
1364 int groupref = i+i;
1365 if (groupref >= state->lastmark) {
1366 ctx->pattern += ctx->pattern[1];
1367 break;
1368 } else {
1369 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1370 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1371 if (!p || !e || e < p) {
1372 ctx->pattern += ctx->pattern[1];
1373 break;
1374 }
1375 }
1376 }
1377 ctx->pattern += 2;
1378 break;
1379
1380 case SRE_OP_ASSERT:
1381 /* assert subpattern */
1382 /* <ASSERT> <skip> <back> <pattern> */
1383 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1384 ctx->ptr, ctx->pattern[1]));
1385 state->ptr = ctx->ptr - ctx->pattern[1];
1386 if (state->ptr < state->beginning)
1387 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001388 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001389 RETURN_ON_FAILURE(ret);
1390 ctx->pattern += ctx->pattern[0];
1391 break;
1392
1393 case SRE_OP_ASSERT_NOT:
1394 /* assert not subpattern */
1395 /* <ASSERT_NOT> <skip> <back> <pattern> */
1396 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1397 ctx->ptr, ctx->pattern[1]));
1398 state->ptr = ctx->ptr - ctx->pattern[1];
1399 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001400 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001401 if (ret) {
1402 RETURN_ON_ERROR(ret);
1403 RETURN_FAILURE;
1404 }
1405 }
1406 ctx->pattern += ctx->pattern[0];
1407 break;
1408
1409 case SRE_OP_FAILURE:
1410 /* immediate failure */
1411 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1412 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001413
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001414 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001415 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1416 ctx->pattern[-1]));
1417 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001418 }
1419 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001420
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001421exit:
1422 ctx_pos = ctx->last_ctx_pos;
1423 jump = ctx->jump;
1424 DATA_POP_DISCARD(ctx);
1425 if (ctx_pos == -1)
1426 return ret;
1427 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1428
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001429 switch (jump) {
1430 case JUMP_MAX_UNTIL_2:
1431 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1432 goto jump_max_until_2;
1433 case JUMP_MAX_UNTIL_3:
1434 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1435 goto jump_max_until_3;
1436 case JUMP_MIN_UNTIL_2:
1437 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1438 goto jump_min_until_2;
1439 case JUMP_MIN_UNTIL_3:
1440 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1441 goto jump_min_until_3;
1442 case JUMP_BRANCH:
1443 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1444 goto jump_branch;
1445 case JUMP_MAX_UNTIL_1:
1446 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1447 goto jump_max_until_1;
1448 case JUMP_MIN_UNTIL_1:
1449 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1450 goto jump_min_until_1;
1451 case JUMP_REPEAT:
1452 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1453 goto jump_repeat;
1454 case JUMP_REPEAT_ONE_1:
1455 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1456 goto jump_repeat_one_1;
1457 case JUMP_REPEAT_ONE_2:
1458 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1459 goto jump_repeat_one_2;
1460 case JUMP_MIN_REPEAT_ONE:
1461 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1462 goto jump_min_repeat_one;
1463 case JUMP_ASSERT:
1464 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1465 goto jump_assert;
1466 case JUMP_ASSERT_NOT:
1467 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1468 goto jump_assert_not;
1469 case JUMP_NONE:
1470 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1471 break;
1472 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001473
1474 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001475}
1476
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001477LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001478SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1479{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001480 SRE_CHAR* ptr = state->start;
1481 SRE_CHAR* end = state->end;
1482 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001483 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001484 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001485 SRE_CODE* prefix = NULL;
1486 SRE_CODE* charset = NULL;
1487 SRE_CODE* overlap = NULL;
1488 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001489
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001490 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001491 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001492 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001493
1494 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001495
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001496 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001497 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001498 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001499 end -= pattern[3]-1;
1500 if (end <= ptr)
1501 end = ptr+1;
1502 }
1503
Fredrik Lundh3562f112000-07-02 12:00:07 +00001504 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001505 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001506 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001507 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001508 prefix_skip = pattern[6];
1509 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001510 overlap = prefix + prefix_len - 1;
1511 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001512 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001513 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001514 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001515
1516 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001517 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001518
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001519 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1520 TRACE(("charset = %p\n", charset));
1521
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001522#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001523 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001524 /* pattern starts with a known prefix. use the overlap
1525 table to skip forward as fast as we possibly can */
1526 int i = 0;
1527 end = state->end;
1528 while (ptr < end) {
1529 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001530 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001531 if (!i)
1532 break;
1533 else
1534 i = overlap[i];
1535 } else {
1536 if (++i == prefix_len) {
1537 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001538 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1539 state->start = ptr + 1 - prefix_len;
1540 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001541 if (flags & SRE_INFO_LITERAL)
1542 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001543 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001544 if (status != 0)
1545 return status;
1546 /* close but no cigar -- try again */
1547 i = overlap[i];
1548 }
1549 break;
1550 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001551 }
1552 ptr++;
1553 }
1554 return 0;
1555 }
1556#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001557
Fredrik Lundh3562f112000-07-02 12:00:07 +00001558 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001559 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001560 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001561 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001562 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 for (;;) {
1564 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1565 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001566 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001567 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001568 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001569 state->start = ptr;
1570 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001571 if (flags & SRE_INFO_LITERAL)
1572 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001573 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001574 if (status != 0)
1575 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001576 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001577 } else if (charset) {
1578 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001579 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001581 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001582 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001583 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001585 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 state->start = ptr;
1587 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001588 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001589 if (status != 0)
1590 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001591 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001592 }
1593 } else
1594 /* general case */
1595 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001596 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001597 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001598 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 if (status != 0)
1600 break;
1601 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001602
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001603 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001604}
Tim Peters3d563502006-01-21 02:47:53 +00001605
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001606LOCAL(int)
1607SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1608{
1609 /* check if given string is a literal template (i.e. no escapes) */
1610 while (len-- > 0)
1611 if (*ptr++ == '\\')
1612 return 0;
1613 return 1;
1614}
Guido van Rossumb700df92000-03-31 14:59:30 +00001615
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001616#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001617
1618/* -------------------------------------------------------------------- */
1619/* factories and destructors */
1620
1621/* see sre.h for object declarations */
1622
Jeremy Hylton938ace62002-07-17 16:30:39 +00001623static PyTypeObject Pattern_Type;
1624static PyTypeObject Match_Type;
1625static PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001626
1627static PyObject *
1628_compile(PyObject* self_, PyObject* args)
1629{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001630 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001631
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001632 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001633 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001634
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001635 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001636 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001637 PyObject* code;
1638 int groups = 0;
1639 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001640 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001641 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1642 &PyList_Type, &code, &groups,
1643 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001644 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001645
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001646 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001647
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001648 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001649 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001650 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001651
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001652 self->codesize = n;
1653
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001654 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001655 PyObject *o = PyList_GET_ITEM(code, i);
Tim Peters3d563502006-01-21 02:47:53 +00001656 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
1657 : PyLong_AsUnsignedLong(o);
1658 self->code[i] = (SRE_CODE) value;
1659 if ((unsigned long) self->code[i] != value) {
1660 PyErr_SetString(PyExc_OverflowError,
1661 "regular expression code size limit exceeded");
1662 break;
1663 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001664 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001665
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001666 if (PyErr_Occurred()) {
1667 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001668 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001669 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001670
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001671 Py_INCREF(pattern);
1672 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001673
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001674 self->flags = flags;
1675
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001677
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001678 Py_XINCREF(groupindex);
1679 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001680
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001681 Py_XINCREF(indexgroup);
1682 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001683
Raymond Hettinger027bb632004-05-31 03:09:25 +00001684 self->weakreflist = NULL;
1685
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001686 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001687}
1688
1689static PyObject *
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001690sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001691{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001692 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001693}
1694
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001695static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001696sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001697{
1698 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001699 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001700 return NULL;
1701 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001702 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001703 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001704#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001705 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001706#else
1707 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001708#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001709 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001710}
1711
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001712LOCAL(void)
1713state_reset(SRE_STATE* state)
1714{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001715 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001716 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001717
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001718 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001719 state->lastindex = -1;
1720
1721 state->repeat = NULL;
1722
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001723 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001724}
1725
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001726static void*
1727getstring(PyObject* string, int* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001728{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001729 /* given a python object, return a data pointer, a length (in
1730 characters), and a character size. return NULL if the object
1731 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001732
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001733 PyBufferProcs *buffer;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001734 int size, bytes, charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001735 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001736
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001737#if defined(HAVE_UNICODE)
1738 if (PyUnicode_Check(string)) {
1739 /* unicode strings doesn't always support the buffer interface */
1740 ptr = (void*) PyUnicode_AS_DATA(string);
1741 bytes = PyUnicode_GET_DATA_SIZE(string);
1742 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001743 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001744
1745 } else {
1746#endif
1747
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001748 /* get pointer to string buffer */
1749 buffer = string->ob_type->tp_as_buffer;
1750 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1751 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001752 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001753 return NULL;
1754 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001755
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001756 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001757 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1758 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001759 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1760 return NULL;
1761 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001762
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001763 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001764#if PY_VERSION_HEX >= 0x01060000
1765 size = PyObject_Size(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001766#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001767 size = PyObject_Length(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001768#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001769
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001770 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001771 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001772#if defined(HAVE_UNICODE)
1773 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001774 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001775#endif
1776 else {
1777 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1778 return NULL;
1779 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001780
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001781#if defined(HAVE_UNICODE)
1782 }
1783#endif
1784
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001785 *p_length = size;
1786 *p_charsize = charsize;
1787
1788 return ptr;
1789}
1790
1791LOCAL(PyObject*)
1792state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1793 int start, int end)
1794{
1795 /* prepare state object */
1796
1797 int length;
1798 int charsize;
1799 void* ptr;
1800
1801 memset(state, 0, sizeof(SRE_STATE));
1802
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001803 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001804 state->lastindex = -1;
1805
1806 ptr = getstring(string, &length, &charsize);
1807 if (!ptr)
1808 return NULL;
1809
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001810 /* adjust boundaries */
1811 if (start < 0)
1812 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001813 else if (start > length)
1814 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001815
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001816 if (end < 0)
1817 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001818 else if (end > length)
1819 end = length;
1820
1821 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001822
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001823 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001824
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001825 state->start = (void*) ((char*) ptr + start * state->charsize);
1826 state->end = (void*) ((char*) ptr + end * state->charsize);
1827
1828 Py_INCREF(string);
1829 state->string = string;
1830 state->pos = start;
1831 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001832
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001833 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001834 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001835 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001836#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001837 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001838#else
1839 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001840#endif
1841 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001842 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001843
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001844 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001845}
1846
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001847LOCAL(void)
1848state_fini(SRE_STATE* state)
1849{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001850 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001851 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001852}
1853
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001854/* calculate offset from start of string */
1855#define STATE_OFFSET(state, member)\
1856 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1857
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001858LOCAL(PyObject*)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001859state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001860{
Fredrik Lundh58100642000-08-09 09:14:35 +00001861 int i, j;
1862
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001863 index = (index - 1) * 2;
1864
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001865 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001866 if (empty)
1867 /* want empty string */
1868 i = j = 0;
1869 else {
1870 Py_INCREF(Py_None);
1871 return Py_None;
1872 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001873 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001874 i = STATE_OFFSET(state, state->mark[index]);
1875 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001876 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001877
Fredrik Lundh58100642000-08-09 09:14:35 +00001878 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001879}
1880
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001881static void
1882pattern_error(int status)
1883{
1884 switch (status) {
1885 case SRE_ERROR_RECURSION_LIMIT:
1886 PyErr_SetString(
1887 PyExc_RuntimeError,
1888 "maximum recursion limit exceeded"
1889 );
1890 break;
1891 case SRE_ERROR_MEMORY:
1892 PyErr_NoMemory();
1893 break;
1894 default:
1895 /* other error codes indicate compiler/engine bugs */
1896 PyErr_SetString(
1897 PyExc_RuntimeError,
1898 "internal error in regular expression engine"
1899 );
1900 }
1901}
1902
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001903static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001904pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001905{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001906 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001907
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001908 MatchObject* match;
1909 int i, j;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001910 char* base;
1911 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001912
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001913 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001914
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001915 /* create match object (with room for extra group marks) */
1916 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001917 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001918 if (!match)
1919 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001920
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001921 Py_INCREF(pattern);
1922 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001923
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001924 Py_INCREF(state->string);
1925 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001927 match->regs = NULL;
1928 match->groups = pattern->groups+1;
1929
1930 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001931
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001932 base = (char*) state->beginning;
1933 n = state->charsize;
1934
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001935 match->mark[0] = ((char*) state->start - base) / n;
1936 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001937
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001938 for (i = j = 0; i < pattern->groups; i++, j+=2)
1939 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1940 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1941 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1942 } else
1943 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1944
1945 match->pos = state->pos;
1946 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001947
Fredrik Lundh6f013982000-07-03 18:44:21 +00001948 match->lastindex = state->lastindex;
1949
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001950 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001951
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001952 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001953
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001954 /* no match */
1955 Py_INCREF(Py_None);
1956 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001958 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001959
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001960 /* internal error */
1961 pattern_error(status);
1962 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001963}
1964
1965static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001966pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001967{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001968 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001969
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001970 ScannerObject* self;
1971
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001972 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001973 int start = 0;
1974 int end = INT_MAX;
1975 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1976 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001977
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001978 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001979 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001980 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001981 return NULL;
1982
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001983 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001984 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001985 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001986 return NULL;
1987 }
1988
1989 Py_INCREF(pattern);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001990 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001991
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001992 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001993}
1994
Guido van Rossumb700df92000-03-31 14:59:30 +00001995static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001996pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001997{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001998 if (self->weakreflist != NULL)
1999 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002000 Py_XDECREF(self->pattern);
2001 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00002002 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002003 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002004}
2005
2006static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002007pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002008{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002009 SRE_STATE state;
2010 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002011
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002012 PyObject* string;
2013 int start = 0;
2014 int end = INT_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002015 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002016 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
2017 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002018 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002019
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002020 string = state_init(&state, self, string, start, end);
2021 if (!string)
2022 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002023
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002024 state.ptr = state.start;
2025
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002026 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
2027
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002028 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002029 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002030 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002031#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002032 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002033#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002034 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002035
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002036 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2037
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002038 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002039
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002040 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002041}
2042
2043static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002044pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002045{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002046 SRE_STATE state;
2047 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002048
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002049 PyObject* string;
2050 int start = 0;
2051 int end = INT_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002052 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002053 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
2054 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002056
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002057 string = state_init(&state, self, string, start, end);
2058 if (!string)
2059 return NULL;
2060
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002061 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
2062
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 if (state.charsize == 1) {
2064 status = sre_search(&state, PatternObject_GetCode(self));
2065 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002066#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002067 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002068#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002069 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002070
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002071 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2072
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002074
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002075 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002076}
2077
2078static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002079call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002080{
2081 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002082 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002083 PyObject* func;
2084 PyObject* result;
2085
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002086 if (!args)
2087 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002088 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002089 if (!name)
2090 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002091 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002092 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002093 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002094 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002095 func = PyObject_GetAttrString(mod, function);
2096 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002097 if (!func)
2098 return NULL;
2099 result = PyObject_CallObject(func, args);
2100 Py_DECREF(func);
2101 Py_DECREF(args);
2102 return result;
2103}
2104
#ifdef USE_BUILTIN_COPY
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* Replace *object with the result of copy.deepcopy(*object, memo).
       Returns 1 on success and 0 on failure; on failure *object is left
       untouched. */

    PyObject* copy;

    /* the pattern slots this is applied to are handled with
       Py_XINCREF/Py_XDECREF elsewhere, i.e. they may be NULL; an empty
       slot has nothing to copy */
    if (!*object)
        return 1;

    copy = call(
        "copy", "deepcopy",
        PyTuple_Pack(2, *object, memo)
        );
    if (!copy)
        return 0;

    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
2124
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002125static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002126join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002127{
2128 /* join list elements */
2129
2130 PyObject* joiner;
2131#if PY_VERSION_HEX >= 0x01060000
2132 PyObject* function;
2133 PyObject* args;
2134#endif
2135 PyObject* result;
2136
2137 switch (PyList_GET_SIZE(list)) {
2138 case 0:
2139 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00002140 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002141 case 1:
2142 result = PyList_GET_ITEM(list, 0);
2143 Py_INCREF(result);
2144 Py_DECREF(list);
2145 return result;
2146 }
2147
2148 /* two or more elements: slice out a suitable separator from the
2149 first member, and use that to join the entire list */
2150
2151 joiner = PySequence_GetSlice(pattern, 0, 0);
2152 if (!joiner)
2153 return NULL;
2154
2155#if PY_VERSION_HEX >= 0x01060000
2156 function = PyObject_GetAttrString(joiner, "join");
2157 if (!function) {
2158 Py_DECREF(joiner);
2159 return NULL;
2160 }
2161 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002162 if (!args) {
2163 Py_DECREF(function);
2164 Py_DECREF(joiner);
2165 return NULL;
2166 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002167 PyTuple_SET_ITEM(args, 0, list);
2168 result = PyObject_CallObject(function, args);
2169 Py_DECREF(args); /* also removes list */
2170 Py_DECREF(function);
2171#else
2172 result = call(
2173 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002174 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002175 );
2176#endif
2177 Py_DECREF(joiner);
2178
2179 return result;
2180}
2181
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002182static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002183pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002184{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002185 SRE_STATE state;
2186 PyObject* list;
2187 int status;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002188 int i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002189
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002190 PyObject* string;
2191 int start = 0;
2192 int end = INT_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002193 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002194 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2195 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002196 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002197
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002198 string = state_init(&state, self, string, start, end);
2199 if (!string)
2200 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002201
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002202 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002203 if (!list) {
2204 state_fini(&state);
2205 return NULL;
2206 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002207
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002208 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002209
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002210 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002211
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002212 state_reset(&state);
2213
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002214 state.ptr = state.start;
2215
2216 if (state.charsize == 1) {
2217 status = sre_search(&state, PatternObject_GetCode(self));
2218 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002219#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002220 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002221#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002222 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002223
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002224 if (status <= 0) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002225 if (status == 0)
2226 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002227 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002228 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002229 }
Tim Peters3d563502006-01-21 02:47:53 +00002230
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002231 /* don't bother to build a match object */
2232 switch (self->groups) {
2233 case 0:
2234 b = STATE_OFFSET(&state, state.start);
2235 e = STATE_OFFSET(&state, state.ptr);
2236 item = PySequence_GetSlice(string, b, e);
2237 if (!item)
2238 goto error;
2239 break;
2240 case 1:
2241 item = state_getslice(&state, 1, string, 1);
2242 if (!item)
2243 goto error;
2244 break;
2245 default:
2246 item = PyTuple_New(self->groups);
2247 if (!item)
2248 goto error;
2249 for (i = 0; i < self->groups; i++) {
2250 PyObject* o = state_getslice(&state, i+1, string, 1);
2251 if (!o) {
2252 Py_DECREF(item);
2253 goto error;
2254 }
2255 PyTuple_SET_ITEM(item, i, o);
2256 }
2257 break;
2258 }
2259
2260 status = PyList_Append(list, item);
2261 Py_DECREF(item);
2262 if (status < 0)
2263 goto error;
2264
2265 if (state.ptr == state.start)
2266 state.start = (void*) ((char*) state.ptr + state.charsize);
2267 else
2268 state.start = state.ptr;
2269
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002270 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002271
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002272 state_fini(&state);
2273 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002274
2275error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002276 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002277 state_fini(&state);
2278 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002279
Guido van Rossumb700df92000-03-31 14:59:30 +00002280}
2281
#if PY_VERSION_HEX >= 0x02020000
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    /* Create a scanner from args, then wrap its "search" attribute in a
       call-iterator that uses Py_None as the sentinel value.  Returns
       NULL if any step fails. */

    PyObject* scanner;
    PyObject* search;
    PyObject* iterator = NULL;

    scanner = pattern_scanner(pattern, args);
    if (scanner) {
        search = PyObject_GetAttrString(scanner, "search");
        /* the looked-up attribute is all we keep from the scanner */
        Py_DECREF(scanner);
        if (search) {
            iterator = PyCallIter_New(search, Py_None);
            Py_DECREF(search);
        }
    }

    return iterator;
}
#endif
2305
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002306static PyObject*
2307pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2308{
2309 SRE_STATE state;
2310 PyObject* list;
2311 PyObject* item;
2312 int status;
2313 int n;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002314 int i;
2315 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002316
2317 PyObject* string;
2318 int maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002319 static char* kwlist[] = { "source", "maxsplit", NULL };
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002320 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2321 &string, &maxsplit))
2322 return NULL;
2323
2324 string = state_init(&state, self, string, 0, INT_MAX);
2325 if (!string)
2326 return NULL;
2327
2328 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002329 if (!list) {
2330 state_fini(&state);
2331 return NULL;
2332 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002333
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002334 n = 0;
2335 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002336
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002337 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002338
2339 state_reset(&state);
2340
2341 state.ptr = state.start;
2342
2343 if (state.charsize == 1) {
2344 status = sre_search(&state, PatternObject_GetCode(self));
2345 } else {
2346#if defined(HAVE_UNICODE)
2347 status = sre_usearch(&state, PatternObject_GetCode(self));
2348#endif
2349 }
2350
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002351 if (status <= 0) {
2352 if (status == 0)
2353 break;
2354 pattern_error(status);
2355 goto error;
2356 }
Tim Peters3d563502006-01-21 02:47:53 +00002357
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002358 if (state.start == state.ptr) {
2359 if (last == state.end)
2360 break;
2361 /* skip one character */
2362 state.start = (void*) ((char*) state.ptr + state.charsize);
2363 continue;
2364 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002365
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002366 /* get segment before this match */
2367 item = PySequence_GetSlice(
2368 string, STATE_OFFSET(&state, last),
2369 STATE_OFFSET(&state, state.start)
2370 );
2371 if (!item)
2372 goto error;
2373 status = PyList_Append(list, item);
2374 Py_DECREF(item);
2375 if (status < 0)
2376 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002377
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002378 /* add groups (if any) */
2379 for (i = 0; i < self->groups; i++) {
2380 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002381 if (!item)
2382 goto error;
2383 status = PyList_Append(list, item);
2384 Py_DECREF(item);
2385 if (status < 0)
2386 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002387 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002388
2389 n = n + 1;
2390
2391 last = state.start = state.ptr;
2392
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002393 }
2394
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002395 /* get segment following last match (even if empty) */
2396 item = PySequence_GetSlice(
2397 string, STATE_OFFSET(&state, last), state.endpos
2398 );
2399 if (!item)
2400 goto error;
2401 status = PyList_Append(list, item);
2402 Py_DECREF(item);
2403 if (status < 0)
2404 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002405
2406 state_fini(&state);
2407 return list;
2408
2409error:
2410 Py_DECREF(list);
2411 state_fini(&state);
2412 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002413
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002414}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002415
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002416static PyObject*
2417pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2418 int count, int subn)
2419{
2420 SRE_STATE state;
2421 PyObject* list;
2422 PyObject* item;
2423 PyObject* filter;
2424 PyObject* args;
2425 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002426 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002427 int status;
2428 int n;
2429 int i, b, e;
2430 int filter_is_callable;
2431
Fredrik Lundhdac58492001-10-21 21:48:30 +00002432 if (PyCallable_Check(template)) {
2433 /* sub/subn takes either a function or a template */
2434 filter = template;
2435 Py_INCREF(filter);
2436 filter_is_callable = 1;
2437 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002438 /* if not callable, check if it's a literal string */
2439 int literal;
2440 ptr = getstring(template, &n, &b);
2441 if (ptr) {
2442 if (b == 1) {
2443 literal = sre_literal_template(ptr, n);
2444 } else {
2445#if defined(HAVE_UNICODE)
2446 literal = sre_uliteral_template(ptr, n);
2447#endif
2448 }
2449 } else {
2450 PyErr_Clear();
2451 literal = 0;
2452 }
2453 if (literal) {
2454 filter = template;
2455 Py_INCREF(filter);
2456 filter_is_callable = 0;
2457 } else {
2458 /* not a literal; hand it over to the template compiler */
2459 filter = call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00002460 SRE_PY_MODULE, "_subx",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002461 PyTuple_Pack(2, self, template)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002462 );
2463 if (!filter)
2464 return NULL;
2465 filter_is_callable = PyCallable_Check(filter);
2466 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002467 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002468
2469 string = state_init(&state, self, string, 0, INT_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002470 if (!string) {
2471 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002472 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002473 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002474
2475 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002476 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002477 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002478 state_fini(&state);
2479 return NULL;
2480 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002481
2482 n = i = 0;
2483
2484 while (!count || n < count) {
2485
2486 state_reset(&state);
2487
2488 state.ptr = state.start;
2489
2490 if (state.charsize == 1) {
2491 status = sre_search(&state, PatternObject_GetCode(self));
2492 } else {
2493#if defined(HAVE_UNICODE)
2494 status = sre_usearch(&state, PatternObject_GetCode(self));
2495#endif
2496 }
2497
2498 if (status <= 0) {
2499 if (status == 0)
2500 break;
2501 pattern_error(status);
2502 goto error;
2503 }
Tim Peters3d563502006-01-21 02:47:53 +00002504
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002505 b = STATE_OFFSET(&state, state.start);
2506 e = STATE_OFFSET(&state, state.ptr);
2507
2508 if (i < b) {
2509 /* get segment before this match */
2510 item = PySequence_GetSlice(string, i, b);
2511 if (!item)
2512 goto error;
2513 status = PyList_Append(list, item);
2514 Py_DECREF(item);
2515 if (status < 0)
2516 goto error;
2517
2518 } else if (i == b && i == e && n > 0)
2519 /* ignore empty match on latest position */
2520 goto next;
2521
2522 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002523 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002524 match = pattern_new_match(self, &state, 1);
2525 if (!match)
2526 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002527 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002528 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002529 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002530 goto error;
2531 }
2532 item = PyObject_CallObject(filter, args);
2533 Py_DECREF(args);
2534 Py_DECREF(match);
2535 if (!item)
2536 goto error;
2537 } else {
2538 /* filter is literal string */
2539 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002540 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002541 }
2542
2543 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002544 if (item != Py_None) {
2545 status = PyList_Append(list, item);
2546 Py_DECREF(item);
2547 if (status < 0)
2548 goto error;
2549 }
Tim Peters3d563502006-01-21 02:47:53 +00002550
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002551 i = e;
2552 n = n + 1;
2553
2554next:
2555 /* move on */
2556 if (state.ptr == state.start)
2557 state.start = (void*) ((char*) state.ptr + state.charsize);
2558 else
2559 state.start = state.ptr;
2560
2561 }
2562
2563 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002564 if (i < state.endpos) {
2565 item = PySequence_GetSlice(string, i, state.endpos);
2566 if (!item)
2567 goto error;
2568 status = PyList_Append(list, item);
2569 Py_DECREF(item);
2570 if (status < 0)
2571 goto error;
2572 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002573
2574 state_fini(&state);
2575
Guido van Rossum4e173842001-12-07 04:25:10 +00002576 Py_DECREF(filter);
2577
Fredrik Lundhdac58492001-10-21 21:48:30 +00002578 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002579 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002580
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002581 if (!item)
2582 return NULL;
2583
2584 if (subn)
2585 return Py_BuildValue("Ni", item, n);
2586
2587 return item;
2588
2589error:
2590 Py_DECREF(list);
2591 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002592 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002593 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002594
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002595}
2596
2597static PyObject*
2598pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2599{
2600 PyObject* template;
2601 PyObject* string;
2602 int count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002603 static char* kwlist[] = { "repl", "string", "count", NULL };
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002604 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2605 &template, &string, &count))
2606 return NULL;
2607
2608 return pattern_subx(self, template, string, count, 0);
2609}
2610
2611static PyObject*
2612pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2613{
2614 PyObject* template;
2615 PyObject* string;
2616 int count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002617 static char* kwlist[] = { "repl", "string", "count", NULL };
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002618 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2619 &template, &string, &count))
2620 return NULL;
2621
2622 return pattern_subx(self, template, string, count, 1);
2623}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002624
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002625static PyObject*
2626pattern_copy(PatternObject* self, PyObject* args)
2627{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002628#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002629 PatternObject* copy;
2630 int offset;
2631
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002632 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2633 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002634
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002635 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2636 if (!copy)
2637 return NULL;
2638
2639 offset = offsetof(PatternObject, groups);
2640
2641 Py_XINCREF(self->groupindex);
2642 Py_XINCREF(self->indexgroup);
2643 Py_XINCREF(self->pattern);
2644
2645 memcpy((char*) copy + offset, (char*) self + offset,
2646 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002647 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002648
2649 return (PyObject*) copy;
2650#else
2651 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2652 return NULL;
2653#endif
2654}
2655
2656static PyObject*
2657pattern_deepcopy(PatternObject* self, PyObject* args)
2658{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002659#ifdef USE_BUILTIN_COPY
2660 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002661
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002662 PyObject* memo;
2663 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2664 return NULL;
2665
2666 copy = (PatternObject*) pattern_copy(self, Py_None);
2667 if (!copy)
2668 return NULL;
2669
2670 if (!deepcopy(&copy->groupindex, memo) ||
2671 !deepcopy(&copy->indexgroup, memo) ||
2672 !deepcopy(&copy->pattern, memo)) {
2673 Py_DECREF(copy);
2674 return NULL;
2675 }
2676
2677#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002678 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2679 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002680#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002681}
2682
Raymond Hettinger94478742004-09-24 04:31:19 +00002683PyDoc_STRVAR(pattern_match_doc,
2684"match(string[, pos[, endpos]]) --> match object or None.\n\
2685 Matches zero or more characters at the beginning of the string");
2686
2687PyDoc_STRVAR(pattern_search_doc,
2688"search(string[, pos[, endpos]]) --> match object or None.\n\
2689 Scan through string looking for a match, and return a corresponding\n\
2690 MatchObject instance. Return None if no position in the string matches.");
2691
2692PyDoc_STRVAR(pattern_split_doc,
2693"split(string[, maxsplit = 0]) --> list.\n\
2694 Split string by the occurrences of pattern.");
2695
2696PyDoc_STRVAR(pattern_findall_doc,
2697"findall(string[, pos[, endpos]]) --> list.\n\
2698 Return a list of all non-overlapping matches of pattern in string.");
2699
2700PyDoc_STRVAR(pattern_finditer_doc,
2701"finditer(string[, pos[, endpos]]) --> iterator.\n\
2702 Return an iterator over all non-overlapping matches for the \n\
2703 RE pattern in string. For each match, the iterator returns a\n\
2704 match object.");
2705
2706PyDoc_STRVAR(pattern_sub_doc,
2707"sub(repl, string[, count = 0]) --> newstring\n\
2708 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002709 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002710
2711PyDoc_STRVAR(pattern_subn_doc,
2712"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2713 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2714 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002715 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002716
2717PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2718
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002719static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002720 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002721 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002722 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002723 pattern_search_doc},
2724 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2725 pattern_sub_doc},
2726 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2727 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002728 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002729 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002730 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002731 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002732#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002733 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2734 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002735#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002736 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002737 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2738 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002739 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002740};
2741
Tim Peters3d563502006-01-21 02:47:53 +00002742static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002743pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002744{
2745 PyObject* res;
2746
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002747 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002748
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002749 if (res)
2750 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002751
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002752 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002753
2754 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002755 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002756 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002757 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002758 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002759
2760 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002761 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002762
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002763 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002764 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002765
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002766 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002767 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002768 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002769 }
2770
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002771 PyErr_SetString(PyExc_AttributeError, name);
2772 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002773}
2774
2775statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002776 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002777 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002778 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002779 (destructor)pattern_dealloc, /*tp_dealloc*/
2780 0, /*tp_print*/
Raymond Hettinger027bb632004-05-31 03:09:25 +00002781 (getattrfunc)pattern_getattr, /*tp_getattr*/
2782 0, /* tp_setattr */
2783 0, /* tp_compare */
2784 0, /* tp_repr */
2785 0, /* tp_as_number */
2786 0, /* tp_as_sequence */
2787 0, /* tp_as_mapping */
2788 0, /* tp_hash */
2789 0, /* tp_call */
2790 0, /* tp_str */
2791 0, /* tp_getattro */
2792 0, /* tp_setattro */
2793 0, /* tp_as_buffer */
2794 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002795 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002796 0, /* tp_traverse */
2797 0, /* tp_clear */
2798 0, /* tp_richcompare */
2799 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002800};
2801
2802/* -------------------------------------------------------------------- */
2803/* match methods */
2804
2805static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002806match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002807{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002808 Py_XDECREF(self->regs);
2809 Py_XDECREF(self->string);
2810 Py_DECREF(self->pattern);
2811 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002812}
2813
2814static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002815match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002816{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002817 if (index < 0 || index >= self->groups) {
2818 /* raise IndexError if we were given a bad group number */
2819 PyErr_SetString(
2820 PyExc_IndexError,
2821 "no such group"
2822 );
2823 return NULL;
2824 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002825
Fredrik Lundh6f013982000-07-03 18:44:21 +00002826 index *= 2;
2827
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002828 if (self->string == Py_None || self->mark[index] < 0) {
2829 /* return default value if the string or group is undefined */
2830 Py_INCREF(def);
2831 return def;
2832 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002833
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002834 return PySequence_GetSlice(
2835 self->string, self->mark[index], self->mark[index+1]
2836 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002837}
2838
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002839static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002840match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002841{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002842 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002843
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002844 if (PyInt_Check(index))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002845 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002846
Fredrik Lundh6f013982000-07-03 18:44:21 +00002847 i = -1;
2848
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002849 if (self->pattern->groupindex) {
2850 index = PyObject_GetItem(self->pattern->groupindex, index);
2851 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002852 if (PyInt_Check(index))
2853 i = (int) PyInt_AS_LONG(index);
2854 Py_DECREF(index);
2855 } else
2856 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002857 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002858
2859 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002860}
2861
2862static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002863match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002864{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002865 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002866}
2867
2868static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002869match_expand(MatchObject* self, PyObject* args)
2870{
2871 PyObject* template;
2872 if (!PyArg_ParseTuple(args, "O:expand", &template))
2873 return NULL;
2874
2875 /* delegate to Python code */
2876 return call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00002877 SRE_PY_MODULE, "_expand",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002878 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002879 );
2880}
2881
2882static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002883match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002884{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002885 PyObject* result;
2886 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002887
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002888 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002889
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002890 switch (size) {
2891 case 0:
2892 result = match_getslice(self, Py_False, Py_None);
2893 break;
2894 case 1:
2895 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2896 break;
2897 default:
2898 /* fetch multiple items */
2899 result = PyTuple_New(size);
2900 if (!result)
2901 return NULL;
2902 for (i = 0; i < size; i++) {
2903 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002904 self, PyTuple_GET_ITEM(args, i), Py_None
2905 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002906 if (!item) {
2907 Py_DECREF(result);
2908 return NULL;
2909 }
2910 PyTuple_SET_ITEM(result, i, item);
2911 }
2912 break;
2913 }
2914 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002915}
2916
2917static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002918match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002919{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002920 PyObject* result;
2921 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002922
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002923 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002924 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002925 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002926 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002927
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002928 result = PyTuple_New(self->groups-1);
2929 if (!result)
2930 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002931
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002932 for (index = 1; index < self->groups; index++) {
2933 PyObject* item;
2934 item = match_getslice_by_index(self, index, def);
2935 if (!item) {
2936 Py_DECREF(result);
2937 return NULL;
2938 }
2939 PyTuple_SET_ITEM(result, index-1, item);
2940 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002941
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002942 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002943}
2944
2945static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002946match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002947{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002948 PyObject* result;
2949 PyObject* keys;
2950 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002951
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002952 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002953 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002954 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002955 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002956
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002957 result = PyDict_New();
2958 if (!result || !self->pattern->groupindex)
2959 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002960
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002961 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002962 if (!keys)
2963 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002964
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002965 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002966 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002967 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002968 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002969 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002970 if (!key)
2971 goto failed;
2972 value = match_getslice(self, key, def);
2973 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002974 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002975 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002976 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002977 status = PyDict_SetItem(result, key, value);
2978 Py_DECREF(value);
2979 if (status < 0)
2980 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002981 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002982
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002983 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002984
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002985 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002986
2987failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00002988 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002989 Py_DECREF(result);
2990 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002991}
2992
2993static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002994match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002995{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002996 int index;
2997
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002998 PyObject* index_ = Py_False; /* zero */
2999 if (!PyArg_ParseTuple(args, "|O:start", &index_))
3000 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003001
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003002 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003003
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003004 if (index < 0 || index >= self->groups) {
3005 PyErr_SetString(
3006 PyExc_IndexError,
3007 "no such group"
3008 );
3009 return NULL;
3010 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003011
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003012 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003013 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003014}
3015
3016static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003017match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003018{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003019 int index;
3020
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003021 PyObject* index_ = Py_False; /* zero */
3022 if (!PyArg_ParseTuple(args, "|O:end", &index_))
3023 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003024
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003025 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003026
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003027 if (index < 0 || index >= self->groups) {
3028 PyErr_SetString(
3029 PyExc_IndexError,
3030 "no such group"
3031 );
3032 return NULL;
3033 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003034
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003035 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003036 return Py_BuildValue("i", self->mark[index*2+1]);
3037}
3038
3039LOCAL(PyObject*)
3040_pair(int i1, int i2)
3041{
3042 PyObject* pair;
3043 PyObject* item;
3044
3045 pair = PyTuple_New(2);
3046 if (!pair)
3047 return NULL;
3048
3049 item = PyInt_FromLong(i1);
3050 if (!item)
3051 goto error;
3052 PyTuple_SET_ITEM(pair, 0, item);
3053
3054 item = PyInt_FromLong(i2);
3055 if (!item)
3056 goto error;
3057 PyTuple_SET_ITEM(pair, 1, item);
3058
3059 return pair;
3060
3061 error:
3062 Py_DECREF(pair);
3063 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003064}
3065
3066static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003067match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003068{
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003069 int index;
3070
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003071 PyObject* index_ = Py_False; /* zero */
3072 if (!PyArg_ParseTuple(args, "|O:span", &index_))
3073 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003074
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003075 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003076
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003077 if (index < 0 || index >= self->groups) {
3078 PyErr_SetString(
3079 PyExc_IndexError,
3080 "no such group"
3081 );
3082 return NULL;
3083 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003084
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003085 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003086 return _pair(self->mark[index*2], self->mark[index*2+1]);
3087}
3088
3089static PyObject*
3090match_regs(MatchObject* self)
3091{
3092 PyObject* regs;
3093 PyObject* item;
3094 int index;
3095
3096 regs = PyTuple_New(self->groups);
3097 if (!regs)
3098 return NULL;
3099
3100 for (index = 0; index < self->groups; index++) {
3101 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3102 if (!item) {
3103 Py_DECREF(regs);
3104 return NULL;
3105 }
3106 PyTuple_SET_ITEM(regs, index, item);
3107 }
3108
3109 Py_INCREF(regs);
3110 self->regs = regs;
3111
3112 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003113}
3114
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003115static PyObject*
3116match_copy(MatchObject* self, PyObject* args)
3117{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003118#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003119 MatchObject* copy;
3120 int slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003121
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003122 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
3123 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003124
3125 slots = 2 * (self->pattern->groups+1);
3126
3127 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3128 if (!copy)
3129 return NULL;
3130
3131 /* this value a constant, but any compiler should be able to
3132 figure that out all by itself */
3133 offset = offsetof(MatchObject, string);
3134
3135 Py_XINCREF(self->pattern);
3136 Py_XINCREF(self->string);
3137 Py_XINCREF(self->regs);
3138
3139 memcpy((char*) copy + offset, (char*) self + offset,
3140 sizeof(MatchObject) + slots * sizeof(int) - offset);
3141
3142 return (PyObject*) copy;
3143#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003144 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003145 return NULL;
3146#endif
3147}
3148
3149static PyObject*
3150match_deepcopy(MatchObject* self, PyObject* args)
3151{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003152#ifdef USE_BUILTIN_COPY
3153 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003154
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003155 PyObject* memo;
3156 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
3157 return NULL;
3158
3159 copy = (MatchObject*) match_copy(self, Py_None);
3160 if (!copy)
3161 return NULL;
3162
3163 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3164 !deepcopy(&copy->string, memo) ||
3165 !deepcopy(&copy->regs, memo)) {
3166 Py_DECREF(copy);
3167 return NULL;
3168 }
3169
3170#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003171 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3172 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003173#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003174}
3175
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003176static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003177 {"group", (PyCFunction) match_group, METH_VARARGS},
3178 {"start", (PyCFunction) match_start, METH_VARARGS},
3179 {"end", (PyCFunction) match_end, METH_VARARGS},
3180 {"span", (PyCFunction) match_span, METH_VARARGS},
3181 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3182 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3183 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003184 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
3185 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003186 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003187};
3188
Tim Peters3d563502006-01-21 02:47:53 +00003189static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003190match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003191{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003192 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003193
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003194 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3195 if (res)
3196 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003197
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003198 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003199
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003200 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003201 if (self->lastindex >= 0)
3202 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003203 Py_INCREF(Py_None);
3204 return Py_None;
3205 }
3206
3207 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003208 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003209 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003210 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003211 );
3212 if (result)
3213 return result;
3214 PyErr_Clear();
3215 }
3216 Py_INCREF(Py_None);
3217 return Py_None;
3218 }
3219
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003220 if (!strcmp(name, "string")) {
3221 if (self->string) {
3222 Py_INCREF(self->string);
3223 return self->string;
3224 } else {
3225 Py_INCREF(Py_None);
3226 return Py_None;
3227 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003228 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003229
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003230 if (!strcmp(name, "regs")) {
3231 if (self->regs) {
3232 Py_INCREF(self->regs);
3233 return self->regs;
3234 } else
3235 return match_regs(self);
3236 }
3237
3238 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003239 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003240 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003241 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003242
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003243 if (!strcmp(name, "pos"))
3244 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003245
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003246 if (!strcmp(name, "endpos"))
3247 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003248
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003249 PyErr_SetString(PyExc_AttributeError, name);
3250 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003251}
3252
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3255
3256statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003257 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003258 0, "_" SRE_MODULE ".SRE_Match",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003259 sizeof(MatchObject), sizeof(int),
3260 (destructor)match_dealloc, /*tp_dealloc*/
3261 0, /*tp_print*/
3262 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003263};
3264
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003265/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003266/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003267
3268static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003269scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003270{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003271 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003272 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003273 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003274}
3275
3276static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003277scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003278{
3279 SRE_STATE* state = &self->state;
3280 PyObject* match;
3281 int status;
3282
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003283 state_reset(state);
3284
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003285 state->ptr = state->start;
3286
3287 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003288 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003289 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003290#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003291 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003292#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003293 }
3294
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003295 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003296 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003297
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003298 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003299 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003300 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003301 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003302
3303 return match;
3304}
3305
3306
3307static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003308scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003309{
3310 SRE_STATE* state = &self->state;
3311 PyObject* match;
3312 int status;
3313
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003314 state_reset(state);
3315
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003316 state->ptr = state->start;
3317
3318 if (state->charsize == 1) {
3319 status = sre_search(state, PatternObject_GetCode(self->pattern));
3320 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003321#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003322 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003323#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003324 }
3325
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003326 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003327 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003328
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003329 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003330 state->start = (void*) ((char*) state->ptr + state->charsize);
3331 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003332 state->start = state->ptr;
3333
3334 return match;
3335}
3336
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003337static PyMethodDef scanner_methods[] = {
Neal Norwitzbb2769f2002-03-31 15:46:00 +00003338 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3339 /* METH_OLDARGS is not in Python 1.5.2 */
3340 {"match", (PyCFunction) scanner_match, 0},
3341 {"search", (PyCFunction) scanner_search, 0},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003342 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003343};
3344
Tim Peters3d563502006-01-21 02:47:53 +00003345static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003346scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003347{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003348 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003349
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003350 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3351 if (res)
3352 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003353
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003354 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003355
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003356 /* attributes */
3357 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003358 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003359 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003360 }
3361
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003362 PyErr_SetString(PyExc_AttributeError, name);
3363 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003364}
3365
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003366statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003367 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003368 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003369 sizeof(ScannerObject), 0,
3370 (destructor)scanner_dealloc, /*tp_dealloc*/
3371 0, /*tp_print*/
3372 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003373};
3374
Guido van Rossumb700df92000-03-31 14:59:30 +00003375static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003376 {"compile", _compile, METH_VARARGS},
3377 {"getcodesize", sre_codesize, METH_VARARGS},
3378 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003379 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003380};
3381
Tim Peters3d563502006-01-21 02:47:53 +00003382#if PY_VERSION_HEX < 0x02030000
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003383DL_EXPORT(void) init_sre(void)
3384#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003385PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003386#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003387{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003388 PyObject* m;
3389 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003390 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003391
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003392 /* Patch object types */
3393 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003394 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00003395
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003396 m = Py_InitModule("_" SRE_MODULE, _functions);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003397 if (m == NULL)
3398 return;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003399 d = PyModule_GetDict(m);
3400
Fredrik Lundh21009b92001-09-18 18:47:09 +00003401 x = PyInt_FromLong(SRE_MAGIC);
3402 if (x) {
3403 PyDict_SetItemString(d, "MAGIC", x);
3404 Py_DECREF(x);
3405 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003406
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003407 x = PyInt_FromLong(sizeof(SRE_CODE));
3408 if (x) {
3409 PyDict_SetItemString(d, "CODESIZE", x);
3410 Py_DECREF(x);
3411 }
3412
Fredrik Lundh21009b92001-09-18 18:47:09 +00003413 x = PyString_FromString(copyright);
3414 if (x) {
3415 PyDict_SetItemString(d, "copyright", x);
3416 Py_DECREF(x);
3417 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003418}
3419
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003420#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003421
3422/* vim:ts=4:sw=4:et
3423*/