blob: 4be33d04ca8bc0dc20e657737e5201d4f3788450 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
/* version/copyright banner for this build of the engine */
static char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
42#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000043#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000044
45#include "sre.h"
46
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000047#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000048
/* name of this module, minus the leading underscore */
#if !defined(SRE_MODULE)
#define SRE_MODULE "sre"
#endif

/* defining this one enables tracing */
#undef VERBOSE

#if PY_VERSION_HEX >= 0x01060000
#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
/* defining this enables unicode support (default under 1.6a1 and later) */
#define HAVE_UNICODE
#endif
#endif

/* -------------------------------------------------------------------- */
/* optional features */

/* enables fast searching */
#define USE_FAST_SEARCH

/* enables aggressive inlining (always on for Visual C) */
#undef USE_INLINE

/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY

#if PY_VERSION_HEX < 0x01060000
#define PyObject_DEL(op) PyMem_DEL((op))
#endif

/* -------------------------------------------------------------------- */

/* LOCAL(type) declares a file-local function returning `type`, using the
   cheapest calling convention / inlining hint the compiler offers. */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif

/* error codes (parenthesized so they stay intact inside larger
   expressions) */
#define SRE_ERROR_ILLEGAL (-1) /* illegal opcode */
#define SRE_ERROR_STATE (-2) /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT (-3) /* runaway recursion */
#define SRE_ERROR_MEMORY (-9) /* out of memory */

/* TRACE takes a fully parenthesized printf argument list, e.g.
   TRACE(("x=%d\n", x)); it expands to nothing unless VERBOSE is defined */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
104
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000105/* -------------------------------------------------------------------- */
106/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000107
/* default character predicates (run sre_chars.py to regenerate tables) */

/* bit flags stored per character in sre_char_info[] */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII. create tables in init_sre() instead */

/* one entry per 7-bit character code, 16 entries per row:
   25 = digit|alnum|word, 24 = alnum|word, 16 = word only ('_'),
   6 = space|linebreak ('\n'), 2 = space */
static char sre_char_info[128] = {
    /*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  32 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  48 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /*  64 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /*  96 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};
125
/* ASCII lowercase mapping, 16 entries per row: identity everywhere
   except 'A'..'Z' (65..90), which map to 'a'..'z' (97..122) */
static char sre_char_lower[128] = {
    /*   0 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /*  16 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /*  32 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /*  48 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /*  64 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
              109, 110, 111,
    /*  80 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
              93, 94, 95,
    /*  96 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
              109, 110, 111,
    /* 112 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
              124, 125, 126, 127
};
135
/* ASCII-only predicates: look the character up in sre_char_info[] and test
   one flag bit; anything >= 128 is never a member.  The result is zero or
   the (nonzero) mask value, not necessarily 1.  Note that `ch` is
   evaluated twice. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000146
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000147static unsigned int sre_lower(unsigned int ch)
148{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000149 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000150}
151
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
/* characters outside 0..255 are never passed to the <ctype.h> functions
   and are reported as non-members */
#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
160
/* Lowercase `ch` with the C library's tolower() (current locale); codes
   >= 256 are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256)
        return (unsigned int)tolower((ch));
    return ch;
}
165
/* unicode-specific character predicates */

#if defined(HAVE_UNICODE)

/* thin wrappers around the Py_UNICODE_* classification macros */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

/* Lowercase `ch` via Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}

#endif
182
Guido van Rossumb700df92000-03-31 14:59:30 +0000183LOCAL(int)
184sre_category(SRE_CODE category, unsigned int ch)
185{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000186 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000187
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000188 case SRE_CATEGORY_DIGIT:
189 return SRE_IS_DIGIT(ch);
190 case SRE_CATEGORY_NOT_DIGIT:
191 return !SRE_IS_DIGIT(ch);
192 case SRE_CATEGORY_SPACE:
193 return SRE_IS_SPACE(ch);
194 case SRE_CATEGORY_NOT_SPACE:
195 return !SRE_IS_SPACE(ch);
196 case SRE_CATEGORY_WORD:
197 return SRE_IS_WORD(ch);
198 case SRE_CATEGORY_NOT_WORD:
199 return !SRE_IS_WORD(ch);
200 case SRE_CATEGORY_LINEBREAK:
201 return SRE_IS_LINEBREAK(ch);
202 case SRE_CATEGORY_NOT_LINEBREAK:
203 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000204
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000205 case SRE_CATEGORY_LOC_WORD:
206 return SRE_LOC_IS_WORD(ch);
207 case SRE_CATEGORY_LOC_NOT_WORD:
208 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000209
210#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000211 case SRE_CATEGORY_UNI_DIGIT:
212 return SRE_UNI_IS_DIGIT(ch);
213 case SRE_CATEGORY_UNI_NOT_DIGIT:
214 return !SRE_UNI_IS_DIGIT(ch);
215 case SRE_CATEGORY_UNI_SPACE:
216 return SRE_UNI_IS_SPACE(ch);
217 case SRE_CATEGORY_UNI_NOT_SPACE:
218 return !SRE_UNI_IS_SPACE(ch);
219 case SRE_CATEGORY_UNI_WORD:
220 return SRE_UNI_IS_WORD(ch);
221 case SRE_CATEGORY_UNI_NOT_WORD:
222 return !SRE_UNI_IS_WORD(ch);
223 case SRE_CATEGORY_UNI_LINEBREAK:
224 return SRE_UNI_IS_LINEBREAK(ch);
225 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
226 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000227#else
228 case SRE_CATEGORY_UNI_DIGIT:
229 return SRE_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_NOT_DIGIT:
231 return !SRE_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_SPACE:
233 return SRE_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_NOT_SPACE:
235 return !SRE_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_WORD:
237 return SRE_LOC_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_NOT_WORD:
239 return !SRE_LOC_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_LINEBREAK:
241 return SRE_IS_LINEBREAK(ch);
242 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
243 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000244#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000245 }
246 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000247}
248
249/* helpers */
250
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000251static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000253{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000254 if (state->data_stack) {
255 free(state->data_stack);
256 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000257 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000258 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000259}
260
261static int
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000262data_stack_grow(SRE_STATE* state, int size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000263{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000264 int minsize, cursize;
265 minsize = state->data_stack_base+size;
266 cursize = state->data_stack_size;
267 if (cursize < minsize) {
268 void* stack;
269 cursize = minsize+minsize/4+1024;
270 TRACE(("allocate/grow stack %d\n", cursize));
271 stack = realloc(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000273 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000274 return SRE_ERROR_MEMORY;
275 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000276 state->data_stack = stack;
277 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000278 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000279 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000280}
281
/* The matching engine below is written once against the SRE_* names and
   compiled once per character type: the macros select the character type
   and the names of the generated functions. */

/* generate 8-bit version */

#define SRE_CHAR unsigned char
#define SRE_AT sre_at
#define SRE_COUNT sre_count
#define SRE_CHARSET sre_charset
#define SRE_INFO sre_info
#define SRE_MATCH sre_match
#define SRE_MATCH_CONTEXT sre_match_context
#define SRE_SEARCH sre_search
#define SRE_LITERAL_TEMPLATE sre_literal_template

#if defined(HAVE_UNICODE)

/* pull in the engine section of this file with the 8-bit settings;
   SRE_RECURSIVE makes the nested inclusion skip everything above */
#define SRE_RECURSIVE
#include "_sre.c"
#undef SRE_RECURSIVE

#undef SRE_LITERAL_TEMPLATE
#undef SRE_SEARCH
#undef SRE_MATCH
#undef SRE_MATCH_CONTEXT
#undef SRE_INFO
#undef SRE_CHARSET
#undef SRE_COUNT
#undef SRE_AT
#undef SRE_CHAR

/* generate unicode (Py_UNICODE) version */

#define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat
#define SRE_COUNT sre_ucount
#define SRE_CHARSET sre_ucharset
#define SRE_INFO sre_uinfo
#define SRE_MATCH sre_umatch
#define SRE_MATCH_CONTEXT sre_umatch_context
#define SRE_SEARCH sre_usearch
#define SRE_LITERAL_TEMPLATE sre_uliteral_template
#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000322
323#endif /* SRE_RECURSIVE */
324
325/* -------------------------------------------------------------------- */
326/* String matching engine */
327
328/* the following section is compiled twice, with different character
329 settings */
330
331LOCAL(int)
332SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
333{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000334 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000336 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000338 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000341 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 case SRE_AT_BEGINNING_LINE:
345 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000346 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000348 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000349 return (((void*) (ptr+1) == state->end &&
350 SRE_IS_LINEBREAK((int) ptr[0])) ||
351 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000353 case SRE_AT_END_LINE:
354 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000355 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000356
Fredrik Lundh770617b2001-01-14 15:06:11 +0000357 case SRE_AT_END_STRING:
358 return ((void*) ptr == state->end);
359
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000360 case SRE_AT_BOUNDARY:
361 if (state->beginning == state->end)
362 return 0;
363 that = ((void*) ptr > state->beginning) ?
364 SRE_IS_WORD((int) ptr[-1]) : 0;
365 this = ((void*) ptr < state->end) ?
366 SRE_IS_WORD((int) ptr[0]) : 0;
367 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000368
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000369 case SRE_AT_NON_BOUNDARY:
370 if (state->beginning == state->end)
371 return 0;
372 that = ((void*) ptr > state->beginning) ?
373 SRE_IS_WORD((int) ptr[-1]) : 0;
374 this = ((void*) ptr < state->end) ?
375 SRE_IS_WORD((int) ptr[0]) : 0;
376 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000377
378 case SRE_AT_LOC_BOUNDARY:
379 if (state->beginning == state->end)
380 return 0;
381 that = ((void*) ptr > state->beginning) ?
382 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
383 this = ((void*) ptr < state->end) ?
384 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
385 return this != that;
386
387 case SRE_AT_LOC_NON_BOUNDARY:
388 if (state->beginning == state->end)
389 return 0;
390 that = ((void*) ptr > state->beginning) ?
391 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
392 this = ((void*) ptr < state->end) ?
393 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
394 return this == that;
395
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000396#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000397 case SRE_AT_UNI_BOUNDARY:
398 if (state->beginning == state->end)
399 return 0;
400 that = ((void*) ptr > state->beginning) ?
401 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
402 this = ((void*) ptr < state->end) ?
403 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
404 return this != that;
405
406 case SRE_AT_UNI_NON_BOUNDARY:
407 if (state->beginning == state->end)
408 return 0;
409 that = ((void*) ptr > state->beginning) ?
410 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
411 this = ((void*) ptr < state->end) ?
412 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
413 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000414#endif
415
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000416 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000417
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000418 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000419}
420
421LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000422SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000423{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000424 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000425
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000427
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000428 for (;;) {
429 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000430
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000431 case SRE_OP_FAILURE:
432 return !ok;
433
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000434 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000435 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000436 if (ch == set[0])
437 return ok;
438 set++;
439 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000440
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000441 case SRE_OP_CATEGORY:
442 /* <CATEGORY> <code> */
443 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000444 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000445 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000446 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000447
Fredrik Lundh3562f112000-07-02 12:00:07 +0000448 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000449 if (sizeof(SRE_CODE) == 2) {
450 /* <CHARSET> <bitmap> (16 bits per code word) */
451 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
452 return ok;
453 set += 16;
454 }
455 else {
456 /* <CHARSET> <bitmap> (32 bits per code word) */
457 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
458 return ok;
459 set += 8;
460 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000461 break;
462
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000463 case SRE_OP_RANGE:
464 /* <RANGE> <lower> <upper> */
465 if (set[0] <= ch && ch <= set[1])
466 return ok;
467 set += 2;
468 break;
469
470 case SRE_OP_NEGATE:
471 ok = !ok;
472 break;
473
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000474 case SRE_OP_BIGCHARSET:
475 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
476 {
477 int count, block;
478 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000479
480 if (sizeof(SRE_CODE) == 2) {
481 block = ((unsigned char*)set)[ch >> 8];
482 set += 128;
483 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
484 return ok;
485 set += count*16;
486 }
487 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000488 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
489 * warnings when c's type supports only numbers < N+1 */
490 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000491 block = ((unsigned char*)set)[ch >> 8];
492 else
493 block = -1;
494 set += 64;
495 if (block >=0 &&
496 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
497 return ok;
498 set += count*8;
499 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000500 break;
501 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000502
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000503 default:
504 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000505 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000506 return 0;
507 }
508 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000509}
510
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000511LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000512
513LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000514SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000515{
516 SRE_CODE chr;
517 SRE_CHAR* ptr = state->ptr;
518 SRE_CHAR* end = state->end;
519 int i;
520
521 /* adjust end */
522 if (maxcount < end - ptr && maxcount != 65535)
523 end = ptr + maxcount;
524
525 switch (pattern[0]) {
526
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000527 case SRE_OP_IN:
528 /* repeated set */
529 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
530 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
531 ptr++;
532 break;
533
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000534 case SRE_OP_ANY:
535 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000536 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000537 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
538 ptr++;
539 break;
540
541 case SRE_OP_ANY_ALL:
542 /* repeated dot wildcare. skip to the end of the target
543 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000544 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000545 ptr = end;
546 break;
547
548 case SRE_OP_LITERAL:
549 /* repeated literal */
550 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000551 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000552 while (ptr < end && (SRE_CODE) *ptr == chr)
553 ptr++;
554 break;
555
556 case SRE_OP_LITERAL_IGNORE:
557 /* repeated literal */
558 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000559 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000560 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
561 ptr++;
562 break;
563
564 case SRE_OP_NOT_LITERAL:
565 /* repeated non-literal */
566 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000567 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000568 while (ptr < end && (SRE_CODE) *ptr != chr)
569 ptr++;
570 break;
571
572 case SRE_OP_NOT_LITERAL_IGNORE:
573 /* repeated non-literal */
574 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000575 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000576 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
577 ptr++;
578 break;
579
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000580 default:
581 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000582 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000583 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000584 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 if (i < 0)
586 return i;
587 if (!i)
588 break;
589 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000590 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
591 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000592 return (SRE_CHAR*) state->ptr - ptr;
593 }
594
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000595 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000596 return ptr - (SRE_CHAR*) state->ptr;
597}
598
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    SRE_CHAR* end = state->end;
    SRE_CHAR* ptr = state->ptr;
    int i;

    /* check minimal length */
    if (pattern[3] && (end - ptr) < pattern[3])
        return 0;

    /* check known prefix */
    if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
        /* <length> <skip> <prefix data> <overlap data> */
        for (i = 0; i < pattern[5]; i++)
            if ((SRE_CODE) ptr[i] != pattern[7 + i])
                return 0;
        return pattern[0] + 2 * pattern[6];
    }
    return pattern[0];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000626
/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (IOW, those
 * that will backtrack). Explaining:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (besides atypical cases like ASSERT_NOT), therefore there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: If the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark is pointing to.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 */

/* Both macros expect `state` and `ctx` pointers to be in scope at the
   point of use: SAVE copies lastmark/lastindex from state into ctx,
   RESTORE copies them back. */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastmark = state->lastmark; \
        ctx->lastindex = state->lastindex; \
    } while (0)
#define LASTMARK_RESTORE() \
    do { \
        state->lastmark = ctx->lastmark; \
        state->lastindex = ctx->lastindex; \
    } while (0)
665
/* Control-flow helpers.  RETURN_FAILURE / RETURN_SUCCESS need a local
 * `ret` variable and an `exit:` label in the enclosing function;
 * RETURN_ERROR leaves the function directly, without passing through
 * that label.  Every use of the parameter is parenthesized so that an
 * expression argument (e.g. `a ? 0 : 1`) is compared as a whole. */

/* Return i from the enclosing function immediately. */
#define RETURN_ERROR(i) do { return (i); } while(0)
/* Set ret to 0 and jump to the exit label. */
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
/* Set ret to 1 and jump to the exit label. */
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

/* i < 0: return i from the enclosing function; otherwise fall through. */
#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
/* i < 0: return i;  i > 0: RETURN_SUCCESS;  i == 0: fall through. */
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
/* i < 0: return i;  i == 0: RETURN_FAILURE;  i > 0: fall through. */
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
676
/* Stringify the argument exactly as written (no macro expansion of the
 * argument takes place); used to print type names in TRACE output. */
#define SFY(token) #token
678
/* Reserve sizeof(type) bytes on top of the data stack of `state` and
 * make `ptr` point at them.
 *
 * Names taken from the expansion site: alloc_pos (receives the offset of
 * the new slot), ctx and ctx_pos (current match context and its offset;
 * ctx_pos == -1 means there is no context yet).
 *
 * If the stack is too small, data_stack_grow() is called; a negative
 * result is returned from the enclosing function.  After growing, ctx is
 * looked up again from ctx_pos (presumably because growing may move the
 * buffer -- confirm against data_stack_grow()).
 *
 * Macro parameters are parenthesized, and the TRACE arguments are cast
 * to int so that they match the %d conversions. */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), (int) alloc_pos, (int) sizeof(type))); \
    if ((state)->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    (ptr) = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)
693
/* Point `ptr` at the object of the given type stored at byte offset
 * `pos` in the data stack of `state`.  Nothing is allocated or copied.
 * Parameters are parenthesized so that expression arguments (such as a
 * conditional expression for pos) are applied as a whole. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), (int) (pos))); \
    (ptr) = (type*)((state)->data_stack+(pos)); \
} while (0)
699
/* Copy `size` bytes starting at `data` onto the top of the data stack of
 * `state` and advance data_stack_base by `size`.
 *
 * Names taken from the expansion site: ctx and ctx_pos (ctx_pos == -1
 * means there is no context yet).  If the stack is too small,
 * data_stack_grow() is called; a negative result is returned from the
 * enclosing function, and on success ctx is looked up again from
 * ctx_pos.  `data` is expanded textually in the memcpy() that follows
 * that lookup, so an argument written in terms of ctx is evaluated
 * against the refreshed pointer.
 *
 * Macro parameters are parenthesized, and the TRACE arguments are cast
 * to the types their conversions expect. */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           (void*) (data), (int) (state)->data_stack_base, \
           (int) (size))); \
    if ((state)->data_stack_size < (state)->data_stack_base+(size)) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)
713
/* Copy the topmost `size` bytes of the data stack of `state` into
 * `data`.  When `discard` is non-zero the bytes are also removed from
 * the stack (data_stack_base is lowered by `size`); otherwise the stack
 * is left unchanged.
 *
 * `size` is parenthesized wherever it is subtracted, so an argument such
 * as `a + b` yields the offset base-(a+b) rather than base-a+b.  The
 * TRACE arguments are cast to the types their conversions expect. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           (void*) (data), (int) ((state)->data_stack_base-(size)), \
           (int) (size))); \
    memcpy((data), \
           (state)->data_stack+(state)->data_stack_base-(size), (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)
722
/* Drop the topmost `size` bytes from the data stack of `state` without
 * copying them anywhere.  Parameters are parenthesized, and the TRACE
 * arguments are cast to int so that they match the %d conversions. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           (int) ((state)->data_stack_base-(size)), (int) (size))); \
    (state)->data_stack_base -= (size); \
} while(0)
729
/* Shorthands for the DATA_STACK_* macros that always operate on the
 * `state` variable in scope at the point of use. */

/* Push a copy of the object that `item` points to. */
#define DATA_PUSH(item) \
    DATA_STACK_PUSH(state, (item), sizeof(*(item)))
/* Pop the top of the stack into the object that `item` points to. */
#define DATA_POP(item) \
    DATA_STACK_POP(state, (item), sizeof(*(item)), 1)
/* Drop as many bytes as the object `item` points to occupies. */
#define DATA_POP_DISCARD(item) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(item)))
/* Reserve room for one object of type `type`; `dest` receives its address. */
#define DATA_ALLOC(type,dest) \
    DATA_STACK_ALLOC(state, type, dest)
/* Make `dest` point at the object of type `type` stored at offset `off`. */
#define DATA_LOOKUP_AT(type,dest,off) \
    DATA_STACK_LOOKUP_AT(state,type,dest,off)
740
/* Save / restore the first lastmark+1 entries of state->mark on the data
 * stack.  All four macros do nothing when lastmark <= 0.  They expect
 * `state` to be in scope at the point of use; MARK_PUSH additionally
 * assigns to a local `i`.
 *
 * The `lastmark` parameter is parenthesized at every use so that an
 * expression argument is compared and sized as a whole, and the body is
 * wrapped in braces inside the do/while. */

/* Push state->mark[0..lastmark] onto the data stack. */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
/* Copy the saved marks back into state->mark and remove them. */
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
/* Copy the saved marks back into state->mark but leave them on the stack. */
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
/* Remove the saved marks from the stack without touching state->mark. */
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
758
/* Resume-point codes kept in the `jump` field of a match context.
 * DO_JUMP() stores one of these in the context it creates; when that
 * context finishes, the code after the `exit:` label in SRE_MATCH()
 * switches on the stored value to continue at the matching jump_* label.
 * JUMP_NONE is what the first context of a SRE_MATCH() call is given. */
#define JUMP_NONE 0
#define JUMP_MAX_UNTIL_1 1
#define JUMP_MAX_UNTIL_2 2
#define JUMP_MAX_UNTIL_3 3
#define JUMP_MIN_UNTIL_1 4
#define JUMP_MIN_UNTIL_2 5
#define JUMP_MIN_UNTIL_3 6
#define JUMP_REPEAT 7
#define JUMP_REPEAT_ONE_1 8
#define JUMP_REPEAT_ONE_2 9
#define JUMP_MIN_REPEAT_ONE 10
#define JUMP_BRANCH 11
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
773
/* Start matching `nextpattern` in a fresh match context and arrange for
 * execution to continue at `jumplabel` once that context has finished.
 *
 * Steps: allocate a new context on the data stack, record in it the
 * offset of the current context (ctx_pos), the resume code `jumpvalue`
 * and the pattern to run, make it the current context, and jump to the
 * `entrance` label.  `nextpattern` is expanded after DATA_ALLOC(), so an
 * argument written in terms of ctx sees the context pointer as it is
 * after the allocation.
 *
 * Names taken from the expansion site: ctx, nextctx, ctx_pos, alloc_pos
 * and the label `entrance`.  The expansion is several statements plus a
 * label and is NOT wrapped in do/while, so it must be used as a complete
 * statement inside a braced block, followed by a semicolon.
 *
 * The definition ends on the `while (0)` line: there is no trailing
 * line continuation, so the macro does not depend on the following
 * source line being blank. */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = (jumpvalue); \
    nextctx->pattern = (nextpattern); \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
784
/* Per-invocation matching context.  SRE_MATCH() keeps these on the
 * state's data stack: DO_JUMP() allocates a new one for each sub-match
 * and the code after the `exit:` label discards it again. */
typedef struct {
    /* data-stack offset of the context that issued the DO_JUMP(), or -1
       for the first context of a SRE_MATCH() call */
    int last_ctx_pos;
    /* JUMP_* code selecting where the issuing context resumes */
    int jump;
    /* current position in the target string (initialised from state->ptr) */
    SRE_CHAR* ptr;
    /* current position in the pattern code */
    SRE_CODE* pattern;
    /* repeat counter used by the REPEAT_ONE / MIN_REPEAT_ONE / *_UNTIL ops */
    int count;
    /* copies of state->lastmark / state->lastindex taken by LASTMARK_SAVE() */
    int lastmark;
    int lastindex;
    /* per-opcode scratch value; only one member is meaningful at a time */
    union {
        /* REPEAT_ONE: the literal that the tail of the pattern starts with */
        SRE_CODE chr;
        /* BRANCH / REPEAT / *_UNTIL: the repeat context being worked on */
        SRE_REPEAT* rep;
    } u;
} SRE_MATCH_CONTEXT;
798
799/* check if string matches the given pattern. returns <0 for
800 error, 0 for failure, and 1 for success */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000801LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000802SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000803{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000804 SRE_CHAR* end = state->end;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000805 int alloc_pos, ctx_pos = -1;
806 int i, ret = 0;
807 int jump;
Guido van Rossumb700df92000-03-31 14:59:30 +0000808
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000809 SRE_MATCH_CONTEXT* ctx;
810 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000811
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000812 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000813
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000814 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
815 ctx->last_ctx_pos = -1;
816 ctx->jump = JUMP_NONE;
817 ctx->pattern = pattern;
818 ctx_pos = alloc_pos;
819
820entrance:
821
822 ctx->ptr = state->ptr;
823
824 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000825 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000826 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000827 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000828 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 (end - ctx->ptr), ctx->pattern[3]));
830 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000831 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000832 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000833 }
834
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000835 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000836
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000837 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000839 case SRE_OP_MARK:
840 /* set mark */
841 /* <MARK> <gid> */
842 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
843 ctx->ptr, ctx->pattern[0]));
844 i = ctx->pattern[0];
845 if (i & 1)
846 state->lastindex = i/2 + 1;
847 if (i > state->lastmark) {
848 /* state->lastmark is the highest valid index in the
849 state->mark array. If it is increased by more than 1,
850 the intervening marks must be set to NULL to signal
851 that these marks have not been encountered. */
852 int j = state->lastmark + 1;
853 while (j < i)
854 state->mark[j++] = NULL;
855 state->lastmark = i;
856 }
857 state->mark[i] = ctx->ptr;
858 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000859 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000860
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 case SRE_OP_LITERAL:
862 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000863 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000864 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
865 ctx->ptr, *ctx->pattern));
866 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
867 RETURN_FAILURE;
868 ctx->pattern++;
869 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000871
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 case SRE_OP_NOT_LITERAL:
873 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000874 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000875 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
876 ctx->ptr, *ctx->pattern));
877 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
878 RETURN_FAILURE;
879 ctx->pattern++;
880 ctx->ptr++;
881 break;
882
883 case SRE_OP_SUCCESS:
884 /* end of pattern */
885 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
886 state->ptr = ctx->ptr;
887 RETURN_SUCCESS;
888
889 case SRE_OP_AT:
890 /* match at given position */
891 /* <AT> <code> */
892 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
893 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
894 RETURN_FAILURE;
895 ctx->pattern++;
896 break;
897
898 case SRE_OP_CATEGORY:
899 /* match at given category */
900 /* <CATEGORY> <code> */
901 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
902 ctx->ptr, *ctx->pattern));
903 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
904 RETURN_FAILURE;
905 ctx->pattern++;
906 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000907 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000910 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000911 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000912 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
913 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
914 RETURN_FAILURE;
915 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000916 break;
917
918 case SRE_OP_ANY_ALL:
919 /* match anything */
920 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000921 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
922 if (ctx->ptr >= end)
923 RETURN_FAILURE;
924 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000925 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000927 case SRE_OP_IN:
928 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000929 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000930 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
931 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
932 RETURN_FAILURE;
933 ctx->pattern += ctx->pattern[0];
934 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000935 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000936
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000938 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
939 ctx->pattern, ctx->ptr, ctx->pattern[0]));
940 if (ctx->ptr >= end ||
941 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
942 RETURN_FAILURE;
943 ctx->pattern++;
944 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000945 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000946
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000948 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
949 ctx->pattern, ctx->ptr, *ctx->pattern));
950 if (ctx->ptr >= end ||
951 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
952 RETURN_FAILURE;
953 ctx->pattern++;
954 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000955 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000956
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000957 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000958 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
959 if (ctx->ptr >= end
960 || !SRE_CHARSET(ctx->pattern+1,
961 (SRE_CODE)state->lower(*ctx->ptr)))
962 RETURN_FAILURE;
963 ctx->pattern += ctx->pattern[0];
964 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000965 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000966
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000967 case SRE_OP_JUMP:
968 case SRE_OP_INFO:
969 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000970 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000971 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
972 ctx->ptr, ctx->pattern[0]));
973 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000974 break;
975
976 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000977 /* alternation */
978 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000979 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000980 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000981 ctx->u.rep = state->repeat;
982 if (ctx->u.rep)
983 MARK_PUSH(ctx->lastmark);
984 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
985 if (ctx->pattern[1] == SRE_OP_LITERAL &&
986 (ctx->ptr >= end ||
987 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000988 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000989 if (ctx->pattern[1] == SRE_OP_IN &&
990 (ctx->ptr >= end ||
991 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000992 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000993 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000994 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000995 if (ret) {
996 if (ctx->u.rep)
997 MARK_POP_DISCARD(ctx->lastmark);
998 RETURN_ON_ERROR(ret);
999 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001000 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001001 if (ctx->u.rep)
1002 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001003 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001004 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001005 if (ctx->u.rep)
1006 MARK_POP_DISCARD(ctx->lastmark);
1007 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001008
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001009 case SRE_OP_REPEAT_ONE:
1010 /* match repeated sequence (maximizing regexp) */
1011
1012 /* this operator only works if the repeated item is
1013 exactly one character wide, and we're not already
1014 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001015 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001016
1017 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1018
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1020 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001021
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001022 if (ctx->ptr + ctx->pattern[1] > end)
1023 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001024
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001025 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001026
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001027 ctx->count = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001028 RETURN_ON_ERROR(ctx->count);
Fredrik Lundhe1869832000-08-01 22:47:49 +00001029
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001030 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031
1032 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001033 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001034 string. check if the rest of the pattern matches,
1035 and backtrack if not. */
1036
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001037 if (ctx->count < (int) ctx->pattern[1])
1038 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001039
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001040 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001041 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001042 state->ptr = ctx->ptr;
1043 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001044 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001045
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001046 LASTMARK_SAVE();
1047
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001048 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001049 /* tail starts with a literal. skip positions where
1050 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001051 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001052 for (;;) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001053 while (ctx->count >= (int) ctx->pattern[1] &&
1054 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1055 ctx->ptr--;
1056 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001057 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001058 if (ctx->count < (int) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001059 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001060 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1062 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 if (ret) {
1064 RETURN_ON_ERROR(ret);
1065 RETURN_SUCCESS;
1066 }
1067
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001068 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001069
1070 ctx->ptr--;
1071 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001072 }
1073
1074 } else {
1075 /* general case */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001076 while (ctx->count >= (int) ctx->pattern[1]) {
1077 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001078 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1079 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001080 if (ret) {
1081 RETURN_ON_ERROR(ret);
1082 RETURN_SUCCESS;
1083 }
1084 ctx->ptr--;
1085 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001086 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001087 }
1088 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001089 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001090
Guido van Rossum41c99e72003-04-14 17:59:34 +00001091 case SRE_OP_MIN_REPEAT_ONE:
1092 /* match repeated sequence (minimizing regexp) */
1093
1094 /* this operator only works if the repeated item is
1095 exactly one character wide, and we're not already
1096 collecting backtracking points. for other cases,
1097 use the MIN_REPEAT operator */
1098
1099 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1100
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001101 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1102 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001103
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001104 if (ctx->ptr + ctx->pattern[1] > end)
1105 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001106
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001107 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001108
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001109 if (ctx->pattern[1] == 0)
1110 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001111 else {
1112 /* count using pattern min as the maximum */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001113 ctx->count = SRE_COUNT(state, ctx->pattern+3,
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001114 ctx->pattern[1]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001115 RETURN_ON_ERROR(ctx->count);
1116 if (ctx->count < (int) ctx->pattern[1])
1117 /* didn't match minimum number of times */
1118 RETURN_FAILURE;
1119 /* advance past minimum matches of repeat */
1120 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001121 }
1122
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001123 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001124 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001125 state->ptr = ctx->ptr;
1126 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001127
1128 } else {
1129 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001130 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001131 while ((int)ctx->pattern[2] == 65535
1132 || ctx->count <= (int)ctx->pattern[2]) {
1133 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001134 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1135 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 if (ret) {
1137 RETURN_ON_ERROR(ret);
1138 RETURN_SUCCESS;
1139 }
1140 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001141 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001142 RETURN_ON_ERROR(ret);
1143 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001144 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001145 assert(ret == 1);
1146 ctx->ptr++;
1147 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001148 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001149 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001150 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001151 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001152
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001153 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001154 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001155 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001156 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001157 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1158 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001159
1160 /* install new repeat context */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001161 ctx->u.rep = (SRE_REPEAT*) malloc(sizeof(*ctx->u.rep));
1162 ctx->u.rep->count = -1;
1163 ctx->u.rep->pattern = ctx->pattern;
1164 ctx->u.rep->prev = state->repeat;
1165 ctx->u.rep->last_ptr = NULL;
1166 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001167
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001168 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001169 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001170 state->repeat = ctx->u.rep->prev;
1171 free(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001172
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001173 if (ret) {
1174 RETURN_ON_ERROR(ret);
1175 RETURN_SUCCESS;
1176 }
1177 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001178
1179 case SRE_OP_MAX_UNTIL:
1180 /* maximizing repeat */
1181 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1182
1183 /* FIXME: we probably need to deal with zero-width
1184 matches in here... */
1185
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001186 ctx->u.rep = state->repeat;
1187 if (!ctx->u.rep)
1188 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001189
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001190 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001191
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001192 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001193
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001194 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1195 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001196
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001198 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001199 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001200 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1201 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 if (ret) {
1203 RETURN_ON_ERROR(ret);
1204 RETURN_SUCCESS;
1205 }
1206 ctx->u.rep->count = ctx->count-1;
1207 state->ptr = ctx->ptr;
1208 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001209 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001210
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 if ((ctx->count < ctx->u.rep->pattern[2] ||
1212 ctx->u.rep->pattern[2] == 65535) &&
1213 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001214 /* we may have enough matches, but if we can
1215 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001216 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001217 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001218 MARK_PUSH(ctx->lastmark);
1219 /* zero-width match protection */
1220 DATA_PUSH(&ctx->u.rep->last_ptr);
1221 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001222 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1223 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001224 DATA_POP(&ctx->u.rep->last_ptr);
1225 if (ret) {
1226 MARK_POP_DISCARD(ctx->lastmark);
1227 RETURN_ON_ERROR(ret);
1228 RETURN_SUCCESS;
1229 }
1230 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001231 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001232 ctx->u.rep->count = ctx->count-1;
1233 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001234 }
1235
1236 /* cannot match more repeated items here. make sure the
1237 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001238 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001239 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001240 RETURN_ON_SUCCESS(ret);
1241 state->repeat = ctx->u.rep;
1242 state->ptr = ctx->ptr;
1243 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001244
1245 case SRE_OP_MIN_UNTIL:
1246 /* minimizing repeat */
1247 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1248
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001249 ctx->u.rep = state->repeat;
1250 if (!ctx->u.rep)
1251 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001252
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001253 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001254
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001255 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001256
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001257 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1258 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001259
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001260 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001261 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001262 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001263 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1264 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 if (ret) {
1266 RETURN_ON_ERROR(ret);
1267 RETURN_SUCCESS;
1268 }
1269 ctx->u.rep->count = ctx->count-1;
1270 state->ptr = ctx->ptr;
1271 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001272 }
1273
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001274 LASTMARK_SAVE();
1275
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001276 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001277 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001278 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001279 if (ret) {
1280 RETURN_ON_ERROR(ret);
1281 RETURN_SUCCESS;
1282 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001283
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001284 state->repeat = ctx->u.rep;
1285 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001286
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001287 LASTMARK_RESTORE();
1288
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001289 if (ctx->count >= ctx->u.rep->pattern[2]
1290 && ctx->u.rep->pattern[2] != 65535)
1291 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001292
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001293 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001294 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1295 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001296 if (ret) {
1297 RETURN_ON_ERROR(ret);
1298 RETURN_SUCCESS;
1299 }
1300 ctx->u.rep->count = ctx->count-1;
1301 state->ptr = ctx->ptr;
1302 RETURN_FAILURE;
1303
1304 case SRE_OP_GROUPREF:
1305 /* match backreference */
1306 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1307 ctx->ptr, ctx->pattern[0]));
1308 i = ctx->pattern[0];
1309 {
1310 int groupref = i+i;
1311 if (groupref >= state->lastmark) {
1312 RETURN_FAILURE;
1313 } else {
1314 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1315 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1316 if (!p || !e || e < p)
1317 RETURN_FAILURE;
1318 while (p < e) {
1319 if (ctx->ptr >= end || *ctx->ptr != *p)
1320 RETURN_FAILURE;
1321 p++; ctx->ptr++;
1322 }
1323 }
1324 }
1325 ctx->pattern++;
1326 break;
1327
1328 case SRE_OP_GROUPREF_IGNORE:
1329 /* match backreference */
1330 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1331 ctx->ptr, ctx->pattern[0]));
1332 i = ctx->pattern[0];
1333 {
1334 int groupref = i+i;
1335 if (groupref >= state->lastmark) {
1336 RETURN_FAILURE;
1337 } else {
1338 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1339 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1340 if (!p || !e || e < p)
1341 RETURN_FAILURE;
1342 while (p < e) {
1343 if (ctx->ptr >= end ||
1344 state->lower(*ctx->ptr) != state->lower(*p))
1345 RETURN_FAILURE;
1346 p++; ctx->ptr++;
1347 }
1348 }
1349 }
1350 ctx->pattern++;
1351 break;
1352
1353 case SRE_OP_GROUPREF_EXISTS:
1354 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1355 ctx->ptr, ctx->pattern[0]));
1356 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1357 i = ctx->pattern[0];
1358 {
1359 int groupref = i+i;
1360 if (groupref >= state->lastmark) {
1361 ctx->pattern += ctx->pattern[1];
1362 break;
1363 } else {
1364 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1365 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1366 if (!p || !e || e < p) {
1367 ctx->pattern += ctx->pattern[1];
1368 break;
1369 }
1370 }
1371 }
1372 ctx->pattern += 2;
1373 break;
1374
1375 case SRE_OP_ASSERT:
1376 /* assert subpattern */
1377 /* <ASSERT> <skip> <back> <pattern> */
1378 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1379 ctx->ptr, ctx->pattern[1]));
1380 state->ptr = ctx->ptr - ctx->pattern[1];
1381 if (state->ptr < state->beginning)
1382 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001383 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001384 RETURN_ON_FAILURE(ret);
1385 ctx->pattern += ctx->pattern[0];
1386 break;
1387
1388 case SRE_OP_ASSERT_NOT:
1389 /* assert not subpattern */
1390 /* <ASSERT_NOT> <skip> <back> <pattern> */
1391 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1392 ctx->ptr, ctx->pattern[1]));
1393 state->ptr = ctx->ptr - ctx->pattern[1];
1394 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001395 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001396 if (ret) {
1397 RETURN_ON_ERROR(ret);
1398 RETURN_FAILURE;
1399 }
1400 }
1401 ctx->pattern += ctx->pattern[0];
1402 break;
1403
1404 case SRE_OP_FAILURE:
1405 /* immediate failure */
1406 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1407 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001408
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001409 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001410 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1411 ctx->pattern[-1]));
1412 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001413 }
1414 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001415
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001416exit:
1417 ctx_pos = ctx->last_ctx_pos;
1418 jump = ctx->jump;
1419 DATA_POP_DISCARD(ctx);
1420 if (ctx_pos == -1)
1421 return ret;
1422 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1423
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001424 switch (jump) {
1425 case JUMP_MAX_UNTIL_2:
1426 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1427 goto jump_max_until_2;
1428 case JUMP_MAX_UNTIL_3:
1429 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1430 goto jump_max_until_3;
1431 case JUMP_MIN_UNTIL_2:
1432 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1433 goto jump_min_until_2;
1434 case JUMP_MIN_UNTIL_3:
1435 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1436 goto jump_min_until_3;
1437 case JUMP_BRANCH:
1438 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1439 goto jump_branch;
1440 case JUMP_MAX_UNTIL_1:
1441 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1442 goto jump_max_until_1;
1443 case JUMP_MIN_UNTIL_1:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1445 goto jump_min_until_1;
1446 case JUMP_REPEAT:
1447 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1448 goto jump_repeat;
1449 case JUMP_REPEAT_ONE_1:
1450 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1451 goto jump_repeat_one_1;
1452 case JUMP_REPEAT_ONE_2:
1453 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1454 goto jump_repeat_one_2;
1455 case JUMP_MIN_REPEAT_ONE:
1456 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1457 goto jump_min_repeat_one;
1458 case JUMP_ASSERT:
1459 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1460 goto jump_assert;
1461 case JUMP_ASSERT_NOT:
1462 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1463 goto jump_assert_not;
1464 case JUMP_NONE:
1465 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1466 break;
1467 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001468
1469 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001470}
1471
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001472LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001473SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1474{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001475 SRE_CHAR* ptr = state->start;
1476 SRE_CHAR* end = state->end;
1477 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001478 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001479 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001480 SRE_CODE* prefix = NULL;
1481 SRE_CODE* charset = NULL;
1482 SRE_CODE* overlap = NULL;
1483 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001484
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001485 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001486 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001487 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001488
1489 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001490
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001491 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001492 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001493 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001494 end -= pattern[3]-1;
1495 if (end <= ptr)
1496 end = ptr+1;
1497 }
1498
Fredrik Lundh3562f112000-07-02 12:00:07 +00001499 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001500 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001501 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001502 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001503 prefix_skip = pattern[6];
1504 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001505 overlap = prefix + prefix_len - 1;
1506 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001507 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001508 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001509 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001510
1511 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001512 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001513
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001514 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1515 TRACE(("charset = %p\n", charset));
1516
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001517#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001518 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001519 /* pattern starts with a known prefix. use the overlap
1520 table to skip forward as fast as we possibly can */
1521 int i = 0;
1522 end = state->end;
1523 while (ptr < end) {
1524 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001525 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001526 if (!i)
1527 break;
1528 else
1529 i = overlap[i];
1530 } else {
1531 if (++i == prefix_len) {
1532 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001533 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1534 state->start = ptr + 1 - prefix_len;
1535 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001536 if (flags & SRE_INFO_LITERAL)
1537 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001538 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001539 if (status != 0)
1540 return status;
1541 /* close but no cigar -- try again */
1542 i = overlap[i];
1543 }
1544 break;
1545 }
1546
1547 }
1548 ptr++;
1549 }
1550 return 0;
1551 }
1552#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001553
Fredrik Lundh3562f112000-07-02 12:00:07 +00001554 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001555 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001556 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001557 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001558 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001559 for (;;) {
1560 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1561 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001562 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001564 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001565 state->start = ptr;
1566 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001567 if (flags & SRE_INFO_LITERAL)
1568 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001569 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001570 if (status != 0)
1571 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001572 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001573 } else if (charset) {
1574 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001575 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001577 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001578 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001579 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001581 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001582 state->start = ptr;
1583 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001584 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001585 if (status != 0)
1586 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001587 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 }
1589 } else
1590 /* general case */
1591 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001592 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001594 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001595 if (status != 0)
1596 break;
1597 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001598
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001600}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001601
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001602LOCAL(int)
1603SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1604{
1605 /* check if given string is a literal template (i.e. no escapes) */
1606 while (len-- > 0)
1607 if (*ptr++ == '\\')
1608 return 0;
1609 return 1;
1610}
Guido van Rossumb700df92000-03-31 14:59:30 +00001611
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001612#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001613
1614/* -------------------------------------------------------------------- */
1615/* factories and destructors */
1616
1617/* see sre.h for object declarations */
1618
Jeremy Hylton938ace62002-07-17 16:30:39 +00001619static PyTypeObject Pattern_Type;
1620static PyTypeObject Match_Type;
1621static PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001622
1623static PyObject *
1624_compile(PyObject* self_, PyObject* args)
1625{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001626 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001627
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001628 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001629 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001630
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001631 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001632 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001633 PyObject* code;
1634 int groups = 0;
1635 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001636 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001637 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1638 &PyList_Type, &code, &groups,
1639 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001640 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001641
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001642 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001643
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001644 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001645 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001646 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001647
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001648 self->codesize = n;
1649
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001650 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001651 PyObject *o = PyList_GET_ITEM(code, i);
Martin v. Löwis78e2f062003-04-19 12:56:08 +00001652 if (PyInt_Check(o))
1653 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1654 else
1655 self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001656 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001657
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001658 if (PyErr_Occurred()) {
1659 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001660 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001661 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001662
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001663 Py_INCREF(pattern);
1664 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001665
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001666 self->flags = flags;
1667
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001668 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001669
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001670 Py_XINCREF(groupindex);
1671 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001672
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001673 Py_XINCREF(indexgroup);
1674 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001675
Raymond Hettinger027bb632004-05-31 03:09:25 +00001676 self->weakreflist = NULL;
1677
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001678 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001679}
1680
1681static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001682sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001683{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001684 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001685}
1686
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001687static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001688sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001689{
1690 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001691 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001692 return NULL;
1693 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001694 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001695 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001696#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001697 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001698#else
1699 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001700#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001701 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001702}
1703
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001704LOCAL(void)
1705state_reset(SRE_STATE* state)
1706{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001707 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001708 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001709
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001710 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001711 state->lastindex = -1;
1712
1713 state->repeat = NULL;
1714
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001715 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001716}
1717
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001718static void*
1719getstring(PyObject* string, int* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001720{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001721 /* given a python object, return a data pointer, a length (in
1722 characters), and a character size. return NULL if the object
1723 is not a string (or not compatible) */
1724
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001725 PyBufferProcs *buffer;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001726 int size, bytes, charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001727 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001728
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001729#if defined(HAVE_UNICODE)
1730 if (PyUnicode_Check(string)) {
1731 /* unicode strings doesn't always support the buffer interface */
1732 ptr = (void*) PyUnicode_AS_DATA(string);
1733 bytes = PyUnicode_GET_DATA_SIZE(string);
1734 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001735 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001736
1737 } else {
1738#endif
1739
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001740 /* get pointer to string buffer */
1741 buffer = string->ob_type->tp_as_buffer;
1742 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1743 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001744 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001745 return NULL;
1746 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001747
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001748 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001749 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1750 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001751 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1752 return NULL;
1753 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001754
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001755 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001756#if PY_VERSION_HEX >= 0x01060000
1757 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001758#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001759 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001760#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001761
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001762 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001763 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001764#if defined(HAVE_UNICODE)
1765 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001766 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001767#endif
1768 else {
1769 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1770 return NULL;
1771 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001772
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001773#if defined(HAVE_UNICODE)
1774 }
1775#endif
1776
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001777 *p_length = size;
1778 *p_charsize = charsize;
1779
1780 return ptr;
1781}
1782
1783LOCAL(PyObject*)
1784state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1785 int start, int end)
1786{
1787 /* prepare state object */
1788
1789 int length;
1790 int charsize;
1791 void* ptr;
1792
1793 memset(state, 0, sizeof(SRE_STATE));
1794
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001795 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001796 state->lastindex = -1;
1797
1798 ptr = getstring(string, &length, &charsize);
1799 if (!ptr)
1800 return NULL;
1801
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001802 /* adjust boundaries */
1803 if (start < 0)
1804 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001805 else if (start > length)
1806 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001807
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001808 if (end < 0)
1809 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001810 else if (end > length)
1811 end = length;
1812
1813 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001814
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001815 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001816
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001817 state->start = (void*) ((char*) ptr + start * state->charsize);
1818 state->end = (void*) ((char*) ptr + end * state->charsize);
1819
1820 Py_INCREF(string);
1821 state->string = string;
1822 state->pos = start;
1823 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001824
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001825 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001826 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001827 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001828#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001829 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001830#else
1831 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001832#endif
1833 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001834 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001835
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001836 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001837}
1838
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001839LOCAL(void)
1840state_fini(SRE_STATE* state)
1841{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001842 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001843 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001844}
1845
/* convert a pointer into the subject string to a character index:
   the byte distance from the state's beginning, divided by the
   state's character size */
#define STATE_OFFSET(st, p)\
    (((char*)(p) - (char*)(st)->beginning) / (st)->charsize)
1849
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001850LOCAL(PyObject*)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001851state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001852{
Fredrik Lundh58100642000-08-09 09:14:35 +00001853 int i, j;
1854
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001855 index = (index - 1) * 2;
1856
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001857 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001858 if (empty)
1859 /* want empty string */
1860 i = j = 0;
1861 else {
1862 Py_INCREF(Py_None);
1863 return Py_None;
1864 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001865 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001866 i = STATE_OFFSET(state, state->mark[index]);
1867 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001868 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001869
Fredrik Lundh58100642000-08-09 09:14:35 +00001870 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001871}
1872
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001873static void
1874pattern_error(int status)
1875{
1876 switch (status) {
1877 case SRE_ERROR_RECURSION_LIMIT:
1878 PyErr_SetString(
1879 PyExc_RuntimeError,
1880 "maximum recursion limit exceeded"
1881 );
1882 break;
1883 case SRE_ERROR_MEMORY:
1884 PyErr_NoMemory();
1885 break;
1886 default:
1887 /* other error codes indicate compiler/engine bugs */
1888 PyErr_SetString(
1889 PyExc_RuntimeError,
1890 "internal error in regular expression engine"
1891 );
1892 }
1893}
1894
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001895static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001896pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001897{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001898 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001899
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001900 MatchObject* match;
1901 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001902 char* base;
1903 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001904
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001905 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001906
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001907 /* create match object (with room for extra group marks) */
1908 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001909 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001910 if (!match)
1911 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001912
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001913 Py_INCREF(pattern);
1914 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001915
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 Py_INCREF(state->string);
1917 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001918
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001919 match->regs = NULL;
1920 match->groups = pattern->groups+1;
1921
1922 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001923
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001924 base = (char*) state->beginning;
1925 n = state->charsize;
1926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001927 match->mark[0] = ((char*) state->start - base) / n;
1928 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001929
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001930 for (i = j = 0; i < pattern->groups; i++, j+=2)
1931 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1932 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1933 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1934 } else
1935 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1936
1937 match->pos = state->pos;
1938 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001939
Fredrik Lundh6f013982000-07-03 18:44:21 +00001940 match->lastindex = state->lastindex;
1941
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001942 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001943
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001944 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001945
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001946 /* no match */
1947 Py_INCREF(Py_None);
1948 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001949
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001950 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001951
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001952 /* internal error */
1953 pattern_error(status);
1954 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001955}
1956
1957static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001958pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001959{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001960 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001961
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001962 ScannerObject* self;
1963
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001964 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001965 int start = 0;
1966 int end = INT_MAX;
1967 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1968 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001969
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001970 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001971 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001972 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001973 return NULL;
1974
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001975 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001976 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001977 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001978 return NULL;
1979 }
1980
1981 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001982 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001983
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001984 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001985}
1986
Guido van Rossumb700df92000-03-31 14:59:30 +00001987static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001988pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001989{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001990 if (self->weakreflist != NULL)
1991 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001992 Py_XDECREF(self->pattern);
1993 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001994 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001995 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001996}
1997
1998static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001999pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002000{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002001 SRE_STATE state;
2002 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002003
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002004 PyObject* string;
2005 int start = 0;
2006 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002007 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2008 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
2009 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002010 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002011
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002012 string = state_init(&state, self, string, start, end);
2013 if (!string)
2014 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002015
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002016 state.ptr = state.start;
2017
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002018 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
2019
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002020 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002021 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002022 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002023#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002024 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002025#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002026 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002027
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002028 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2029
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002030 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002031
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002032 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002033}
2034
2035static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002036pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002037{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002038 SRE_STATE state;
2039 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002040
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002041 PyObject* string;
2042 int start = 0;
2043 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002044 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2045 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
2046 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002047 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002048
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002049 string = state_init(&state, self, string, start, end);
2050 if (!string)
2051 return NULL;
2052
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002053 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
2054
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 if (state.charsize == 1) {
2056 status = sre_search(&state, PatternObject_GetCode(self));
2057 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002058#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002059 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002060#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002061 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002062
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002063 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2064
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002065 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002066
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002067 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002068}
2069
2070static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002071call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002072{
2073 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002074 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002075 PyObject* func;
2076 PyObject* result;
2077
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002078 if (!args)
2079 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002080 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002081 if (!name)
2082 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002083 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002084 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002085 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002086 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002087 func = PyObject_GetAttrString(mod, function);
2088 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002089 if (!func)
2090 return NULL;
2091 result = PyObject_CallObject(func, args);
2092 Py_DECREF(func);
2093 Py_DECREF(args);
2094 return result;
2095}
2096
#ifdef USE_BUILTIN_COPY
/* Replace *object by copy.deepcopy(*object, memo).

   On success the old reference held in *object is released and the
   slot receives the new copy.  On failure *object is left untouched.
   Returns 1 on success and 0 on failure (with an exception set). */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* copy;

    /* optional pattern members (e.g. groupindex) may be NULL; there is
       nothing to copy then, and PyTuple_Pack must not be given NULL */
    if (!*object)
        return 1;

    copy = call(
        "copy", "deepcopy",
        PyTuple_Pack(2, *object, memo)
        );
    if (!copy)
        return 0;

    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
2116
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002117static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002118join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002119{
2120 /* join list elements */
2121
2122 PyObject* joiner;
2123#if PY_VERSION_HEX >= 0x01060000
2124 PyObject* function;
2125 PyObject* args;
2126#endif
2127 PyObject* result;
2128
2129 switch (PyList_GET_SIZE(list)) {
2130 case 0:
2131 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00002132 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002133 case 1:
2134 result = PyList_GET_ITEM(list, 0);
2135 Py_INCREF(result);
2136 Py_DECREF(list);
2137 return result;
2138 }
2139
2140 /* two or more elements: slice out a suitable separator from the
2141 first member, and use that to join the entire list */
2142
2143 joiner = PySequence_GetSlice(pattern, 0, 0);
2144 if (!joiner)
2145 return NULL;
2146
2147#if PY_VERSION_HEX >= 0x01060000
2148 function = PyObject_GetAttrString(joiner, "join");
2149 if (!function) {
2150 Py_DECREF(joiner);
2151 return NULL;
2152 }
2153 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002154 if (!args) {
2155 Py_DECREF(function);
2156 Py_DECREF(joiner);
2157 return NULL;
2158 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002159 PyTuple_SET_ITEM(args, 0, list);
2160 result = PyObject_CallObject(function, args);
2161 Py_DECREF(args); /* also removes list */
2162 Py_DECREF(function);
2163#else
2164 result = call(
2165 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002166 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002167 );
2168#endif
2169 Py_DECREF(joiner);
2170
2171 return result;
2172}
2173
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002174static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002175pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002176{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002177 SRE_STATE state;
2178 PyObject* list;
2179 int status;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002180 int i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002181
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002182 PyObject* string;
2183 int start = 0;
2184 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002185 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2186 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2187 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002188 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002189
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002190 string = state_init(&state, self, string, start, end);
2191 if (!string)
2192 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002193
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002194 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002195 if (!list) {
2196 state_fini(&state);
2197 return NULL;
2198 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002199
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002200 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002201
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002202 PyObject* item;
2203
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002204 state_reset(&state);
2205
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002206 state.ptr = state.start;
2207
2208 if (state.charsize == 1) {
2209 status = sre_search(&state, PatternObject_GetCode(self));
2210 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002211#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002212 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002213#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002214 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002215
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002216 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002217 if (status == 0)
2218 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002219 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002220 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002221 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002222
2223 /* don't bother to build a match object */
2224 switch (self->groups) {
2225 case 0:
2226 b = STATE_OFFSET(&state, state.start);
2227 e = STATE_OFFSET(&state, state.ptr);
2228 item = PySequence_GetSlice(string, b, e);
2229 if (!item)
2230 goto error;
2231 break;
2232 case 1:
2233 item = state_getslice(&state, 1, string, 1);
2234 if (!item)
2235 goto error;
2236 break;
2237 default:
2238 item = PyTuple_New(self->groups);
2239 if (!item)
2240 goto error;
2241 for (i = 0; i < self->groups; i++) {
2242 PyObject* o = state_getslice(&state, i+1, string, 1);
2243 if (!o) {
2244 Py_DECREF(item);
2245 goto error;
2246 }
2247 PyTuple_SET_ITEM(item, i, o);
2248 }
2249 break;
2250 }
2251
2252 status = PyList_Append(list, item);
2253 Py_DECREF(item);
2254 if (status < 0)
2255 goto error;
2256
2257 if (state.ptr == state.start)
2258 state.start = (void*) ((char*) state.ptr + state.charsize);
2259 else
2260 state.start = state.ptr;
2261
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002262 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002263
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002264 state_fini(&state);
2265 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002266
2267error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002268 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002269 state_fini(&state);
2270 return NULL;
2271
Guido van Rossumb700df92000-03-31 14:59:30 +00002272}
2273
#if PY_VERSION_HEX >= 0x02020000
/* pattern.finditer(...)

   Creates a scanner from the given arguments and returns a callable
   iterator over the scanner's "search" method, with None as the
   sentinel.  Returns NULL with an exception set on failure. */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    PyObject* scanner;
    PyObject* method;
    PyObject* result;

    scanner = pattern_scanner(pattern, args);
    if (!scanner)
        return NULL;

    /* only the bound method is needed from here on */
    method = PyObject_GetAttrString(scanner, "search");
    Py_DECREF(scanner);
    if (!method)
        return NULL;

    result = PyCallIter_New(method, Py_None);
    Py_DECREF(method);

    return result;
}
#endif
2297
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002298static PyObject*
2299pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2300{
2301 SRE_STATE state;
2302 PyObject* list;
2303 PyObject* item;
2304 int status;
2305 int n;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002306 int i;
2307 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002308
2309 PyObject* string;
2310 int maxsplit = 0;
2311 static char* kwlist[] = { "source", "maxsplit", NULL };
2312 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2313 &string, &maxsplit))
2314 return NULL;
2315
2316 string = state_init(&state, self, string, 0, INT_MAX);
2317 if (!string)
2318 return NULL;
2319
2320 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002321 if (!list) {
2322 state_fini(&state);
2323 return NULL;
2324 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002325
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002326 n = 0;
2327 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002328
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002329 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002330
2331 state_reset(&state);
2332
2333 state.ptr = state.start;
2334
2335 if (state.charsize == 1) {
2336 status = sre_search(&state, PatternObject_GetCode(self));
2337 } else {
2338#if defined(HAVE_UNICODE)
2339 status = sre_usearch(&state, PatternObject_GetCode(self));
2340#endif
2341 }
2342
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002343 if (status <= 0) {
2344 if (status == 0)
2345 break;
2346 pattern_error(status);
2347 goto error;
2348 }
2349
2350 if (state.start == state.ptr) {
2351 if (last == state.end)
2352 break;
2353 /* skip one character */
2354 state.start = (void*) ((char*) state.ptr + state.charsize);
2355 continue;
2356 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002357
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002358 /* get segment before this match */
2359 item = PySequence_GetSlice(
2360 string, STATE_OFFSET(&state, last),
2361 STATE_OFFSET(&state, state.start)
2362 );
2363 if (!item)
2364 goto error;
2365 status = PyList_Append(list, item);
2366 Py_DECREF(item);
2367 if (status < 0)
2368 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002369
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002370 /* add groups (if any) */
2371 for (i = 0; i < self->groups; i++) {
2372 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002373 if (!item)
2374 goto error;
2375 status = PyList_Append(list, item);
2376 Py_DECREF(item);
2377 if (status < 0)
2378 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002379 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002380
2381 n = n + 1;
2382
2383 last = state.start = state.ptr;
2384
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002385 }
2386
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002387 /* get segment following last match (even if empty) */
2388 item = PySequence_GetSlice(
2389 string, STATE_OFFSET(&state, last), state.endpos
2390 );
2391 if (!item)
2392 goto error;
2393 status = PyList_Append(list, item);
2394 Py_DECREF(item);
2395 if (status < 0)
2396 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002397
2398 state_fini(&state);
2399 return list;
2400
2401error:
2402 Py_DECREF(list);
2403 state_fini(&state);
2404 return NULL;
2405
2406}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002407
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002408static PyObject*
2409pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2410 int count, int subn)
2411{
2412 SRE_STATE state;
2413 PyObject* list;
2414 PyObject* item;
2415 PyObject* filter;
2416 PyObject* args;
2417 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002418 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002419 int status;
2420 int n;
2421 int i, b, e;
2422 int filter_is_callable;
2423
Fredrik Lundhdac58492001-10-21 21:48:30 +00002424 if (PyCallable_Check(template)) {
2425 /* sub/subn takes either a function or a template */
2426 filter = template;
2427 Py_INCREF(filter);
2428 filter_is_callable = 1;
2429 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002430 /* if not callable, check if it's a literal string */
2431 int literal;
2432 ptr = getstring(template, &n, &b);
2433 if (ptr) {
2434 if (b == 1) {
2435 literal = sre_literal_template(ptr, n);
2436 } else {
2437#if defined(HAVE_UNICODE)
2438 literal = sre_uliteral_template(ptr, n);
2439#endif
2440 }
2441 } else {
2442 PyErr_Clear();
2443 literal = 0;
2444 }
2445 if (literal) {
2446 filter = template;
2447 Py_INCREF(filter);
2448 filter_is_callable = 0;
2449 } else {
2450 /* not a literal; hand it over to the template compiler */
2451 filter = call(
2452 SRE_MODULE, "_subx",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002453 PyTuple_Pack(2, self, template)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002454 );
2455 if (!filter)
2456 return NULL;
2457 filter_is_callable = PyCallable_Check(filter);
2458 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002459 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002460
2461 string = state_init(&state, self, string, 0, INT_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002462 if (!string) {
2463 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002464 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002465 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466
2467 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002468 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002469 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002470 state_fini(&state);
2471 return NULL;
2472 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002473
2474 n = i = 0;
2475
2476 while (!count || n < count) {
2477
2478 state_reset(&state);
2479
2480 state.ptr = state.start;
2481
2482 if (state.charsize == 1) {
2483 status = sre_search(&state, PatternObject_GetCode(self));
2484 } else {
2485#if defined(HAVE_UNICODE)
2486 status = sre_usearch(&state, PatternObject_GetCode(self));
2487#endif
2488 }
2489
2490 if (status <= 0) {
2491 if (status == 0)
2492 break;
2493 pattern_error(status);
2494 goto error;
2495 }
2496
2497 b = STATE_OFFSET(&state, state.start);
2498 e = STATE_OFFSET(&state, state.ptr);
2499
2500 if (i < b) {
2501 /* get segment before this match */
2502 item = PySequence_GetSlice(string, i, b);
2503 if (!item)
2504 goto error;
2505 status = PyList_Append(list, item);
2506 Py_DECREF(item);
2507 if (status < 0)
2508 goto error;
2509
2510 } else if (i == b && i == e && n > 0)
2511 /* ignore empty match on latest position */
2512 goto next;
2513
2514 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002515 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002516 match = pattern_new_match(self, &state, 1);
2517 if (!match)
2518 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002519 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002520 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002521 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002522 goto error;
2523 }
2524 item = PyObject_CallObject(filter, args);
2525 Py_DECREF(args);
2526 Py_DECREF(match);
2527 if (!item)
2528 goto error;
2529 } else {
2530 /* filter is literal string */
2531 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002532 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002533 }
2534
2535 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002536 if (item != Py_None) {
2537 status = PyList_Append(list, item);
2538 Py_DECREF(item);
2539 if (status < 0)
2540 goto error;
2541 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002542
2543 i = e;
2544 n = n + 1;
2545
2546next:
2547 /* move on */
2548 if (state.ptr == state.start)
2549 state.start = (void*) ((char*) state.ptr + state.charsize);
2550 else
2551 state.start = state.ptr;
2552
2553 }
2554
2555 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002556 if (i < state.endpos) {
2557 item = PySequence_GetSlice(string, i, state.endpos);
2558 if (!item)
2559 goto error;
2560 status = PyList_Append(list, item);
2561 Py_DECREF(item);
2562 if (status < 0)
2563 goto error;
2564 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002565
2566 state_fini(&state);
2567
Guido van Rossum4e173842001-12-07 04:25:10 +00002568 Py_DECREF(filter);
2569
Fredrik Lundhdac58492001-10-21 21:48:30 +00002570 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002571 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002572
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002573 if (!item)
2574 return NULL;
2575
2576 if (subn)
2577 return Py_BuildValue("Ni", item, n);
2578
2579 return item;
2580
2581error:
2582 Py_DECREF(list);
2583 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002584 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002585 return NULL;
2586
2587}
2588
2589static PyObject*
2590pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2591{
2592 PyObject* template;
2593 PyObject* string;
2594 int count = 0;
2595 static char* kwlist[] = { "repl", "string", "count", NULL };
2596 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2597 &template, &string, &count))
2598 return NULL;
2599
2600 return pattern_subx(self, template, string, count, 0);
2601}
2602
2603static PyObject*
2604pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2605{
2606 PyObject* template;
2607 PyObject* string;
2608 int count = 0;
2609 static char* kwlist[] = { "repl", "string", "count", NULL };
2610 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2611 &template, &string, &count))
2612 return NULL;
2613
2614 return pattern_subx(self, template, string, count, 1);
2615}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002616
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002617static PyObject*
2618pattern_copy(PatternObject* self, PyObject* args)
2619{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002620#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002621 PatternObject* copy;
2622 int offset;
2623
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002624 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2625 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002626
2627 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2628 if (!copy)
2629 return NULL;
2630
2631 offset = offsetof(PatternObject, groups);
2632
2633 Py_XINCREF(self->groupindex);
2634 Py_XINCREF(self->indexgroup);
2635 Py_XINCREF(self->pattern);
2636
2637 memcpy((char*) copy + offset, (char*) self + offset,
2638 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002639 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002640
2641 return (PyObject*) copy;
2642#else
2643 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2644 return NULL;
2645#endif
2646}
2647
2648static PyObject*
2649pattern_deepcopy(PatternObject* self, PyObject* args)
2650{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002651#ifdef USE_BUILTIN_COPY
2652 PatternObject* copy;
2653
2654 PyObject* memo;
2655 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2656 return NULL;
2657
2658 copy = (PatternObject*) pattern_copy(self, Py_None);
2659 if (!copy)
2660 return NULL;
2661
2662 if (!deepcopy(&copy->groupindex, memo) ||
2663 !deepcopy(&copy->indexgroup, memo) ||
2664 !deepcopy(&copy->pattern, memo)) {
2665 Py_DECREF(copy);
2666 return NULL;
2667 }
2668
2669#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002670 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2671 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002672#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002673}
2674
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002675static PyMethodDef pattern_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002676 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
2677 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
2678 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
2679 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
2680 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
2681 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002682#if PY_VERSION_HEX >= 0x02020000
2683 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS},
2684#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002685 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002686 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2687 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002688 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002689};
2690
2691static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002692pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002693{
2694 PyObject* res;
2695
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002696 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002697
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002698 if (res)
2699 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002700
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002701 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002702
2703 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002704 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002705 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002706 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002707 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002708
2709 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002710 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002711
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002712 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002713 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002714
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002715 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002716 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002717 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002718 }
2719
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002720 PyErr_SetString(PyExc_AttributeError, name);
2721 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002722}
2723
2724statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002725 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002726 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002727 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002728 (destructor)pattern_dealloc, /*tp_dealloc*/
2729 0, /*tp_print*/
Raymond Hettinger027bb632004-05-31 03:09:25 +00002730 (getattrfunc)pattern_getattr, /*tp_getattr*/
2731 0, /* tp_setattr */
2732 0, /* tp_compare */
2733 0, /* tp_repr */
2734 0, /* tp_as_number */
2735 0, /* tp_as_sequence */
2736 0, /* tp_as_mapping */
2737 0, /* tp_hash */
2738 0, /* tp_call */
2739 0, /* tp_str */
2740 0, /* tp_getattro */
2741 0, /* tp_setattro */
2742 0, /* tp_as_buffer */
2743 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
2744 0, /* tp_doc */
2745 0, /* tp_traverse */
2746 0, /* tp_clear */
2747 0, /* tp_richcompare */
2748 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002749};
2750
2751/* -------------------------------------------------------------------- */
2752/* match methods */
2753
2754static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002755match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002756{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002757 Py_XDECREF(self->regs);
2758 Py_XDECREF(self->string);
2759 Py_DECREF(self->pattern);
2760 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002761}
2762
2763static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002764match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002765{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002766 if (index < 0 || index >= self->groups) {
2767 /* raise IndexError if we were given a bad group number */
2768 PyErr_SetString(
2769 PyExc_IndexError,
2770 "no such group"
2771 );
2772 return NULL;
2773 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002774
Fredrik Lundh6f013982000-07-03 18:44:21 +00002775 index *= 2;
2776
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002777 if (self->string == Py_None || self->mark[index] < 0) {
2778 /* return default value if the string or group is undefined */
2779 Py_INCREF(def);
2780 return def;
2781 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002782
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002783 return PySequence_GetSlice(
2784 self->string, self->mark[index], self->mark[index+1]
2785 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002786}
2787
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002788static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002789match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002790{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002791 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002792
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002793 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002794 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002795
Fredrik Lundh6f013982000-07-03 18:44:21 +00002796 i = -1;
2797
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002798 if (self->pattern->groupindex) {
2799 index = PyObject_GetItem(self->pattern->groupindex, index);
2800 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002801 if (PyInt_Check(index))
2802 i = (int) PyInt_AS_LONG(index);
2803 Py_DECREF(index);
2804 } else
2805 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002806 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002807
2808 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002809}
2810
2811static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002812match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002813{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002814 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002815}
2816
2817static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002818match_expand(MatchObject* self, PyObject* args)
2819{
2820 PyObject* template;
2821 if (!PyArg_ParseTuple(args, "O:expand", &template))
2822 return NULL;
2823
2824 /* delegate to Python code */
2825 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002826 SRE_MODULE, "_expand",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002827 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002828 );
2829}
2830
2831static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002832match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002833{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002834 PyObject* result;
2835 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002836
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002837 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002838
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002839 switch (size) {
2840 case 0:
2841 result = match_getslice(self, Py_False, Py_None);
2842 break;
2843 case 1:
2844 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2845 break;
2846 default:
2847 /* fetch multiple items */
2848 result = PyTuple_New(size);
2849 if (!result)
2850 return NULL;
2851 for (i = 0; i < size; i++) {
2852 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002853 self, PyTuple_GET_ITEM(args, i), Py_None
2854 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002855 if (!item) {
2856 Py_DECREF(result);
2857 return NULL;
2858 }
2859 PyTuple_SET_ITEM(result, i, item);
2860 }
2861 break;
2862 }
2863 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002864}
2865
2866static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002867match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002868{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002869 PyObject* result;
2870 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002871
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002872 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002873 static char* kwlist[] = { "default", NULL };
2874 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002875 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002876
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002877 result = PyTuple_New(self->groups-1);
2878 if (!result)
2879 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002880
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002881 for (index = 1; index < self->groups; index++) {
2882 PyObject* item;
2883 item = match_getslice_by_index(self, index, def);
2884 if (!item) {
2885 Py_DECREF(result);
2886 return NULL;
2887 }
2888 PyTuple_SET_ITEM(result, index-1, item);
2889 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002890
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002891 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002892}
2893
2894static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002895match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002896{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002897 PyObject* result;
2898 PyObject* keys;
2899 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002900
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002901 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002902 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002903 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002904 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002905
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002906 result = PyDict_New();
2907 if (!result || !self->pattern->groupindex)
2908 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002909
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002910 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002911 if (!keys)
2912 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002913
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002914 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002915 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002916 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002917 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002918 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002919 if (!key)
2920 goto failed;
2921 value = match_getslice(self, key, def);
2922 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002923 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002924 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002925 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002926 status = PyDict_SetItem(result, key, value);
2927 Py_DECREF(value);
2928 if (status < 0)
2929 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002930 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002931
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002932 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002933
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002934 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002935
2936failed:
2937 Py_DECREF(keys);
2938 Py_DECREF(result);
2939 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002940}
2941
2942static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002943match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002944{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002945 int index;
2946
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002947 PyObject* index_ = Py_False; /* zero */
2948 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2949 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002950
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002951 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002952
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002953 if (index < 0 || index >= self->groups) {
2954 PyErr_SetString(
2955 PyExc_IndexError,
2956 "no such group"
2957 );
2958 return NULL;
2959 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002960
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002961 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002962 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002963}
2964
2965static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002966match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002967{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002968 int index;
2969
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002970 PyObject* index_ = Py_False; /* zero */
2971 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2972 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002973
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002974 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002975
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002976 if (index < 0 || index >= self->groups) {
2977 PyErr_SetString(
2978 PyExc_IndexError,
2979 "no such group"
2980 );
2981 return NULL;
2982 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002983
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002984 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002985 return Py_BuildValue("i", self->mark[index*2+1]);
2986}
2987
2988LOCAL(PyObject*)
2989_pair(int i1, int i2)
2990{
2991 PyObject* pair;
2992 PyObject* item;
2993
2994 pair = PyTuple_New(2);
2995 if (!pair)
2996 return NULL;
2997
2998 item = PyInt_FromLong(i1);
2999 if (!item)
3000 goto error;
3001 PyTuple_SET_ITEM(pair, 0, item);
3002
3003 item = PyInt_FromLong(i2);
3004 if (!item)
3005 goto error;
3006 PyTuple_SET_ITEM(pair, 1, item);
3007
3008 return pair;
3009
3010 error:
3011 Py_DECREF(pair);
3012 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003013}
3014
3015static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003016match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003017{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003018 int index;
3019
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003020 PyObject* index_ = Py_False; /* zero */
3021 if (!PyArg_ParseTuple(args, "|O:span", &index_))
3022 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003023
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003024 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003025
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003026 if (index < 0 || index >= self->groups) {
3027 PyErr_SetString(
3028 PyExc_IndexError,
3029 "no such group"
3030 );
3031 return NULL;
3032 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003033
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003034 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003035 return _pair(self->mark[index*2], self->mark[index*2+1]);
3036}
3037
3038static PyObject*
3039match_regs(MatchObject* self)
3040{
3041 PyObject* regs;
3042 PyObject* item;
3043 int index;
3044
3045 regs = PyTuple_New(self->groups);
3046 if (!regs)
3047 return NULL;
3048
3049 for (index = 0; index < self->groups; index++) {
3050 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3051 if (!item) {
3052 Py_DECREF(regs);
3053 return NULL;
3054 }
3055 PyTuple_SET_ITEM(regs, index, item);
3056 }
3057
3058 Py_INCREF(regs);
3059 self->regs = regs;
3060
3061 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003062}
3063
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003064static PyObject*
3065match_copy(MatchObject* self, PyObject* args)
3066{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003067#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003068 MatchObject* copy;
3069 int slots, offset;
3070
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003071 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
3072 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003073
3074 slots = 2 * (self->pattern->groups+1);
3075
3076 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3077 if (!copy)
3078 return NULL;
3079
3080 /* this value a constant, but any compiler should be able to
3081 figure that out all by itself */
3082 offset = offsetof(MatchObject, string);
3083
3084 Py_XINCREF(self->pattern);
3085 Py_XINCREF(self->string);
3086 Py_XINCREF(self->regs);
3087
3088 memcpy((char*) copy + offset, (char*) self + offset,
3089 sizeof(MatchObject) + slots * sizeof(int) - offset);
3090
3091 return (PyObject*) copy;
3092#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003093 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003094 return NULL;
3095#endif
3096}
3097
3098static PyObject*
3099match_deepcopy(MatchObject* self, PyObject* args)
3100{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003101#ifdef USE_BUILTIN_COPY
3102 MatchObject* copy;
3103
3104 PyObject* memo;
3105 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
3106 return NULL;
3107
3108 copy = (MatchObject*) match_copy(self, Py_None);
3109 if (!copy)
3110 return NULL;
3111
3112 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3113 !deepcopy(&copy->string, memo) ||
3114 !deepcopy(&copy->regs, memo)) {
3115 Py_DECREF(copy);
3116 return NULL;
3117 }
3118
3119#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003120 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3121 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003122#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003123}
3124
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003125static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003126 {"group", (PyCFunction) match_group, METH_VARARGS},
3127 {"start", (PyCFunction) match_start, METH_VARARGS},
3128 {"end", (PyCFunction) match_end, METH_VARARGS},
3129 {"span", (PyCFunction) match_span, METH_VARARGS},
3130 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3131 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3132 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003133 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
3134 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003135 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003136};
3137
3138static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003139match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003140{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003141 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003142
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003143 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3144 if (res)
3145 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003146
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003147 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003148
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003149 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003150 if (self->lastindex >= 0)
3151 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003152 Py_INCREF(Py_None);
3153 return Py_None;
3154 }
3155
3156 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003157 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003158 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003159 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003160 );
3161 if (result)
3162 return result;
3163 PyErr_Clear();
3164 }
3165 Py_INCREF(Py_None);
3166 return Py_None;
3167 }
3168
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003169 if (!strcmp(name, "string")) {
3170 if (self->string) {
3171 Py_INCREF(self->string);
3172 return self->string;
3173 } else {
3174 Py_INCREF(Py_None);
3175 return Py_None;
3176 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003177 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003178
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003179 if (!strcmp(name, "regs")) {
3180 if (self->regs) {
3181 Py_INCREF(self->regs);
3182 return self->regs;
3183 } else
3184 return match_regs(self);
3185 }
3186
3187 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003188 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003189 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003190 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003191
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003192 if (!strcmp(name, "pos"))
3193 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003195 if (!strcmp(name, "endpos"))
3196 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003197
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003198 PyErr_SetString(PyExc_AttributeError, name);
3199 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003200}
3201
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3204
3205statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003206 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003207 0, "_" SRE_MODULE ".SRE_Match",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003208 sizeof(MatchObject), sizeof(int),
3209 (destructor)match_dealloc, /*tp_dealloc*/
3210 0, /*tp_print*/
3211 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003212};
3213
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003214/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003215/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003216
3217static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003218scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003219{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003220 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003221 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003222 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003223}
3224
3225static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003226scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003227{
3228 SRE_STATE* state = &self->state;
3229 PyObject* match;
3230 int status;
3231
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003232 state_reset(state);
3233
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003234 state->ptr = state->start;
3235
3236 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003237 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003238 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003239#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003240 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003241#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003242 }
3243
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003244 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003245 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003246
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003247 if ((status == 0 || state->ptr == state->start) &&
3248 state->ptr < state->end)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003249 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003250 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003251 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003252
3253 return match;
3254}
3255
3256
3257static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003258scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003259{
3260 SRE_STATE* state = &self->state;
3261 PyObject* match;
3262 int status;
3263
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003264 state_reset(state);
3265
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003266 state->ptr = state->start;
3267
3268 if (state->charsize == 1) {
3269 status = sre_search(state, PatternObject_GetCode(self->pattern));
3270 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003271#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003272 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003273#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003274 }
3275
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003276 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003277 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003278
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003279 if ((status == 0 || state->ptr == state->start) &&
3280 state->ptr < state->end)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003281 state->start = (void*) ((char*) state->ptr + state->charsize);
3282 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003283 state->start = state->ptr;
3284
3285 return match;
3286}
3287
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003288static PyMethodDef scanner_methods[] = {
Neal Norwitzbb2769f2002-03-31 15:46:00 +00003289 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3290 /* METH_OLDARGS is not in Python 1.5.2 */
3291 {"match", (PyCFunction) scanner_match, 0},
3292 {"search", (PyCFunction) scanner_search, 0},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003293 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003294};
3295
3296static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003297scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003298{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003299 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003300
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003301 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3302 if (res)
3303 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003304
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003305 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003306
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003307 /* attributes */
3308 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003309 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003310 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003311 }
3312
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003313 PyErr_SetString(PyExc_AttributeError, name);
3314 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003315}
3316
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003317statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003318 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003319 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003320 sizeof(ScannerObject), 0,
3321 (destructor)scanner_dealloc, /*tp_dealloc*/
3322 0, /*tp_print*/
3323 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003324};
3325
Guido van Rossumb700df92000-03-31 14:59:30 +00003326static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003327 {"compile", _compile, METH_VARARGS},
3328 {"getcodesize", sre_codesize, METH_VARARGS},
3329 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003330 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003331};
3332
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003333#if PY_VERSION_HEX < 0x02030000
3334DL_EXPORT(void) init_sre(void)
3335#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003336PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003337#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003338{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003339 PyObject* m;
3340 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003341 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003342
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003343 /* Patch object types */
3344 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003345 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00003346
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003347 m = Py_InitModule("_" SRE_MODULE, _functions);
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003348 d = PyModule_GetDict(m);
3349
Fredrik Lundh21009b92001-09-18 18:47:09 +00003350 x = PyInt_FromLong(SRE_MAGIC);
3351 if (x) {
3352 PyDict_SetItemString(d, "MAGIC", x);
3353 Py_DECREF(x);
3354 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003355
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003356 x = PyInt_FromLong(sizeof(SRE_CODE));
3357 if (x) {
3358 PyDict_SetItemString(d, "CODESIZE", x);
3359 Py_DECREF(x);
3360 }
3361
Fredrik Lundh21009b92001-09-18 18:47:09 +00003362 x = PyString_FromString(copyright);
3363 if (x) {
3364 PyDict_SetItemString(d, "copyright", x);
3365 Py_DECREF(x);
3366 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003367}
3368
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003369#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003370
3371/* vim:ts=4:sw=4:et
3372*/