blob: f97cb62761a52f6f20486d0f2d5d2c950d8de647 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
42#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000043#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000044
45#include "sre.h"
46
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000047#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000048
Fredrik Lundh436c3d582000-06-29 08:58:44 +000049/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000050#if !defined(SRE_MODULE)
51#define SRE_MODULE "sre"
52#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000053
Guido van Rossumb700df92000-03-31 14:59:30 +000054/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000055#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000056
Fredrik Lundh971e78b2001-10-20 17:48:46 +000057#if PY_VERSION_HEX >= 0x01060000
58#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000059/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000060#define HAVE_UNICODE
61#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000062#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000071#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000072
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000073/* enables copy/deepcopy handling (work in progress) */
74#undef USE_BUILTIN_COPY
75
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000076#if PY_VERSION_HEX < 0x01060000
77#define PyObject_DEL(op) PyMem_DEL((op))
78#endif
79
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000080/* -------------------------------------------------------------------- */
81
Fredrik Lundh80946112000-06-29 18:03:25 +000082#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000083#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000084#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000085/* fastest possible local call under MSVC */
86#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000087#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000088#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089#else
90#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000091#endif
92
93/* error codes */
94#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000095#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000096#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000097#define SRE_ERROR_MEMORY -9 /* out of memory */
98
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000099#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000100#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#else
102#define TRACE(v)
103#endif
104
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000105/* -------------------------------------------------------------------- */
106/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000107
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000108/* default character predicates (run sre_chars.py to regenerate tables) */
109
110#define SRE_DIGIT_MASK 1
111#define SRE_SPACE_MASK 2
112#define SRE_LINEBREAK_MASK 4
113#define SRE_ALNUM_MASK 8
114#define SRE_WORD_MASK 16
115
Fredrik Lundh21009b92001-09-18 18:47:09 +0000116/* FIXME: this assumes ASCII. create tables in init_sre() instead */
117
/* Per-character class bits for code points 0..127 (ASCII is assumed).
   Each entry is an OR of the SRE_*_MASK values:
     1 = digit, 2 = space, 4 = linebreak, 8 = alnum, 16 = word.
   So 25 = digit|alnum|word, 24 = alnum|word, 6 = space|linebreak. */
static char sre_char_info[128] = {
    /*   0- 15 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /*  16- 31 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  32- 47 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  48- 63 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /*  64- 79 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80- 95 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /*  96-111 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112-127 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};
125
/* Lower-case mapping for code points 0..127 (ASCII is assumed).
   Every entry maps to itself except 65..90 ('A'..'Z'), which map to
   97..122 ('a'..'z'). */
static char sre_char_lower[128] = {
    /*   0- 15 */   0,   1,   2,   3,   4,   5,   6,   7,
                    8,   9,  10,  11,  12,  13,  14,  15,
    /*  16- 31 */  16,  17,  18,  19,  20,  21,  22,  23,
                   24,  25,  26,  27,  28,  29,  30,  31,
    /*  32- 47 */  32,  33,  34,  35,  36,  37,  38,  39,
                   40,  41,  42,  43,  44,  45,  46,  47,
    /*  48- 63 */  48,  49,  50,  51,  52,  53,  54,  55,
                   56,  57,  58,  59,  60,  61,  62,  63,
    /*  64- 79 */  64,  97,  98,  99, 100, 101, 102, 103,
                  104, 105, 106, 107, 108, 109, 110, 111,
    /*  80- 95 */ 112, 113, 114, 115, 116, 117, 118, 119,
                  120, 121, 122,  91,  92,  93,  94,  95,
    /*  96-111 */  96,  97,  98,  99, 100, 101, 102, 103,
                  104, 105, 106, 107, 108, 109, 110, 111,
    /* 112-127 */ 112, 113, 114, 115, 116, 117, 118, 119,
                  120, 121, 122, 123, 124, 125, 126, 127
};
135
/* Default (table-driven) character predicates.  Each one yields the
   selected mask bit of sre_char_info[ch] (so a nonzero, not necessarily
   1, value on a hit) for ch < 128, and 0 for anything else.  Note that
   the argument is evaluated more than once. */
#define SRE_CHAR_INFO_BITS(ch, mask)\
    ((ch) < 128 ? (sre_char_info[(ch)] & (mask)) : 0)

#define SRE_IS_DIGIT(ch)     SRE_CHAR_INFO_BITS(ch, SRE_DIGIT_MASK)
#define SRE_IS_SPACE(ch)     SRE_CHAR_INFO_BITS(ch, SRE_SPACE_MASK)
#define SRE_IS_LINEBREAK(ch) SRE_CHAR_INFO_BITS(ch, SRE_LINEBREAK_MASK)
#define SRE_IS_ALNUM(ch)     SRE_CHAR_INFO_BITS(ch, SRE_ALNUM_MASK)
#define SRE_IS_WORD(ch)      SRE_CHAR_INFO_BITS(ch, SRE_WORD_MASK)
Guido van Rossumb700df92000-03-31 14:59:30 +0000146
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000147static unsigned int sre_lower(unsigned int ch)
148{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000149 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000150}
151
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000152/* locale-specific character predicates */
/* Locale-aware predicates built on <ctype.h>.  The ctype functions are
   only consulted for values 0..255; anything larger yields 0.

   The range test is written as !(c & ~255) rather than (c < 256): the
   two are equivalent for any unsigned c, and the mask form avoids
   warnings when c's type supports only numbers below 256. */
#define SRE_LOC_FITS_BYTE(ch) (!((ch) & ~255))

#define SRE_LOC_IS_DIGIT(ch) (SRE_LOC_FITS_BYTE(ch) ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) (SRE_LOC_FITS_BYTE(ch) ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (SRE_LOC_FITS_BYTE(ch) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
160
/* Lower-case ch with the C library's tolower() (and therefore the
   current locale); code points of 256 and above are returned
   unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int)tolower((ch));
}
165
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000166/* unicode-specific character predicates */
167
168#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000169
/* Unicode predicates: each forwards to the matching Py_UNICODE_IS*
   macro after casting the argument to Py_UNICODE.  "Word" means
   alphanumeric or underscore. */
#define SRE_UNI_IS_DIGIT(ch)     Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch)     Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch)     Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch)      (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000175
176static unsigned int sre_lower_unicode(unsigned int ch)
177{
178 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
179}
180
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000181#endif
182
Guido van Rossumb700df92000-03-31 14:59:30 +0000183LOCAL(int)
184sre_category(SRE_CODE category, unsigned int ch)
185{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000186 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000187
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000188 case SRE_CATEGORY_DIGIT:
189 return SRE_IS_DIGIT(ch);
190 case SRE_CATEGORY_NOT_DIGIT:
191 return !SRE_IS_DIGIT(ch);
192 case SRE_CATEGORY_SPACE:
193 return SRE_IS_SPACE(ch);
194 case SRE_CATEGORY_NOT_SPACE:
195 return !SRE_IS_SPACE(ch);
196 case SRE_CATEGORY_WORD:
197 return SRE_IS_WORD(ch);
198 case SRE_CATEGORY_NOT_WORD:
199 return !SRE_IS_WORD(ch);
200 case SRE_CATEGORY_LINEBREAK:
201 return SRE_IS_LINEBREAK(ch);
202 case SRE_CATEGORY_NOT_LINEBREAK:
203 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000204
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000205 case SRE_CATEGORY_LOC_WORD:
206 return SRE_LOC_IS_WORD(ch);
207 case SRE_CATEGORY_LOC_NOT_WORD:
208 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000209
210#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000211 case SRE_CATEGORY_UNI_DIGIT:
212 return SRE_UNI_IS_DIGIT(ch);
213 case SRE_CATEGORY_UNI_NOT_DIGIT:
214 return !SRE_UNI_IS_DIGIT(ch);
215 case SRE_CATEGORY_UNI_SPACE:
216 return SRE_UNI_IS_SPACE(ch);
217 case SRE_CATEGORY_UNI_NOT_SPACE:
218 return !SRE_UNI_IS_SPACE(ch);
219 case SRE_CATEGORY_UNI_WORD:
220 return SRE_UNI_IS_WORD(ch);
221 case SRE_CATEGORY_UNI_NOT_WORD:
222 return !SRE_UNI_IS_WORD(ch);
223 case SRE_CATEGORY_UNI_LINEBREAK:
224 return SRE_UNI_IS_LINEBREAK(ch);
225 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
226 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000227#else
228 case SRE_CATEGORY_UNI_DIGIT:
229 return SRE_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_NOT_DIGIT:
231 return !SRE_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_SPACE:
233 return SRE_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_NOT_SPACE:
235 return !SRE_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_WORD:
237 return SRE_LOC_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_NOT_WORD:
239 return !SRE_LOC_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_LINEBREAK:
241 return SRE_IS_LINEBREAK(ch);
242 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
243 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000244#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000245 }
246 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000247}
248
249/* helpers */
250
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000251static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000253{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000254 if (state->data_stack) {
255 free(state->data_stack);
256 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000257 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000258 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000259}
260
261static int
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000262data_stack_grow(SRE_STATE* state, int size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000263{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000264 int minsize, cursize;
265 minsize = state->data_stack_base+size;
266 cursize = state->data_stack_size;
267 if (cursize < minsize) {
268 void* stack;
269 cursize = minsize+minsize/4+1024;
270 TRACE(("allocate/grow stack %d\n", cursize));
271 stack = realloc(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000273 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000274 return SRE_ERROR_MEMORY;
275 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000276 state->data_stack = stack;
277 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000278 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000279 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000280}
281
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000282/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000283
284#define SRE_CHAR unsigned char
285#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000286#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000287#define SRE_CHARSET sre_charset
288#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000289#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000290#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000291#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000292#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000293
294#if defined(HAVE_UNICODE)
295
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000297#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000298#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000299
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000300#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000301#undef SRE_SEARCH
302#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000303#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000304#undef SRE_INFO
305#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000306#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000307#undef SRE_AT
308#undef SRE_CHAR
309
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000310/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000311
312#define SRE_CHAR Py_UNICODE
313#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000314#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000315#define SRE_CHARSET sre_ucharset
316#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000317#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000318#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000319#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000320#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000321#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000322
323#endif /* SRE_RECURSIVE */
324
325/* -------------------------------------------------------------------- */
326/* String matching engine */
327
328/* the following section is compiled twice, with different character
329 settings */
330
331LOCAL(int)
332SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
333{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000334 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000336 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000338 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000341 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 case SRE_AT_BEGINNING_LINE:
345 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000346 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000348 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000349 return (((void*) (ptr+1) == state->end &&
350 SRE_IS_LINEBREAK((int) ptr[0])) ||
351 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000353 case SRE_AT_END_LINE:
354 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000355 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000356
Fredrik Lundh770617b2001-01-14 15:06:11 +0000357 case SRE_AT_END_STRING:
358 return ((void*) ptr == state->end);
359
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000360 case SRE_AT_BOUNDARY:
361 if (state->beginning == state->end)
362 return 0;
363 that = ((void*) ptr > state->beginning) ?
364 SRE_IS_WORD((int) ptr[-1]) : 0;
365 this = ((void*) ptr < state->end) ?
366 SRE_IS_WORD((int) ptr[0]) : 0;
367 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000368
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000369 case SRE_AT_NON_BOUNDARY:
370 if (state->beginning == state->end)
371 return 0;
372 that = ((void*) ptr > state->beginning) ?
373 SRE_IS_WORD((int) ptr[-1]) : 0;
374 this = ((void*) ptr < state->end) ?
375 SRE_IS_WORD((int) ptr[0]) : 0;
376 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000377
378 case SRE_AT_LOC_BOUNDARY:
379 if (state->beginning == state->end)
380 return 0;
381 that = ((void*) ptr > state->beginning) ?
382 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
383 this = ((void*) ptr < state->end) ?
384 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
385 return this != that;
386
387 case SRE_AT_LOC_NON_BOUNDARY:
388 if (state->beginning == state->end)
389 return 0;
390 that = ((void*) ptr > state->beginning) ?
391 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
392 this = ((void*) ptr < state->end) ?
393 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
394 return this == that;
395
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000396#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000397 case SRE_AT_UNI_BOUNDARY:
398 if (state->beginning == state->end)
399 return 0;
400 that = ((void*) ptr > state->beginning) ?
401 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
402 this = ((void*) ptr < state->end) ?
403 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
404 return this != that;
405
406 case SRE_AT_UNI_NON_BOUNDARY:
407 if (state->beginning == state->end)
408 return 0;
409 that = ((void*) ptr > state->beginning) ?
410 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
411 this = ((void*) ptr < state->end) ?
412 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
413 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000414#endif
415
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000416 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000417
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000418 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000419}
420
421LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000422SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000423{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000424 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000425
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000427
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000428 for (;;) {
429 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000430
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000431 case SRE_OP_FAILURE:
432 return !ok;
433
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000434 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000435 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000436 if (ch == set[0])
437 return ok;
438 set++;
439 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000440
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000441 case SRE_OP_CATEGORY:
442 /* <CATEGORY> <code> */
443 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000444 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000445 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000446 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000447
Fredrik Lundh3562f112000-07-02 12:00:07 +0000448 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000449 if (sizeof(SRE_CODE) == 2) {
450 /* <CHARSET> <bitmap> (16 bits per code word) */
451 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
452 return ok;
453 set += 16;
454 }
455 else {
456 /* <CHARSET> <bitmap> (32 bits per code word) */
457 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
458 return ok;
459 set += 8;
460 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000461 break;
462
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000463 case SRE_OP_RANGE:
464 /* <RANGE> <lower> <upper> */
465 if (set[0] <= ch && ch <= set[1])
466 return ok;
467 set += 2;
468 break;
469
470 case SRE_OP_NEGATE:
471 ok = !ok;
472 break;
473
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000474 case SRE_OP_BIGCHARSET:
475 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
476 {
477 int count, block;
478 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000479
480 if (sizeof(SRE_CODE) == 2) {
481 block = ((unsigned char*)set)[ch >> 8];
482 set += 128;
483 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
484 return ok;
485 set += count*16;
486 }
487 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000488 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
489 * warnings when c's type supports only numbers < N+1 */
490 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000491 block = ((unsigned char*)set)[ch >> 8];
492 else
493 block = -1;
494 set += 64;
495 if (block >=0 &&
496 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
497 return ok;
498 set += count*8;
499 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000500 break;
501 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000502
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000503 default:
504 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000505 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000506 return 0;
507 }
508 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000509}
510
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000511LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000512
513LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000514SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000515{
516 SRE_CODE chr;
517 SRE_CHAR* ptr = state->ptr;
518 SRE_CHAR* end = state->end;
519 int i;
520
521 /* adjust end */
522 if (maxcount < end - ptr && maxcount != 65535)
523 end = ptr + maxcount;
524
525 switch (pattern[0]) {
526
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000527 case SRE_OP_IN:
528 /* repeated set */
529 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
530 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
531 ptr++;
532 break;
533
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000534 case SRE_OP_ANY:
535 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000536 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000537 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
538 ptr++;
539 break;
540
541 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000542 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000543 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000544 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000545 ptr = end;
546 break;
547
548 case SRE_OP_LITERAL:
549 /* repeated literal */
550 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000551 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000552 while (ptr < end && (SRE_CODE) *ptr == chr)
553 ptr++;
554 break;
555
556 case SRE_OP_LITERAL_IGNORE:
557 /* repeated literal */
558 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000559 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000560 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
561 ptr++;
562 break;
563
564 case SRE_OP_NOT_LITERAL:
565 /* repeated non-literal */
566 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000567 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000568 while (ptr < end && (SRE_CODE) *ptr != chr)
569 ptr++;
570 break;
571
572 case SRE_OP_NOT_LITERAL_IGNORE:
573 /* repeated non-literal */
574 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000575 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000576 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
577 ptr++;
578 break;
579
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000580 default:
581 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000582 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000583 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000584 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 if (i < 0)
586 return i;
587 if (!i)
588 break;
589 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000590 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
591 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000592 return (SRE_CHAR*) state->ptr - ptr;
593 }
594
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000595 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000596 return ptr - (SRE_CHAR*) state->ptr;
597}
598
Fredrik Lundh33accc12000-08-27 20:59:47 +0000599#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000600LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000601SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
602{
603 /* check if an SRE_OP_INFO block matches at the current position.
604 returns the number of SRE_CODE objects to skip if successful, 0
605 if no match */
606
607 SRE_CHAR* end = state->end;
608 SRE_CHAR* ptr = state->ptr;
609 int i;
610
611 /* check minimal length */
612 if (pattern[3] && (end - ptr) < pattern[3])
613 return 0;
614
615 /* check known prefix */
616 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
617 /* <length> <skip> <prefix data> <overlap data> */
618 for (i = 0; i < pattern[5]; i++)
619 if ((SRE_CODE) ptr[i] != pattern[7 + i])
620 return 0;
621 return pattern[0] + 2 * pattern[6];
622 }
623 return pattern[0];
624}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000625#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000626
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000627/* The macros below should be used to protect recursive SRE_MATCH()
628 * calls that *failed* and do *not* return immediately (IOW, those
629 * that will backtrack). Explaining:
630 *
631 * - Recursive SRE_MATCH() returned true: that's usually a success
632 * (besides atypical cases like ASSERT_NOT), therefore there's no
633 * reason to restore lastmark;
634 *
635 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
636 * is returning to the caller: If the current SRE_MATCH() is the
637 * top function of the recursion, returning false will be a matching
638 * failure, and it doesn't matter where lastmark is pointing to.
639 * If it's *not* the top function, it will be a recursive SRE_MATCH()
640 * failure by itself, and the calling SRE_MATCH() will have to deal
641 * with the failure by the same rules explained here (it will restore
642 * lastmark by itself if necessary);
643 *
644 * - Recursive SRE_MATCH() returned false, and will continue the
645 * outside 'for' loop: must be protected when breaking, since the next
646 * OP could potentially depend on lastmark;
647 *
648 * - Recursive SRE_MATCH() returned false, and will be called again
649 * inside a local for/while loop: must be protected between each
650 * loop iteration, since the recursive SRE_MATCH() could do anything,
651 * and could potentially depend on lastmark.
652 *
653 * For more information, check the discussion at SF patch #712900.
654 */
/* Copy state->lastmark / state->lastindex into the current match
   context (`ctx`, taken from the enclosing scope) ... */
#define LASTMARK_SAVE()                         \
    do {                                        \
        ctx->lastmark = state->lastmark;        \
        ctx->lastindex = state->lastindex;      \
    } while (0)

/* ... and copy them back from the context into the state. */
#define LASTMARK_RESTORE()                      \
    do {                                        \
        state->lastmark = ctx->lastmark;        \
        state->lastindex = ctx->lastindex;      \
    } while (0)
665
/* Exit helpers for the matcher body.
 *
 * RETURN_ERROR leaves the enclosing function at once with the given
 * (negative) status; it does not pass through the `exit` label.
 * RETURN_FAILURE / RETURN_SUCCESS store 0 / 1 in the local `ret` and
 * jump to the `exit` label of the enclosing function.
 *
 * The RETURN_ON_* forms act only when the status has the named
 * outcome (<0 error, >0 success, ==0 failure); an error always takes
 * precedence. Their argument is parenthesized so an expression such
 * as `a & b` is tested as a whole, but it is still evaluated more
 * than once, so pass something free of side effects.
 */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
676
/* Stringify a macro argument; used to print type names in TRACE. */
#define SFY(x) #x

/* Reserve sizeof(type) bytes on top of the data stack and make `ptr`
 * point at them.
 *
 * Relies on these names from the enclosing function: `alloc_pos`
 * (receives the offset of the new slot), `ctx_pos` and `ctx`.
 * When the stack is too small it is enlarged with data_stack_grow();
 * a negative result is returned from the enclosing function.  After a
 * grow, `ctx` is looked up again from `ctx_pos` (unless no context
 * exists yet, i.e. ctx_pos == -1) -- presumably because growing may
 * move the stack buffer; confirm against data_stack_grow().
 *
 * The sizeof value is cast to int for TRACE because it is printed
 * with a %d conversion.
 */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), alloc_pos, (int) sizeof(type))); \
    if ((state)->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    ptr = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)
693
/* Point `ptr` at the object of the given type stored at byte offset
 * `pos` of the data stack.  Nothing is copied and the stack top is
 * left untouched.  `pos` is parenthesized so that an expression
 * argument is added as a whole.
 */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), (pos))); \
    ptr = (type*)((state)->data_stack+(pos)); \
} while (0)
699
/* Copy `size` bytes starting at `data` onto the top of the data stack
 * and advance the stack top by `size`.
 *
 * Relies on `ctx_pos` and `ctx` from the enclosing function.  When the
 * stack is too small it is enlarged with data_stack_grow(); a negative
 * result is returned from the enclosing function, and on success `ctx`
 * is looked up again from `ctx_pos` (skipped when ctx_pos == -1).
 *
 * `size` and `data` are evaluated several times, so they must be free
 * of side effects; both are parenthesized so expression arguments
 * combine correctly.  The TRACE arguments are cast to match the %p
 * and %d conversions used in the format string.
 */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           (void *)(data), (state)->data_stack_base, (int)(size))); \
    if ((state)->data_stack_size < (state)->data_stack_base+(size)) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)
713
/* Copy the topmost `size` bytes of the data stack into `data`.
 * When `discard` is non-zero the stack top is also lowered by `size`;
 * otherwise the bytes stay on the stack and can be read again.
 *
 * `size` is parenthesized everywhere it is used: without that, an
 * argument such as `a + b` would turn `base - size` into
 * `base - a + b` and read from the wrong offset.  It is evaluated
 * several times, so it must be free of side effects.
 */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           (void *)(data), (int)((state)->data_stack_base-(size)), \
           (int)(size))); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), \
           (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* Lower the stack top by `size` bytes without copying anything out. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           (int)((state)->data_stack_base-(size)), (int)(size))); \
    (state)->data_stack_base -= (size); \
} while(0)
729
/* Convenience wrappers that always operate on the variable named
 * `state` in the enclosing function.
 *
 * DATA_PUSH / DATA_POP / DATA_POP_DISCARD take a pointer `x` and
 * transfer (or drop) sizeof(*x) bytes; DATA_POP always removes the
 * bytes from the stack.  DATA_ALLOC reserves room for one object of
 * type `t` and stores its address in `p`; DATA_LOOKUP_AT points `p`
 * at the object of type `t` at stack offset `pos`.
 */
#define DATA_PUSH(x) \
    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
    DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
    DATA_STACK_LOOKUP_AT(state,t,p,pos)
740
/* Save / restore the first lastmark+1 entries of state->mark on the
 * data stack.  All four macros do nothing when `lastmark` is not
 * positive, so a push and its matching pop must be given the same
 * value.
 *
 * MARK_PUSH         copy the marks onto the stack (uses the local `i`
 *                   of the enclosing function as scratch space)
 * MARK_POP          copy them back and remove them from the stack
 * MARK_POP_KEEP     copy them back but leave them on the stack
 * MARK_POP_DISCARD  remove them from the stack without copying
 */
#define MARK_PUSH(lastmark) \
    do if ((lastmark) > 0) { \
        i = (lastmark); /* ctx->lastmark may change if reallocated */ \
        DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
    } while (0)
#define MARK_POP(lastmark) \
    do if ((lastmark) > 0) { \
        DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 1); \
    } while (0)
#define MARK_POP_KEEP(lastmark) \
    do if ((lastmark) > 0) { \
        DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 0); \
    } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do if ((lastmark) > 0) { \
        DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
    } while (0)
758
/* Resume points for the emulated recursion in SRE_MATCH().  DO_JUMP
 * stores one of these codes in the new context; when that context
 * finishes, the code selects the label to continue at in the context
 * that started it.  JUMP_NONE marks the outermost context, which has
 * nothing to resume.
 */
#define JUMP_NONE            0
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
773
/* Emulate a recursive SRE_MATCH() call without using the C stack.
 *
 * A fresh SRE_MATCH_CONTEXT is allocated on the data stack and records
 * where to come back to (`jumpvalue`, plus the offset of the current
 * context) and which pattern code to run (`nextpattern`).  It then
 * becomes the current context and control jumps to `entrance`.  When
 * that context finishes, the exit code of SRE_MATCH() dispatches on
 * the stored jump value and lands on `jumplabel`, i.e. right after the
 * DO_JUMP invocation, with the outcome in `ret`.
 *
 * `nextpattern` is evaluated after DATA_ALLOC on purpose, so that an
 * argument written in terms of `ctx` sees the pointer as refreshed by
 * the allocation.
 *
 * This expands to several statements and a label, not to a single
 * do/while statement: never use it as the unbraced body of an
 * if/else/loop.  The definition deliberately has no line continuation
 * on its last line, so it cannot absorb the line that follows it.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = (jumpvalue); \
    nextctx->pattern = (nextpattern); \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
784
785typedef struct {
786 int last_ctx_pos;
787 int jump;
788 SRE_CHAR* ptr;
789 SRE_CODE* pattern;
790 int count;
791 int lastmark;
792 int lastindex;
793 union {
794 SRE_CODE chr;
795 SRE_REPEAT* rep;
796 } u;
797} SRE_MATCH_CONTEXT;
798
799/* check if string matches the given pattern. returns <0 for
800 error, 0 for failure, and 1 for success */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000801LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000802SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000803{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000804 SRE_CHAR* end = state->end;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000805 int alloc_pos, ctx_pos = -1;
806 int i, ret = 0;
807 int jump;
Guido van Rossumb700df92000-03-31 14:59:30 +0000808
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000809 SRE_MATCH_CONTEXT* ctx;
810 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000811
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000812 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000813
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000814 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
815 ctx->last_ctx_pos = -1;
816 ctx->jump = JUMP_NONE;
817 ctx->pattern = pattern;
818 ctx_pos = alloc_pos;
819
820entrance:
821
822 ctx->ptr = state->ptr;
823
824 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000825 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000826 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000827 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000828 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 (end - ctx->ptr), ctx->pattern[3]));
830 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000831 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000832 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000833 }
834
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000835 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000836
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000837 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000839 case SRE_OP_MARK:
840 /* set mark */
841 /* <MARK> <gid> */
842 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
843 ctx->ptr, ctx->pattern[0]));
844 i = ctx->pattern[0];
845 if (i & 1)
846 state->lastindex = i/2 + 1;
847 if (i > state->lastmark) {
848 /* state->lastmark is the highest valid index in the
849 state->mark array. If it is increased by more than 1,
850 the intervening marks must be set to NULL to signal
851 that these marks have not been encountered. */
852 int j = state->lastmark + 1;
853 while (j < i)
854 state->mark[j++] = NULL;
855 state->lastmark = i;
856 }
857 state->mark[i] = ctx->ptr;
858 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000859 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000860
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 case SRE_OP_LITERAL:
862 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000863 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000864 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
865 ctx->ptr, *ctx->pattern));
866 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
867 RETURN_FAILURE;
868 ctx->pattern++;
869 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000871
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 case SRE_OP_NOT_LITERAL:
873 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000874 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000875 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
876 ctx->ptr, *ctx->pattern));
877 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
878 RETURN_FAILURE;
879 ctx->pattern++;
880 ctx->ptr++;
881 break;
882
883 case SRE_OP_SUCCESS:
884 /* end of pattern */
885 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
886 state->ptr = ctx->ptr;
887 RETURN_SUCCESS;
888
889 case SRE_OP_AT:
890 /* match at given position */
891 /* <AT> <code> */
892 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
893 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
894 RETURN_FAILURE;
895 ctx->pattern++;
896 break;
897
898 case SRE_OP_CATEGORY:
899 /* match at given category */
900 /* <CATEGORY> <code> */
901 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
902 ctx->ptr, *ctx->pattern));
903 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
904 RETURN_FAILURE;
905 ctx->pattern++;
906 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000907 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000910 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000911 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000912 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
913 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
914 RETURN_FAILURE;
915 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000916 break;
917
918 case SRE_OP_ANY_ALL:
919 /* match anything */
920 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000921 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
922 if (ctx->ptr >= end)
923 RETURN_FAILURE;
924 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000925 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000927 case SRE_OP_IN:
928 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000929 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000930 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
931 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
932 RETURN_FAILURE;
933 ctx->pattern += ctx->pattern[0];
934 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000935 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000936
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000938 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
939 ctx->pattern, ctx->ptr, ctx->pattern[0]));
940 if (ctx->ptr >= end ||
941 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
942 RETURN_FAILURE;
943 ctx->pattern++;
944 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000945 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000946
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000948 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
949 ctx->pattern, ctx->ptr, *ctx->pattern));
950 if (ctx->ptr >= end ||
951 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
952 RETURN_FAILURE;
953 ctx->pattern++;
954 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000955 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000956
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000957 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000958 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
959 if (ctx->ptr >= end
960 || !SRE_CHARSET(ctx->pattern+1,
961 (SRE_CODE)state->lower(*ctx->ptr)))
962 RETURN_FAILURE;
963 ctx->pattern += ctx->pattern[0];
964 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000965 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000966
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000967 case SRE_OP_JUMP:
968 case SRE_OP_INFO:
969 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000970 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000971 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
972 ctx->ptr, ctx->pattern[0]));
973 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000974 break;
975
976 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000977 /* alternation */
978 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000979 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000980 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000981 ctx->u.rep = state->repeat;
982 if (ctx->u.rep)
983 MARK_PUSH(ctx->lastmark);
984 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
985 if (ctx->pattern[1] == SRE_OP_LITERAL &&
986 (ctx->ptr >= end ||
987 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000988 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000989 if (ctx->pattern[1] == SRE_OP_IN &&
990 (ctx->ptr >= end ||
991 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000992 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000993 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000994 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000995 if (ret) {
996 if (ctx->u.rep)
997 MARK_POP_DISCARD(ctx->lastmark);
998 RETURN_ON_ERROR(ret);
999 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001000 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001001 if (ctx->u.rep)
1002 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001003 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001004 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001005 if (ctx->u.rep)
1006 MARK_POP_DISCARD(ctx->lastmark);
1007 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001008
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001009 case SRE_OP_REPEAT_ONE:
1010 /* match repeated sequence (maximizing regexp) */
1011
1012 /* this operator only works if the repeated item is
1013 exactly one character wide, and we're not already
1014 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001015 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001016
1017 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1018
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1020 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001021
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001022 if (ctx->ptr + ctx->pattern[1] > end)
1023 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001024
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001025 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001026
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001027 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1028 RETURN_ON_ERROR(ret);
1029 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1030 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001031 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001032
1033 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035 string. check if the rest of the pattern matches,
1036 and backtrack if not. */
1037
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001038 if (ctx->count < (int) ctx->pattern[1])
1039 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001041 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 state->ptr = ctx->ptr;
1044 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001045 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001046
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001047 LASTMARK_SAVE();
1048
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001049 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001050 /* tail starts with a literal. skip positions where
1051 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001052 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001053 for (;;) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 while (ctx->count >= (int) ctx->pattern[1] &&
1055 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1056 ctx->ptr--;
1057 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001058 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001059 if (ctx->count < (int) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001062 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1063 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 if (ret) {
1065 RETURN_ON_ERROR(ret);
1066 RETURN_SUCCESS;
1067 }
1068
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001069 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001070
1071 ctx->ptr--;
1072 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001073 }
1074
1075 } else {
1076 /* general case */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001077 while (ctx->count >= (int) ctx->pattern[1]) {
1078 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001079 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1080 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 if (ret) {
1082 RETURN_ON_ERROR(ret);
1083 RETURN_SUCCESS;
1084 }
1085 ctx->ptr--;
1086 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001087 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001088 }
1089 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001090 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001091
Guido van Rossum41c99e72003-04-14 17:59:34 +00001092 case SRE_OP_MIN_REPEAT_ONE:
1093 /* match repeated sequence (minimizing regexp) */
1094
1095 /* this operator only works if the repeated item is
1096 exactly one character wide, and we're not already
1097 collecting backtracking points. for other cases,
1098 use the MIN_REPEAT operator */
1099
1100 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1101
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001102 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1103 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001104
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001105 if (ctx->ptr + ctx->pattern[1] > end)
1106 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001107
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001108 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 if (ctx->pattern[1] == 0)
1111 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001112 else {
1113 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001114 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1115 RETURN_ON_ERROR(ret);
1116 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1117 if (ret < (int) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001118 /* didn't match minimum number of times */
1119 RETURN_FAILURE;
1120 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001121 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001122 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001123 }
1124
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001125 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001126 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 state->ptr = ctx->ptr;
1128 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001129
1130 } else {
1131 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001132 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001133 while ((int)ctx->pattern[2] == 65535
1134 || ctx->count <= (int)ctx->pattern[2]) {
1135 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1137 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001138 if (ret) {
1139 RETURN_ON_ERROR(ret);
1140 RETURN_SUCCESS;
1141 }
1142 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001143 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001144 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001145 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001147 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 assert(ret == 1);
1149 ctx->ptr++;
1150 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001151 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001152 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001153 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001154 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001155
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001156 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001157 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001158 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001159 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001160 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1161 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001162
1163 /* install new repeat context */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001164 ctx->u.rep = (SRE_REPEAT*) malloc(sizeof(*ctx->u.rep));
1165 ctx->u.rep->count = -1;
1166 ctx->u.rep->pattern = ctx->pattern;
1167 ctx->u.rep->prev = state->repeat;
1168 ctx->u.rep->last_ptr = NULL;
1169 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001170
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001171 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001172 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001173 state->repeat = ctx->u.rep->prev;
1174 free(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001175
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001176 if (ret) {
1177 RETURN_ON_ERROR(ret);
1178 RETURN_SUCCESS;
1179 }
1180 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001181
1182 case SRE_OP_MAX_UNTIL:
1183 /* maximizing repeat */
1184 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1185
1186 /* FIXME: we probably need to deal with zero-width
1187 matches in here... */
1188
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001189 ctx->u.rep = state->repeat;
1190 if (!ctx->u.rep)
1191 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001192
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001193 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001194
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001196
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1198 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001199
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001200 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001201 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001203 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1204 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001205 if (ret) {
1206 RETURN_ON_ERROR(ret);
1207 RETURN_SUCCESS;
1208 }
1209 ctx->u.rep->count = ctx->count-1;
1210 state->ptr = ctx->ptr;
1211 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001212 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001213
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001214 if ((ctx->count < ctx->u.rep->pattern[2] ||
1215 ctx->u.rep->pattern[2] == 65535) &&
1216 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001217 /* we may have enough matches, but if we can
1218 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001219 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001220 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001221 MARK_PUSH(ctx->lastmark);
1222 /* zero-width match protection */
1223 DATA_PUSH(&ctx->u.rep->last_ptr);
1224 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1226 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 DATA_POP(&ctx->u.rep->last_ptr);
1228 if (ret) {
1229 MARK_POP_DISCARD(ctx->lastmark);
1230 RETURN_ON_ERROR(ret);
1231 RETURN_SUCCESS;
1232 }
1233 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001234 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001235 ctx->u.rep->count = ctx->count-1;
1236 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001237 }
1238
1239 /* cannot match more repeated items here. make sure the
1240 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001241 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001242 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001243 RETURN_ON_SUCCESS(ret);
1244 state->repeat = ctx->u.rep;
1245 state->ptr = ctx->ptr;
1246 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001247
1248 case SRE_OP_MIN_UNTIL:
1249 /* minimizing repeat */
1250 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1251
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001252 ctx->u.rep = state->repeat;
1253 if (!ctx->u.rep)
1254 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001255
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001256 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001257
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001258 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001259
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001260 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1261 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001262
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001263 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001264 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001266 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1267 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001268 if (ret) {
1269 RETURN_ON_ERROR(ret);
1270 RETURN_SUCCESS;
1271 }
1272 ctx->u.rep->count = ctx->count-1;
1273 state->ptr = ctx->ptr;
1274 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001275 }
1276
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001277 LASTMARK_SAVE();
1278
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001279 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001280 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001281 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001282 if (ret) {
1283 RETURN_ON_ERROR(ret);
1284 RETURN_SUCCESS;
1285 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001286
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 state->repeat = ctx->u.rep;
1288 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001289
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001290 LASTMARK_RESTORE();
1291
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001292 if (ctx->count >= ctx->u.rep->pattern[2]
1293 && ctx->u.rep->pattern[2] != 65535)
1294 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001295
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001296 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001297 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1298 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001299 if (ret) {
1300 RETURN_ON_ERROR(ret);
1301 RETURN_SUCCESS;
1302 }
1303 ctx->u.rep->count = ctx->count-1;
1304 state->ptr = ctx->ptr;
1305 RETURN_FAILURE;
1306
1307 case SRE_OP_GROUPREF:
1308 /* match backreference */
1309 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1310 ctx->ptr, ctx->pattern[0]));
1311 i = ctx->pattern[0];
1312 {
1313 int groupref = i+i;
1314 if (groupref >= state->lastmark) {
1315 RETURN_FAILURE;
1316 } else {
1317 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1318 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1319 if (!p || !e || e < p)
1320 RETURN_FAILURE;
1321 while (p < e) {
1322 if (ctx->ptr >= end || *ctx->ptr != *p)
1323 RETURN_FAILURE;
1324 p++; ctx->ptr++;
1325 }
1326 }
1327 }
1328 ctx->pattern++;
1329 break;
1330
1331 case SRE_OP_GROUPREF_IGNORE:
1332 /* match backreference */
1333 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1334 ctx->ptr, ctx->pattern[0]));
1335 i = ctx->pattern[0];
1336 {
1337 int groupref = i+i;
1338 if (groupref >= state->lastmark) {
1339 RETURN_FAILURE;
1340 } else {
1341 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1342 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1343 if (!p || !e || e < p)
1344 RETURN_FAILURE;
1345 while (p < e) {
1346 if (ctx->ptr >= end ||
1347 state->lower(*ctx->ptr) != state->lower(*p))
1348 RETURN_FAILURE;
1349 p++; ctx->ptr++;
1350 }
1351 }
1352 }
1353 ctx->pattern++;
1354 break;
1355
1356 case SRE_OP_GROUPREF_EXISTS:
1357 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1358 ctx->ptr, ctx->pattern[0]));
1359 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1360 i = ctx->pattern[0];
1361 {
1362 int groupref = i+i;
1363 if (groupref >= state->lastmark) {
1364 ctx->pattern += ctx->pattern[1];
1365 break;
1366 } else {
1367 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1368 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1369 if (!p || !e || e < p) {
1370 ctx->pattern += ctx->pattern[1];
1371 break;
1372 }
1373 }
1374 }
1375 ctx->pattern += 2;
1376 break;
1377
1378 case SRE_OP_ASSERT:
1379 /* assert subpattern */
1380 /* <ASSERT> <skip> <back> <pattern> */
1381 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1382 ctx->ptr, ctx->pattern[1]));
1383 state->ptr = ctx->ptr - ctx->pattern[1];
1384 if (state->ptr < state->beginning)
1385 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001386 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001387 RETURN_ON_FAILURE(ret);
1388 ctx->pattern += ctx->pattern[0];
1389 break;
1390
1391 case SRE_OP_ASSERT_NOT:
1392 /* assert not subpattern */
1393 /* <ASSERT_NOT> <skip> <back> <pattern> */
1394 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1395 ctx->ptr, ctx->pattern[1]));
1396 state->ptr = ctx->ptr - ctx->pattern[1];
1397 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001398 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001399 if (ret) {
1400 RETURN_ON_ERROR(ret);
1401 RETURN_FAILURE;
1402 }
1403 }
1404 ctx->pattern += ctx->pattern[0];
1405 break;
1406
1407 case SRE_OP_FAILURE:
1408 /* immediate failure */
1409 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1410 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001411
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001412 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001413 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1414 ctx->pattern[-1]));
1415 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001416 }
1417 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001418
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001419exit:
1420 ctx_pos = ctx->last_ctx_pos;
1421 jump = ctx->jump;
1422 DATA_POP_DISCARD(ctx);
1423 if (ctx_pos == -1)
1424 return ret;
1425 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1426
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001427 switch (jump) {
1428 case JUMP_MAX_UNTIL_2:
1429 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1430 goto jump_max_until_2;
1431 case JUMP_MAX_UNTIL_3:
1432 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1433 goto jump_max_until_3;
1434 case JUMP_MIN_UNTIL_2:
1435 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1436 goto jump_min_until_2;
1437 case JUMP_MIN_UNTIL_3:
1438 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1439 goto jump_min_until_3;
1440 case JUMP_BRANCH:
1441 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1442 goto jump_branch;
1443 case JUMP_MAX_UNTIL_1:
1444 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1445 goto jump_max_until_1;
1446 case JUMP_MIN_UNTIL_1:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1448 goto jump_min_until_1;
1449 case JUMP_REPEAT:
1450 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1451 goto jump_repeat;
1452 case JUMP_REPEAT_ONE_1:
1453 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_repeat_one_1;
1455 case JUMP_REPEAT_ONE_2:
1456 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1457 goto jump_repeat_one_2;
1458 case JUMP_MIN_REPEAT_ONE:
1459 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1460 goto jump_min_repeat_one;
1461 case JUMP_ASSERT:
1462 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1463 goto jump_assert;
1464 case JUMP_ASSERT_NOT:
1465 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1466 goto jump_assert_not;
1467 case JUMP_NONE:
1468 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1469 break;
1470 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001471
1472 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001473}
1474
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001475LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001476SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1477{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001478 SRE_CHAR* ptr = state->start;
1479 SRE_CHAR* end = state->end;
1480 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001481 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001482 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001483 SRE_CODE* prefix = NULL;
1484 SRE_CODE* charset = NULL;
1485 SRE_CODE* overlap = NULL;
1486 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001487
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001488 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001489 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001490 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001491
1492 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001493
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001494 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001495 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001496 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001497 end -= pattern[3]-1;
1498 if (end <= ptr)
1499 end = ptr+1;
1500 }
1501
Fredrik Lundh3562f112000-07-02 12:00:07 +00001502 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001503 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001504 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001505 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001506 prefix_skip = pattern[6];
1507 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001508 overlap = prefix + prefix_len - 1;
1509 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001510 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001511 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001512 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001513
1514 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001515 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001516
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001517 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1518 TRACE(("charset = %p\n", charset));
1519
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001520#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001521 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001522 /* pattern starts with a known prefix. use the overlap
1523 table to skip forward as fast as we possibly can */
1524 int i = 0;
1525 end = state->end;
1526 while (ptr < end) {
1527 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001528 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001529 if (!i)
1530 break;
1531 else
1532 i = overlap[i];
1533 } else {
1534 if (++i == prefix_len) {
1535 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001536 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1537 state->start = ptr + 1 - prefix_len;
1538 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001539 if (flags & SRE_INFO_LITERAL)
1540 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001541 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001542 if (status != 0)
1543 return status;
1544 /* close but no cigar -- try again */
1545 i = overlap[i];
1546 }
1547 break;
1548 }
1549
1550 }
1551 ptr++;
1552 }
1553 return 0;
1554 }
1555#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001556
Fredrik Lundh3562f112000-07-02 12:00:07 +00001557 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001558 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001559 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001560 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001561 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001562 for (;;) {
1563 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1564 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001565 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001566 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001567 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001568 state->start = ptr;
1569 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001570 if (flags & SRE_INFO_LITERAL)
1571 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001572 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001573 if (status != 0)
1574 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001575 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 } else if (charset) {
1577 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001578 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001579 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001580 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001581 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001582 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001583 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001584 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001585 state->start = ptr;
1586 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001587 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 if (status != 0)
1589 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001590 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 }
1592 } else
1593 /* general case */
1594 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001595 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001597 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001598 if (status != 0)
1599 break;
1600 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001601
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001602 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001603}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001604
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001605LOCAL(int)
1606SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1607{
1608 /* check if given string is a literal template (i.e. no escapes) */
1609 while (len-- > 0)
1610 if (*ptr++ == '\\')
1611 return 0;
1612 return 1;
1613}
Guido van Rossumb700df92000-03-31 14:59:30 +00001614
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001615#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001616
1617/* -------------------------------------------------------------------- */
1618/* factories and destructors */
1619
1620/* see sre.h for object declarations */
1621
Jeremy Hylton938ace62002-07-17 16:30:39 +00001622static PyTypeObject Pattern_Type;
1623static PyTypeObject Match_Type;
1624static PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001625
1626static PyObject *
1627_compile(PyObject* self_, PyObject* args)
1628{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001629 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001630
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001631 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001632 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001633
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001634 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001635 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001636 PyObject* code;
1637 int groups = 0;
1638 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001639 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001640 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1641 &PyList_Type, &code, &groups,
1642 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001643 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001644
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001645 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001646
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001647 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001648 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001649 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001650
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001651 self->codesize = n;
1652
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001653 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001654 PyObject *o = PyList_GET_ITEM(code, i);
Martin v. Löwis78e2f062003-04-19 12:56:08 +00001655 if (PyInt_Check(o))
1656 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1657 else
1658 self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001659 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001660
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001661 if (PyErr_Occurred()) {
1662 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001663 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001664 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001665
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001666 Py_INCREF(pattern);
1667 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001668
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001669 self->flags = flags;
1670
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001671 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001672
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001673 Py_XINCREF(groupindex);
1674 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001675
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 Py_XINCREF(indexgroup);
1677 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001678
Raymond Hettinger027bb632004-05-31 03:09:25 +00001679 self->weakreflist = NULL;
1680
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001681 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001682}
1683
1684static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001685sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001686{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001687 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001688}
1689
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001690static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001691sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001692{
1693 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001694 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001695 return NULL;
1696 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001697 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001698 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001699#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001700 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001701#else
1702 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001703#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001704 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001705}
1706
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001707LOCAL(void)
1708state_reset(SRE_STATE* state)
1709{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001710 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001711 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001712
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001713 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001714 state->lastindex = -1;
1715
1716 state->repeat = NULL;
1717
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001718 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001719}
1720
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001721static void*
1722getstring(PyObject* string, int* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001723{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001724 /* given a python object, return a data pointer, a length (in
1725 characters), and a character size. return NULL if the object
1726 is not a string (or not compatible) */
1727
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001728 PyBufferProcs *buffer;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001729 int size, bytes, charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001730 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001731
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001732#if defined(HAVE_UNICODE)
1733 if (PyUnicode_Check(string)) {
1734 /* unicode strings doesn't always support the buffer interface */
1735 ptr = (void*) PyUnicode_AS_DATA(string);
1736 bytes = PyUnicode_GET_DATA_SIZE(string);
1737 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001738 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001739
1740 } else {
1741#endif
1742
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001743 /* get pointer to string buffer */
1744 buffer = string->ob_type->tp_as_buffer;
1745 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1746 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001747 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001748 return NULL;
1749 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001750
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001751 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001752 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1753 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001754 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1755 return NULL;
1756 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001757
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001758 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001759#if PY_VERSION_HEX >= 0x01060000
1760 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001761#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001762 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001763#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001764
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001765 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001766 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001767#if defined(HAVE_UNICODE)
1768 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001769 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001770#endif
1771 else {
1772 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1773 return NULL;
1774 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001775
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001776#if defined(HAVE_UNICODE)
1777 }
1778#endif
1779
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001780 *p_length = size;
1781 *p_charsize = charsize;
1782
1783 return ptr;
1784}
1785
1786LOCAL(PyObject*)
1787state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1788 int start, int end)
1789{
1790 /* prepare state object */
1791
1792 int length;
1793 int charsize;
1794 void* ptr;
1795
1796 memset(state, 0, sizeof(SRE_STATE));
1797
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001798 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001799 state->lastindex = -1;
1800
1801 ptr = getstring(string, &length, &charsize);
1802 if (!ptr)
1803 return NULL;
1804
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001805 /* adjust boundaries */
1806 if (start < 0)
1807 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001808 else if (start > length)
1809 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001810
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001811 if (end < 0)
1812 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001813 else if (end > length)
1814 end = length;
1815
1816 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001817
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001818 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001819
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001820 state->start = (void*) ((char*) ptr + start * state->charsize);
1821 state->end = (void*) ((char*) ptr + end * state->charsize);
1822
1823 Py_INCREF(string);
1824 state->string = string;
1825 state->pos = start;
1826 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001827
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001828 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001829 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001830 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001831#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001832 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001833#else
1834 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001835#endif
1836 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001837 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001838
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001839 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001840}
1841
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001842LOCAL(void)
1843state_fini(SRE_STATE* state)
1844{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001845 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001846 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001847}
1848
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001849/* calculate offset from start of string */
1850#define STATE_OFFSET(state, member)\
1851 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1852
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001853LOCAL(PyObject*)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001854state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001855{
Fredrik Lundh58100642000-08-09 09:14:35 +00001856 int i, j;
1857
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001858 index = (index - 1) * 2;
1859
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001860 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001861 if (empty)
1862 /* want empty string */
1863 i = j = 0;
1864 else {
1865 Py_INCREF(Py_None);
1866 return Py_None;
1867 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001868 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001869 i = STATE_OFFSET(state, state->mark[index]);
1870 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001871 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001872
Fredrik Lundh58100642000-08-09 09:14:35 +00001873 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001874}
1875
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001876static void
1877pattern_error(int status)
1878{
1879 switch (status) {
1880 case SRE_ERROR_RECURSION_LIMIT:
1881 PyErr_SetString(
1882 PyExc_RuntimeError,
1883 "maximum recursion limit exceeded"
1884 );
1885 break;
1886 case SRE_ERROR_MEMORY:
1887 PyErr_NoMemory();
1888 break;
1889 default:
1890 /* other error codes indicate compiler/engine bugs */
1891 PyErr_SetString(
1892 PyExc_RuntimeError,
1893 "internal error in regular expression engine"
1894 );
1895 }
1896}
1897
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001898static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001899pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001900{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001901 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001902
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001903 MatchObject* match;
1904 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001905 char* base;
1906 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001907
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001908 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001909
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001910 /* create match object (with room for extra group marks) */
1911 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001912 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001913 if (!match)
1914 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001915
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 Py_INCREF(pattern);
1917 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001918
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001919 Py_INCREF(state->string);
1920 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001921
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001922 match->regs = NULL;
1923 match->groups = pattern->groups+1;
1924
1925 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001926
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001927 base = (char*) state->beginning;
1928 n = state->charsize;
1929
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001930 match->mark[0] = ((char*) state->start - base) / n;
1931 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001932
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001933 for (i = j = 0; i < pattern->groups; i++, j+=2)
1934 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1935 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1936 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1937 } else
1938 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1939
1940 match->pos = state->pos;
1941 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001942
Fredrik Lundh6f013982000-07-03 18:44:21 +00001943 match->lastindex = state->lastindex;
1944
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001945 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001946
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001947 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001948
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001949 /* no match */
1950 Py_INCREF(Py_None);
1951 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001952
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001953 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001954
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001955 /* internal error */
1956 pattern_error(status);
1957 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001958}
1959
1960static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001961pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001962{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001963 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001964
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001965 ScannerObject* self;
1966
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001967 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001968 int start = 0;
1969 int end = INT_MAX;
1970 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1971 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001972
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001973 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001974 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001975 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001976 return NULL;
1977
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001978 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001979 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001980 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001981 return NULL;
1982 }
1983
1984 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001985 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001986
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001987 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001988}
1989
Guido van Rossumb700df92000-03-31 14:59:30 +00001990static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001991pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001992{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001993 if (self->weakreflist != NULL)
1994 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001995 Py_XDECREF(self->pattern);
1996 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001997 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001998 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001999}
2000
2001static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002002pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002003{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002004 SRE_STATE state;
2005 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002006
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002007 PyObject* string;
2008 int start = 0;
2009 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002010 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2011 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
2012 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002013 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002014
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002015 string = state_init(&state, self, string, start, end);
2016 if (!string)
2017 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002018
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002019 state.ptr = state.start;
2020
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002021 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
2022
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002023 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002024 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002025 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002026#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002027 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002028#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002029 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002030
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002031 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2032
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002033 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002034
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002035 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002036}
2037
2038static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002039pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002040{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002041 SRE_STATE state;
2042 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002043
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002044 PyObject* string;
2045 int start = 0;
2046 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002047 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2048 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
2049 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002051
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002052 string = state_init(&state, self, string, start, end);
2053 if (!string)
2054 return NULL;
2055
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002056 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
2057
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002058 if (state.charsize == 1) {
2059 status = sre_search(&state, PatternObject_GetCode(self));
2060 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002061#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002062 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002063#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002064 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002065
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002066 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002069
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002070 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002071}
2072
2073static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002074call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002075{
2076 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002077 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002078 PyObject* func;
2079 PyObject* result;
2080
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002081 if (!args)
2082 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002083 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002084 if (!name)
2085 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002086 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002087 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002088 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002089 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002090 func = PyObject_GetAttrString(mod, function);
2091 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002092 if (!func)
2093 return NULL;
2094 result = PyObject_CallObject(func, args);
2095 Py_DECREF(func);
2096 Py_DECREF(args);
2097 return result;
2098}
2099
#ifdef USE_BUILTIN_COPY
/* Replace *object in place with copy.deepcopy(*object, memo).
 *
 * Returns 1 on success and 0 on failure (in which case *object is left
 * untouched).  A NULL *object is treated as "nothing to copy" and
 * reported as success: the pattern fields this helper is applied to are
 * handled with Py_XINCREF elsewhere in this file, so they may be unset,
 * and building the argument tuple from a NULL item would crash.
 */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* copy;

    if (!*object)
        return 1; /* unset field: nothing to copy */

    copy = call(
        "copy", "deepcopy",
        PyTuple_Pack(2, *object, memo)
        );
    if (!copy)
        return 0;

    /* drop the shared original and keep the private copy */
    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
2119
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002120static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002121join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002122{
2123 /* join list elements */
2124
2125 PyObject* joiner;
2126#if PY_VERSION_HEX >= 0x01060000
2127 PyObject* function;
2128 PyObject* args;
2129#endif
2130 PyObject* result;
2131
2132 switch (PyList_GET_SIZE(list)) {
2133 case 0:
2134 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00002135 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002136 case 1:
2137 result = PyList_GET_ITEM(list, 0);
2138 Py_INCREF(result);
2139 Py_DECREF(list);
2140 return result;
2141 }
2142
2143 /* two or more elements: slice out a suitable separator from the
2144 first member, and use that to join the entire list */
2145
2146 joiner = PySequence_GetSlice(pattern, 0, 0);
2147 if (!joiner)
2148 return NULL;
2149
2150#if PY_VERSION_HEX >= 0x01060000
2151 function = PyObject_GetAttrString(joiner, "join");
2152 if (!function) {
2153 Py_DECREF(joiner);
2154 return NULL;
2155 }
2156 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002157 if (!args) {
2158 Py_DECREF(function);
2159 Py_DECREF(joiner);
2160 return NULL;
2161 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002162 PyTuple_SET_ITEM(args, 0, list);
2163 result = PyObject_CallObject(function, args);
2164 Py_DECREF(args); /* also removes list */
2165 Py_DECREF(function);
2166#else
2167 result = call(
2168 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002169 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002170 );
2171#endif
2172 Py_DECREF(joiner);
2173
2174 return result;
2175}
2176
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002177static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002178pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002179{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002180 SRE_STATE state;
2181 PyObject* list;
2182 int status;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002183 int i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002184
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002185 PyObject* string;
2186 int start = 0;
2187 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002188 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2189 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2190 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002191 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002192
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002193 string = state_init(&state, self, string, start, end);
2194 if (!string)
2195 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002196
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002197 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002198 if (!list) {
2199 state_fini(&state);
2200 return NULL;
2201 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002202
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002203 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002204
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002205 PyObject* item;
2206
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002207 state_reset(&state);
2208
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002209 state.ptr = state.start;
2210
2211 if (state.charsize == 1) {
2212 status = sre_search(&state, PatternObject_GetCode(self));
2213 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002214#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002215 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002216#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002217 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002218
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002219 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002220 if (status == 0)
2221 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002222 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002223 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002224 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002225
2226 /* don't bother to build a match object */
2227 switch (self->groups) {
2228 case 0:
2229 b = STATE_OFFSET(&state, state.start);
2230 e = STATE_OFFSET(&state, state.ptr);
2231 item = PySequence_GetSlice(string, b, e);
2232 if (!item)
2233 goto error;
2234 break;
2235 case 1:
2236 item = state_getslice(&state, 1, string, 1);
2237 if (!item)
2238 goto error;
2239 break;
2240 default:
2241 item = PyTuple_New(self->groups);
2242 if (!item)
2243 goto error;
2244 for (i = 0; i < self->groups; i++) {
2245 PyObject* o = state_getslice(&state, i+1, string, 1);
2246 if (!o) {
2247 Py_DECREF(item);
2248 goto error;
2249 }
2250 PyTuple_SET_ITEM(item, i, o);
2251 }
2252 break;
2253 }
2254
2255 status = PyList_Append(list, item);
2256 Py_DECREF(item);
2257 if (status < 0)
2258 goto error;
2259
2260 if (state.ptr == state.start)
2261 state.start = (void*) ((char*) state.ptr + state.charsize);
2262 else
2263 state.start = state.ptr;
2264
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002265 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002266
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002267 state_fini(&state);
2268 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002269
2270error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002271 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002272 state_fini(&state);
2273 return NULL;
2274
Guido van Rossumb700df92000-03-31 14:59:30 +00002275}
2276
#if PY_VERSION_HEX >= 0x02020000
/* finditer(...): create a scanner for the arguments and wrap its bound
   "search" method in a callable-iterator that stops when the method
   returns None. */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    PyObject* scan;
    PyObject* method;
    PyObject* result = NULL;

    scan = pattern_scanner(pattern, args);
    if (scan) {
        method = PyObject_GetAttrString(scan, "search");
        Py_DECREF(scan);
        if (method) {
            result = PyCallIter_New(method, Py_None);
            Py_DECREF(method);
        }
    }

    return result;
}
#endif
2300
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002301static PyObject*
2302pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2303{
2304 SRE_STATE state;
2305 PyObject* list;
2306 PyObject* item;
2307 int status;
2308 int n;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002309 int i;
2310 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002311
2312 PyObject* string;
2313 int maxsplit = 0;
2314 static char* kwlist[] = { "source", "maxsplit", NULL };
2315 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2316 &string, &maxsplit))
2317 return NULL;
2318
2319 string = state_init(&state, self, string, 0, INT_MAX);
2320 if (!string)
2321 return NULL;
2322
2323 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002324 if (!list) {
2325 state_fini(&state);
2326 return NULL;
2327 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002328
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002329 n = 0;
2330 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002331
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002332 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002333
2334 state_reset(&state);
2335
2336 state.ptr = state.start;
2337
2338 if (state.charsize == 1) {
2339 status = sre_search(&state, PatternObject_GetCode(self));
2340 } else {
2341#if defined(HAVE_UNICODE)
2342 status = sre_usearch(&state, PatternObject_GetCode(self));
2343#endif
2344 }
2345
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002346 if (status <= 0) {
2347 if (status == 0)
2348 break;
2349 pattern_error(status);
2350 goto error;
2351 }
2352
2353 if (state.start == state.ptr) {
2354 if (last == state.end)
2355 break;
2356 /* skip one character */
2357 state.start = (void*) ((char*) state.ptr + state.charsize);
2358 continue;
2359 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002360
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002361 /* get segment before this match */
2362 item = PySequence_GetSlice(
2363 string, STATE_OFFSET(&state, last),
2364 STATE_OFFSET(&state, state.start)
2365 );
2366 if (!item)
2367 goto error;
2368 status = PyList_Append(list, item);
2369 Py_DECREF(item);
2370 if (status < 0)
2371 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002372
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002373 /* add groups (if any) */
2374 for (i = 0; i < self->groups; i++) {
2375 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002376 if (!item)
2377 goto error;
2378 status = PyList_Append(list, item);
2379 Py_DECREF(item);
2380 if (status < 0)
2381 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002382 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002383
2384 n = n + 1;
2385
2386 last = state.start = state.ptr;
2387
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002388 }
2389
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002390 /* get segment following last match (even if empty) */
2391 item = PySequence_GetSlice(
2392 string, STATE_OFFSET(&state, last), state.endpos
2393 );
2394 if (!item)
2395 goto error;
2396 status = PyList_Append(list, item);
2397 Py_DECREF(item);
2398 if (status < 0)
2399 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002400
2401 state_fini(&state);
2402 return list;
2403
2404error:
2405 Py_DECREF(list);
2406 state_fini(&state);
2407 return NULL;
2408
2409}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002410
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002411static PyObject*
2412pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2413 int count, int subn)
2414{
2415 SRE_STATE state;
2416 PyObject* list;
2417 PyObject* item;
2418 PyObject* filter;
2419 PyObject* args;
2420 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002421 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002422 int status;
2423 int n;
2424 int i, b, e;
2425 int filter_is_callable;
2426
Fredrik Lundhdac58492001-10-21 21:48:30 +00002427 if (PyCallable_Check(template)) {
2428 /* sub/subn takes either a function or a template */
2429 filter = template;
2430 Py_INCREF(filter);
2431 filter_is_callable = 1;
2432 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002433 /* if not callable, check if it's a literal string */
2434 int literal;
2435 ptr = getstring(template, &n, &b);
2436 if (ptr) {
2437 if (b == 1) {
2438 literal = sre_literal_template(ptr, n);
2439 } else {
2440#if defined(HAVE_UNICODE)
2441 literal = sre_uliteral_template(ptr, n);
2442#endif
2443 }
2444 } else {
2445 PyErr_Clear();
2446 literal = 0;
2447 }
2448 if (literal) {
2449 filter = template;
2450 Py_INCREF(filter);
2451 filter_is_callable = 0;
2452 } else {
2453 /* not a literal; hand it over to the template compiler */
2454 filter = call(
2455 SRE_MODULE, "_subx",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002456 PyTuple_Pack(2, self, template)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002457 );
2458 if (!filter)
2459 return NULL;
2460 filter_is_callable = PyCallable_Check(filter);
2461 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002462 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002463
2464 string = state_init(&state, self, string, 0, INT_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002465 if (!string) {
2466 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002467 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002468 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002469
2470 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002471 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002472 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002473 state_fini(&state);
2474 return NULL;
2475 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002476
2477 n = i = 0;
2478
2479 while (!count || n < count) {
2480
2481 state_reset(&state);
2482
2483 state.ptr = state.start;
2484
2485 if (state.charsize == 1) {
2486 status = sre_search(&state, PatternObject_GetCode(self));
2487 } else {
2488#if defined(HAVE_UNICODE)
2489 status = sre_usearch(&state, PatternObject_GetCode(self));
2490#endif
2491 }
2492
2493 if (status <= 0) {
2494 if (status == 0)
2495 break;
2496 pattern_error(status);
2497 goto error;
2498 }
2499
2500 b = STATE_OFFSET(&state, state.start);
2501 e = STATE_OFFSET(&state, state.ptr);
2502
2503 if (i < b) {
2504 /* get segment before this match */
2505 item = PySequence_GetSlice(string, i, b);
2506 if (!item)
2507 goto error;
2508 status = PyList_Append(list, item);
2509 Py_DECREF(item);
2510 if (status < 0)
2511 goto error;
2512
2513 } else if (i == b && i == e && n > 0)
2514 /* ignore empty match on latest position */
2515 goto next;
2516
2517 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002518 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002519 match = pattern_new_match(self, &state, 1);
2520 if (!match)
2521 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002522 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002523 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002524 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002525 goto error;
2526 }
2527 item = PyObject_CallObject(filter, args);
2528 Py_DECREF(args);
2529 Py_DECREF(match);
2530 if (!item)
2531 goto error;
2532 } else {
2533 /* filter is literal string */
2534 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002535 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002536 }
2537
2538 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002539 if (item != Py_None) {
2540 status = PyList_Append(list, item);
2541 Py_DECREF(item);
2542 if (status < 0)
2543 goto error;
2544 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002545
2546 i = e;
2547 n = n + 1;
2548
2549next:
2550 /* move on */
2551 if (state.ptr == state.start)
2552 state.start = (void*) ((char*) state.ptr + state.charsize);
2553 else
2554 state.start = state.ptr;
2555
2556 }
2557
2558 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002559 if (i < state.endpos) {
2560 item = PySequence_GetSlice(string, i, state.endpos);
2561 if (!item)
2562 goto error;
2563 status = PyList_Append(list, item);
2564 Py_DECREF(item);
2565 if (status < 0)
2566 goto error;
2567 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002568
2569 state_fini(&state);
2570
Guido van Rossum4e173842001-12-07 04:25:10 +00002571 Py_DECREF(filter);
2572
Fredrik Lundhdac58492001-10-21 21:48:30 +00002573 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002574 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002575
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002576 if (!item)
2577 return NULL;
2578
2579 if (subn)
2580 return Py_BuildValue("Ni", item, n);
2581
2582 return item;
2583
2584error:
2585 Py_DECREF(list);
2586 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002587 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002588 return NULL;
2589
2590}
2591
2592static PyObject*
2593pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2594{
2595 PyObject* template;
2596 PyObject* string;
2597 int count = 0;
2598 static char* kwlist[] = { "repl", "string", "count", NULL };
2599 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2600 &template, &string, &count))
2601 return NULL;
2602
2603 return pattern_subx(self, template, string, count, 0);
2604}
2605
2606static PyObject*
2607pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2608{
2609 PyObject* template;
2610 PyObject* string;
2611 int count = 0;
2612 static char* kwlist[] = { "repl", "string", "count", NULL };
2613 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2614 &template, &string, &count))
2615 return NULL;
2616
2617 return pattern_subx(self, template, string, count, 1);
2618}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002619
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002620static PyObject*
2621pattern_copy(PatternObject* self, PyObject* args)
2622{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002623#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002624 PatternObject* copy;
2625 int offset;
2626
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002627 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2628 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002629
2630 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2631 if (!copy)
2632 return NULL;
2633
2634 offset = offsetof(PatternObject, groups);
2635
2636 Py_XINCREF(self->groupindex);
2637 Py_XINCREF(self->indexgroup);
2638 Py_XINCREF(self->pattern);
2639
2640 memcpy((char*) copy + offset, (char*) self + offset,
2641 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002642 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002643
2644 return (PyObject*) copy;
2645#else
2646 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2647 return NULL;
2648#endif
2649}
2650
2651static PyObject*
2652pattern_deepcopy(PatternObject* self, PyObject* args)
2653{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002654#ifdef USE_BUILTIN_COPY
2655 PatternObject* copy;
2656
2657 PyObject* memo;
2658 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2659 return NULL;
2660
2661 copy = (PatternObject*) pattern_copy(self, Py_None);
2662 if (!copy)
2663 return NULL;
2664
2665 if (!deepcopy(&copy->groupindex, memo) ||
2666 !deepcopy(&copy->indexgroup, memo) ||
2667 !deepcopy(&copy->pattern, memo)) {
2668 Py_DECREF(copy);
2669 return NULL;
2670 }
2671
2672#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002673 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2674 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002675#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002676}
2677
Raymond Hettinger94478742004-09-24 04:31:19 +00002678PyDoc_STRVAR(pattern_match_doc,
2679"match(string[, pos[, endpos]]) --> match object or None.\n\
2680 Matches zero or more characters at the beginning of the string");
2681
2682PyDoc_STRVAR(pattern_search_doc,
2683"search(string[, pos[, endpos]]) --> match object or None.\n\
2684 Scan through string looking for a match, and return a corresponding\n\
2685 MatchObject instance. Return None if no position in the string matches.");
2686
2687PyDoc_STRVAR(pattern_split_doc,
2688"split(string[, maxsplit = 0]) --> list.\n\
2689 Split string by the occurrences of pattern.");
2690
2691PyDoc_STRVAR(pattern_findall_doc,
2692"findall(string[, pos[, endpos]]) --> list.\n\
2693 Return a list of all non-overlapping matches of pattern in string.");
2694
2695PyDoc_STRVAR(pattern_finditer_doc,
2696"finditer(string[, pos[, endpos]]) --> iterator.\n\
2697 Return an iterator over all non-overlapping matches for the \n\
2698 RE pattern in string. For each match, the iterator returns a\n\
2699 match object.");
2700
2701PyDoc_STRVAR(pattern_sub_doc,
2702"sub(repl, string[, count = 0]) --> newstring\n\
2703 Return the string obtained by replacing the leftmost non-overlapping\n\
2704 occurrences of pattern in string by the replacement repl.");
2705
2706PyDoc_STRVAR(pattern_subn_doc,
2707"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2708 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2709 the leftmost non-overlapping occurrences of pattern with the\n\
2710 replacement repl.");
2711
2712PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2713
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002714static PyMethodDef pattern_methods[] = {
Raymond Hettinger94478742004-09-24 04:31:19 +00002715 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2716 pattern_match_doc},
2717 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2718 pattern_search_doc},
2719 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2720 pattern_sub_doc},
2721 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2722 pattern_subn_doc},
2723 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2724 pattern_split_doc},
2725 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2726 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002727#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002728 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2729 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002730#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002731 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002732 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2733 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002734 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002735};
2736
2737static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002738pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002739{
2740 PyObject* res;
2741
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002742 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002743
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002744 if (res)
2745 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002746
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002747 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002748
2749 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002750 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002751 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002752 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002753 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002754
2755 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002756 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002757
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002758 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002759 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002760
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002761 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002762 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002763 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002764 }
2765
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002766 PyErr_SetString(PyExc_AttributeError, name);
2767 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002768}
2769
/* Type object for compiled patterns.  It is a variable-size type: each
   instance is sizeof(PatternObject) plus a number of SRE_CODE items.
   Attribute access goes through the old-style tp_getattr slot
   (pattern_getattr), and weak references are enabled through the
   weakreflist member. */
statichere PyTypeObject Pattern_Type = {
    PyObject_HEAD_INIT(NULL)
    0, "_" SRE_MODULE ".SRE_Pattern", /* ob_size, tp_name */
    sizeof(PatternObject), sizeof(SRE_CODE), /* tp_basicsize, tp_itemsize */
    (destructor)pattern_dealloc, /*tp_dealloc*/
    0, /*tp_print*/
    (getattrfunc)pattern_getattr, /*tp_getattr*/
    0, /* tp_setattr */
    0, /* tp_compare */
    0, /* tp_repr */
    0, /* tp_as_number */
    0, /* tp_as_sequence */
    0, /* tp_as_mapping */
    0, /* tp_hash */
    0, /* tp_call */
    0, /* tp_str */
    0, /* tp_getattro */
    0, /* tp_setattro */
    0, /* tp_as_buffer */
    Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
    pattern_doc, /* tp_doc */
    0, /* tp_traverse */
    0, /* tp_clear */
    0, /* tp_richcompare */
    offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
};
2796
2797/* -------------------------------------------------------------------- */
2798/* match methods */
2799
/* Destructor for match objects: release the references held in the
   regs, string and pattern members, then free the object itself. */
static void
match_dealloc(MatchObject* self)
{
    /* regs and string are released with the NULL-tolerant XDECREF */
    Py_XDECREF(self->regs);
    Py_XDECREF(self->string);
    Py_DECREF(self->pattern);
    PyObject_DEL(self);
}
2808
2809static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002810match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002811{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002812 if (index < 0 || index >= self->groups) {
2813 /* raise IndexError if we were given a bad group number */
2814 PyErr_SetString(
2815 PyExc_IndexError,
2816 "no such group"
2817 );
2818 return NULL;
2819 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002820
Fredrik Lundh6f013982000-07-03 18:44:21 +00002821 index *= 2;
2822
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002823 if (self->string == Py_None || self->mark[index] < 0) {
2824 /* return default value if the string or group is undefined */
2825 Py_INCREF(def);
2826 return def;
2827 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002828
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002829 return PySequence_GetSlice(
2830 self->string, self->mark[index], self->mark[index+1]
2831 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002832}
2833
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002834static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002835match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002836{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002837 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002838
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002839 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002840 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002841
Fredrik Lundh6f013982000-07-03 18:44:21 +00002842 i = -1;
2843
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002844 if (self->pattern->groupindex) {
2845 index = PyObject_GetItem(self->pattern->groupindex, index);
2846 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002847 if (PyInt_Check(index))
2848 i = (int) PyInt_AS_LONG(index);
2849 Py_DECREF(index);
2850 } else
2851 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002852 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002853
2854 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002855}
2856
2857static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002858match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002859{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002860 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002861}
2862
2863static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002864match_expand(MatchObject* self, PyObject* args)
2865{
2866 PyObject* template;
2867 if (!PyArg_ParseTuple(args, "O:expand", &template))
2868 return NULL;
2869
2870 /* delegate to Python code */
2871 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002872 SRE_MODULE, "_expand",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002873 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002874 );
2875}
2876
2877static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002878match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002879{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002880 PyObject* result;
2881 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002882
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002883 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002884
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002885 switch (size) {
2886 case 0:
2887 result = match_getslice(self, Py_False, Py_None);
2888 break;
2889 case 1:
2890 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2891 break;
2892 default:
2893 /* fetch multiple items */
2894 result = PyTuple_New(size);
2895 if (!result)
2896 return NULL;
2897 for (i = 0; i < size; i++) {
2898 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002899 self, PyTuple_GET_ITEM(args, i), Py_None
2900 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002901 if (!item) {
2902 Py_DECREF(result);
2903 return NULL;
2904 }
2905 PyTuple_SET_ITEM(result, i, item);
2906 }
2907 break;
2908 }
2909 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002910}
2911
2912static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002913match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002914{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002915 PyObject* result;
2916 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002918 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002919 static char* kwlist[] = { "default", NULL };
2920 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002921 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002922
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002923 result = PyTuple_New(self->groups-1);
2924 if (!result)
2925 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002927 for (index = 1; index < self->groups; index++) {
2928 PyObject* item;
2929 item = match_getslice_by_index(self, index, def);
2930 if (!item) {
2931 Py_DECREF(result);
2932 return NULL;
2933 }
2934 PyTuple_SET_ITEM(result, index-1, item);
2935 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002936
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002937 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002938}
2939
2940static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002941match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002942{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002943 PyObject* result;
2944 PyObject* keys;
2945 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002946
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002947 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002948 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002949 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002950 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002951
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002952 result = PyDict_New();
2953 if (!result || !self->pattern->groupindex)
2954 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002955
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002956 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002957 if (!keys)
2958 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002959
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002960 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002961 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002962 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002963 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002964 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002965 if (!key)
2966 goto failed;
2967 value = match_getslice(self, key, def);
2968 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002969 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002970 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002971 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002972 status = PyDict_SetItem(result, key, value);
2973 Py_DECREF(value);
2974 if (status < 0)
2975 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002976 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002977
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002978 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002979
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002980 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002981
2982failed:
2983 Py_DECREF(keys);
2984 Py_DECREF(result);
2985 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002986}
2987
2988static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002989match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002990{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002991 int index;
2992
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002993 PyObject* index_ = Py_False; /* zero */
2994 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2995 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002996
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002997 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002998
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002999 if (index < 0 || index >= self->groups) {
3000 PyErr_SetString(
3001 PyExc_IndexError,
3002 "no such group"
3003 );
3004 return NULL;
3005 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003006
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003007 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003008 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003009}
3010
3011static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003012match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003013{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003014 int index;
3015
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003016 PyObject* index_ = Py_False; /* zero */
3017 if (!PyArg_ParseTuple(args, "|O:end", &index_))
3018 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003019
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003020 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003021
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003022 if (index < 0 || index >= self->groups) {
3023 PyErr_SetString(
3024 PyExc_IndexError,
3025 "no such group"
3026 );
3027 return NULL;
3028 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003029
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003030 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003031 return Py_BuildValue("i", self->mark[index*2+1]);
3032}
3033
3034LOCAL(PyObject*)
3035_pair(int i1, int i2)
3036{
3037 PyObject* pair;
3038 PyObject* item;
3039
3040 pair = PyTuple_New(2);
3041 if (!pair)
3042 return NULL;
3043
3044 item = PyInt_FromLong(i1);
3045 if (!item)
3046 goto error;
3047 PyTuple_SET_ITEM(pair, 0, item);
3048
3049 item = PyInt_FromLong(i2);
3050 if (!item)
3051 goto error;
3052 PyTuple_SET_ITEM(pair, 1, item);
3053
3054 return pair;
3055
3056 error:
3057 Py_DECREF(pair);
3058 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003059}
3060
3061static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003062match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003063{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003064 int index;
3065
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003066 PyObject* index_ = Py_False; /* zero */
3067 if (!PyArg_ParseTuple(args, "|O:span", &index_))
3068 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003069
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003070 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003071
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003072 if (index < 0 || index >= self->groups) {
3073 PyErr_SetString(
3074 PyExc_IndexError,
3075 "no such group"
3076 );
3077 return NULL;
3078 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003079
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003080 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003081 return _pair(self->mark[index*2], self->mark[index*2+1]);
3082}
3083
3084static PyObject*
3085match_regs(MatchObject* self)
3086{
3087 PyObject* regs;
3088 PyObject* item;
3089 int index;
3090
3091 regs = PyTuple_New(self->groups);
3092 if (!regs)
3093 return NULL;
3094
3095 for (index = 0; index < self->groups; index++) {
3096 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3097 if (!item) {
3098 Py_DECREF(regs);
3099 return NULL;
3100 }
3101 PyTuple_SET_ITEM(regs, index, item);
3102 }
3103
3104 Py_INCREF(regs);
3105 self->regs = regs;
3106
3107 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003108}
3109
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003110static PyObject*
3111match_copy(MatchObject* self, PyObject* args)
3112{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003113#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003114 MatchObject* copy;
3115 int slots, offset;
3116
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003117 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
3118 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003119
3120 slots = 2 * (self->pattern->groups+1);
3121
3122 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3123 if (!copy)
3124 return NULL;
3125
3126 /* this value a constant, but any compiler should be able to
3127 figure that out all by itself */
3128 offset = offsetof(MatchObject, string);
3129
3130 Py_XINCREF(self->pattern);
3131 Py_XINCREF(self->string);
3132 Py_XINCREF(self->regs);
3133
3134 memcpy((char*) copy + offset, (char*) self + offset,
3135 sizeof(MatchObject) + slots * sizeof(int) - offset);
3136
3137 return (PyObject*) copy;
3138#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003139 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003140 return NULL;
3141#endif
3142}
3143
3144static PyObject*
3145match_deepcopy(MatchObject* self, PyObject* args)
3146{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003147#ifdef USE_BUILTIN_COPY
3148 MatchObject* copy;
3149
3150 PyObject* memo;
3151 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
3152 return NULL;
3153
3154 copy = (MatchObject*) match_copy(self, Py_None);
3155 if (!copy)
3156 return NULL;
3157
3158 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3159 !deepcopy(&copy->string, memo) ||
3160 !deepcopy(&copy->regs, memo)) {
3161 Py_DECREF(copy);
3162 return NULL;
3163 }
3164
3165#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003166 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3167 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003168#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003169}
3170
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003171static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003172 {"group", (PyCFunction) match_group, METH_VARARGS},
3173 {"start", (PyCFunction) match_start, METH_VARARGS},
3174 {"end", (PyCFunction) match_end, METH_VARARGS},
3175 {"span", (PyCFunction) match_span, METH_VARARGS},
3176 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3177 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3178 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003179 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
3180 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003181 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003182};
3183
3184static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003185match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003186{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003187 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003188
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003189 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3190 if (res)
3191 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003192
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003193 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003195 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003196 if (self->lastindex >= 0)
3197 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003198 Py_INCREF(Py_None);
3199 return Py_None;
3200 }
3201
3202 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003203 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003204 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003205 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003206 );
3207 if (result)
3208 return result;
3209 PyErr_Clear();
3210 }
3211 Py_INCREF(Py_None);
3212 return Py_None;
3213 }
3214
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003215 if (!strcmp(name, "string")) {
3216 if (self->string) {
3217 Py_INCREF(self->string);
3218 return self->string;
3219 } else {
3220 Py_INCREF(Py_None);
3221 return Py_None;
3222 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003223 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003224
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003225 if (!strcmp(name, "regs")) {
3226 if (self->regs) {
3227 Py_INCREF(self->regs);
3228 return self->regs;
3229 } else
3230 return match_regs(self);
3231 }
3232
3233 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003234 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003235 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003236 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003237
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003238 if (!strcmp(name, "pos"))
3239 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003240
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003241 if (!strcmp(name, "endpos"))
3242 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003243
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003244 PyErr_SetString(PyExc_AttributeError, name);
3245 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003246}
3247
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3250
3251statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003252 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003253 0, "_" SRE_MODULE ".SRE_Match",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003254 sizeof(MatchObject), sizeof(int),
3255 (destructor)match_dealloc, /*tp_dealloc*/
3256 0, /*tp_print*/
3257 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003258};
3259
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003260/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003261/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003262
3263static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003264scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003265{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003266 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003267 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003268 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003269}
3270
3271static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003272scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003273{
3274 SRE_STATE* state = &self->state;
3275 PyObject* match;
3276 int status;
3277
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003278 state_reset(state);
3279
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003280 state->ptr = state->start;
3281
3282 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003283 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003284 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003285#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003286 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003287#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003288 }
3289
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003290 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003291 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003292
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003293 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003294 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003295 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003296 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003297
3298 return match;
3299}
3300
3301
3302static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003303scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003304{
3305 SRE_STATE* state = &self->state;
3306 PyObject* match;
3307 int status;
3308
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003309 state_reset(state);
3310
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003311 state->ptr = state->start;
3312
3313 if (state->charsize == 1) {
3314 status = sre_search(state, PatternObject_GetCode(self->pattern));
3315 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003316#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003317 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003318#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003319 }
3320
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003321 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003322 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003323
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003324 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003325 state->start = (void*) ((char*) state->ptr + state->charsize);
3326 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003327 state->start = state->ptr;
3328
3329 return match;
3330}
3331
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003332static PyMethodDef scanner_methods[] = {
Neal Norwitzbb2769f2002-03-31 15:46:00 +00003333 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3334 /* METH_OLDARGS is not in Python 1.5.2 */
3335 {"match", (PyCFunction) scanner_match, 0},
3336 {"search", (PyCFunction) scanner_search, 0},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003337 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003338};
3339
3340static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003341scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003342{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003343 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003345 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3346 if (res)
3347 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003349 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003350
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003351 /* attributes */
3352 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003353 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003354 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003355 }
3356
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003357 PyErr_SetString(PyExc_AttributeError, name);
3358 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003359}
3360
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003361statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003362 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003363 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003364 sizeof(ScannerObject), 0,
3365 (destructor)scanner_dealloc, /*tp_dealloc*/
3366 0, /*tp_print*/
3367 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003368};
3369
Guido van Rossumb700df92000-03-31 14:59:30 +00003370static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003371 {"compile", _compile, METH_VARARGS},
3372 {"getcodesize", sre_codesize, METH_VARARGS},
3373 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003374 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003375};
3376
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003377#if PY_VERSION_HEX < 0x02030000
3378DL_EXPORT(void) init_sre(void)
3379#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003380PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003381#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003382{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003383 PyObject* m;
3384 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003385 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003386
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003387 /* Patch object types */
3388 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003389 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00003390
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003391 m = Py_InitModule("_" SRE_MODULE, _functions);
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003392 d = PyModule_GetDict(m);
3393
Fredrik Lundh21009b92001-09-18 18:47:09 +00003394 x = PyInt_FromLong(SRE_MAGIC);
3395 if (x) {
3396 PyDict_SetItemString(d, "MAGIC", x);
3397 Py_DECREF(x);
3398 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003399
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003400 x = PyInt_FromLong(sizeof(SRE_CODE));
3401 if (x) {
3402 PyDict_SetItemString(d, "CODESIZE", x);
3403 Py_DECREF(x);
3404 }
3405
Fredrik Lundh21009b92001-09-18 18:47:09 +00003406 x = PyString_FromString(copyright);
3407 if (x) {
3408 PyDict_SetItemString(d, "copyright", x);
3409 Py_DECREF(x);
3410 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003411}
3412
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003413#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003414
3415/* vim:ts=4:sw=4:et
3416*/