blob: 3cc90d4860fb7ae6f7361d56e50f9bf5200a8296 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
#include "Python.h"
#include "structmember.h" /* offsetof */

#include "sre.h"

#include <ctype.h>
#include <limits.h> /* INT_MAX */
Guido van Rossumb700df92000-03-31 14:59:30 +000048
Fredrik Lundh436c3d582000-06-29 08:58:44 +000049/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000050#if !defined(SRE_MODULE)
51#define SRE_MODULE "sre"
52#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000053
Guido van Rossumb700df92000-03-31 14:59:30 +000054/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000055#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000056
Fredrik Lundh971e78b2001-10-20 17:48:46 +000057#if PY_VERSION_HEX >= 0x01060000
58#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000059/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000060#define HAVE_UNICODE
61#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000062#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000071#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000072
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000073/* enables copy/deepcopy handling (work in progress) */
74#undef USE_BUILTIN_COPY
75
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000076#if PY_VERSION_HEX < 0x01060000
77#define PyObject_DEL(op) PyMem_DEL((op))
78#endif
79
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000080/* -------------------------------------------------------------------- */
81
Fredrik Lundh80946112000-06-29 18:03:25 +000082#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000083#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000084#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000085/* fastest possible local call under MSVC */
86#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000087#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000088#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089#else
90#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000091#endif
92
93/* error codes */
94#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000095#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000096#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000097#define SRE_ERROR_MEMORY -9 /* out of memory */
98
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000099#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000100#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#else
102#define TRACE(v)
103#endif
104
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000105/* -------------------------------------------------------------------- */
106/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000107
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000108/* default character predicates (run sre_chars.py to regenerate tables) */
109
/* Bit flags stored per character in sre_char_info[]. */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* Category flags for code points 0..127 (an OR of the SRE_*_MASK bits).
   The table is only ever read, hence const. */
static const char sre_char_info[128] = {
    /*   0- 15 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /*  16- 31 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  32- 47 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  48- 63 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /*  64- 79 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80- 95 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /*  96-111 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112-127 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};

/* Lower-case mapping for code points 0..127: identity everywhere except
   65..90 ('A'..'Z'), which map to 97..122 ('a'..'z').  Read-only. */
static const char sre_char_lower[128] = {
    /*   0- 15 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /*  16- 31 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /*  32- 47 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /*  48- 63 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /*  64- 79 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
                  109, 110, 111,
    /*  80- 95 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
                  93, 94, 95,
    /*  96-111 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
                  109, 110, 111,
    /* 112-127 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
                  124, 125, 126, 127
};

/* Table-driven predicates.  Each yields the matching mask bit (non-zero)
   when ch is below 128 and carries the flag, and 0 otherwise.  The
   argument is evaluated more than once and is used as an array index, so
   it must be side-effect free and non-negative. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

/* Lower-case ch through the table above; values of 128 and up are
   returned unchanged. */
static unsigned int sre_lower(unsigned int ch)
{
    return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
}
151
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000152/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000153/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
154 * warnings when c's type supports only numbers < N+1 */
/* Locale-aware predicates built on <ctype.h>.  Any value with a bit set
   above the low eight is rejected (yields 0) before the ctype function
   is consulted. */
#define SRE_LOC_IS_DIGIT(ch) (((ch) & ~255) ? 0 : isdigit((ch)))
#define SRE_LOC_IS_SPACE(ch) (((ch) & ~255) ? 0 : isspace((ch)))
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (((ch) & ~255) ? 0 : isalnum((ch)))
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* Lower-case ch with the C library's tolower(); values of 256 and up
   are handed back untouched. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int)tolower((ch));
}
165
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000166/* unicode-specific character predicates */
167
#if defined(HAVE_UNICODE)

/* Unicode predicates: each one casts its argument to Py_UNICODE and
   forwards to the corresponding Py_UNICODE_IS* macro.  SRE_UNI_IS_WORD
   additionally accepts the underscore.  Only compiled when HAVE_UNICODE
   is defined. */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

/* Lower-case ch via Py_UNICODE_TOLOWER.  The argument is narrowed to
   Py_UNICODE first and the result widened back to unsigned int. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}

#endif
182
Guido van Rossumb700df92000-03-31 14:59:30 +0000183LOCAL(int)
184sre_category(SRE_CODE category, unsigned int ch)
185{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000186 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000187
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000188 case SRE_CATEGORY_DIGIT:
189 return SRE_IS_DIGIT(ch);
190 case SRE_CATEGORY_NOT_DIGIT:
191 return !SRE_IS_DIGIT(ch);
192 case SRE_CATEGORY_SPACE:
193 return SRE_IS_SPACE(ch);
194 case SRE_CATEGORY_NOT_SPACE:
195 return !SRE_IS_SPACE(ch);
196 case SRE_CATEGORY_WORD:
197 return SRE_IS_WORD(ch);
198 case SRE_CATEGORY_NOT_WORD:
199 return !SRE_IS_WORD(ch);
200 case SRE_CATEGORY_LINEBREAK:
201 return SRE_IS_LINEBREAK(ch);
202 case SRE_CATEGORY_NOT_LINEBREAK:
203 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000204
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000205 case SRE_CATEGORY_LOC_WORD:
206 return SRE_LOC_IS_WORD(ch);
207 case SRE_CATEGORY_LOC_NOT_WORD:
208 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000209
210#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000211 case SRE_CATEGORY_UNI_DIGIT:
212 return SRE_UNI_IS_DIGIT(ch);
213 case SRE_CATEGORY_UNI_NOT_DIGIT:
214 return !SRE_UNI_IS_DIGIT(ch);
215 case SRE_CATEGORY_UNI_SPACE:
216 return SRE_UNI_IS_SPACE(ch);
217 case SRE_CATEGORY_UNI_NOT_SPACE:
218 return !SRE_UNI_IS_SPACE(ch);
219 case SRE_CATEGORY_UNI_WORD:
220 return SRE_UNI_IS_WORD(ch);
221 case SRE_CATEGORY_UNI_NOT_WORD:
222 return !SRE_UNI_IS_WORD(ch);
223 case SRE_CATEGORY_UNI_LINEBREAK:
224 return SRE_UNI_IS_LINEBREAK(ch);
225 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
226 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000227#else
228 case SRE_CATEGORY_UNI_DIGIT:
229 return SRE_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_NOT_DIGIT:
231 return !SRE_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_SPACE:
233 return SRE_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_NOT_SPACE:
235 return !SRE_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_WORD:
237 return SRE_LOC_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_NOT_WORD:
239 return !SRE_LOC_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_LINEBREAK:
241 return SRE_IS_LINEBREAK(ch);
242 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
243 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000244#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000245 }
246 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000247}
248
249/* helpers */
250
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000251static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000253{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000254 if (state->data_stack) {
255 free(state->data_stack);
256 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000257 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000258 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000259}
260
261static int
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000262data_stack_grow(SRE_STATE* state, int size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000263{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000264 int minsize, cursize;
265 minsize = state->data_stack_base+size;
266 cursize = state->data_stack_size;
267 if (cursize < minsize) {
268 void* stack;
269 cursize = minsize+minsize/4+1024;
270 TRACE(("allocate/grow stack %d\n", cursize));
271 stack = realloc(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000273 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000274 return SRE_ERROR_MEMORY;
275 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000276 state->data_stack = stack;
277 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000278 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000279 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000280}
281
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000282/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000283
284#define SRE_CHAR unsigned char
285#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000286#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000287#define SRE_CHARSET sre_charset
288#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000289#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000290#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000291#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000292#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000293
294#if defined(HAVE_UNICODE)
295
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000297#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000298#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000299
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000300#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000301#undef SRE_SEARCH
302#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000303#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000304#undef SRE_INFO
305#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000306#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000307#undef SRE_AT
308#undef SRE_CHAR
309
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000310/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000311
312#define SRE_CHAR Py_UNICODE
313#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000314#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000315#define SRE_CHARSET sre_ucharset
316#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000317#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000318#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000319#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000320#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000321#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000322
323#endif /* SRE_RECURSIVE */
324
325/* -------------------------------------------------------------------- */
326/* String matching engine */
327
328/* the following section is compiled twice, with different character
329 settings */
330
331LOCAL(int)
332SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
333{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000334 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000336 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000338 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000341 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 case SRE_AT_BEGINNING_LINE:
345 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000346 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000348 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000349 return (((void*) (ptr+1) == state->end &&
350 SRE_IS_LINEBREAK((int) ptr[0])) ||
351 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000353 case SRE_AT_END_LINE:
354 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000355 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000356
Fredrik Lundh770617b2001-01-14 15:06:11 +0000357 case SRE_AT_END_STRING:
358 return ((void*) ptr == state->end);
359
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000360 case SRE_AT_BOUNDARY:
361 if (state->beginning == state->end)
362 return 0;
363 that = ((void*) ptr > state->beginning) ?
364 SRE_IS_WORD((int) ptr[-1]) : 0;
365 this = ((void*) ptr < state->end) ?
366 SRE_IS_WORD((int) ptr[0]) : 0;
367 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000368
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000369 case SRE_AT_NON_BOUNDARY:
370 if (state->beginning == state->end)
371 return 0;
372 that = ((void*) ptr > state->beginning) ?
373 SRE_IS_WORD((int) ptr[-1]) : 0;
374 this = ((void*) ptr < state->end) ?
375 SRE_IS_WORD((int) ptr[0]) : 0;
376 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000377
378 case SRE_AT_LOC_BOUNDARY:
379 if (state->beginning == state->end)
380 return 0;
381 that = ((void*) ptr > state->beginning) ?
382 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
383 this = ((void*) ptr < state->end) ?
384 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
385 return this != that;
386
387 case SRE_AT_LOC_NON_BOUNDARY:
388 if (state->beginning == state->end)
389 return 0;
390 that = ((void*) ptr > state->beginning) ?
391 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
392 this = ((void*) ptr < state->end) ?
393 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
394 return this == that;
395
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000396#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000397 case SRE_AT_UNI_BOUNDARY:
398 if (state->beginning == state->end)
399 return 0;
400 that = ((void*) ptr > state->beginning) ?
401 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
402 this = ((void*) ptr < state->end) ?
403 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
404 return this != that;
405
406 case SRE_AT_UNI_NON_BOUNDARY:
407 if (state->beginning == state->end)
408 return 0;
409 that = ((void*) ptr > state->beginning) ?
410 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
411 this = ((void*) ptr < state->end) ?
412 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
413 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000414#endif
415
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000416 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000417
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000418 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000419}
420
421LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000422SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000423{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000424 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000425
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000427
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000428 for (;;) {
429 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000430
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000431 case SRE_OP_FAILURE:
432 return !ok;
433
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000434 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000435 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000436 if (ch == set[0])
437 return ok;
438 set++;
439 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000440
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000441 case SRE_OP_CATEGORY:
442 /* <CATEGORY> <code> */
443 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000444 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000445 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000446 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000447
Fredrik Lundh3562f112000-07-02 12:00:07 +0000448 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000449 if (sizeof(SRE_CODE) == 2) {
450 /* <CHARSET> <bitmap> (16 bits per code word) */
451 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
452 return ok;
453 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000454 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000455 else {
456 /* <CHARSET> <bitmap> (32 bits per code word) */
457 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
458 return ok;
459 set += 8;
460 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000461 break;
462
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000463 case SRE_OP_RANGE:
464 /* <RANGE> <lower> <upper> */
465 if (set[0] <= ch && ch <= set[1])
466 return ok;
467 set += 2;
468 break;
469
470 case SRE_OP_NEGATE:
471 ok = !ok;
472 break;
473
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000474 case SRE_OP_BIGCHARSET:
475 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
476 {
477 int count, block;
478 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000479
480 if (sizeof(SRE_CODE) == 2) {
481 block = ((unsigned char*)set)[ch >> 8];
482 set += 128;
483 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
484 return ok;
485 set += count*16;
486 }
487 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000488 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
489 * warnings when c's type supports only numbers < N+1 */
490 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000491 block = ((unsigned char*)set)[ch >> 8];
492 else
493 block = -1;
494 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000495 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000496 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
497 return ok;
498 set += count*8;
499 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000500 break;
501 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000502
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000503 default:
504 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000505 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000506 return 0;
507 }
508 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000509}
510
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000511LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000512
513LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000514SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000515{
516 SRE_CODE chr;
517 SRE_CHAR* ptr = state->ptr;
518 SRE_CHAR* end = state->end;
519 int i;
520
521 /* adjust end */
522 if (maxcount < end - ptr && maxcount != 65535)
523 end = ptr + maxcount;
524
525 switch (pattern[0]) {
526
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000527 case SRE_OP_IN:
528 /* repeated set */
529 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
530 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
531 ptr++;
532 break;
533
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000534 case SRE_OP_ANY:
535 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000536 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000537 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
538 ptr++;
539 break;
540
541 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000542 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000543 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000544 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000545 ptr = end;
546 break;
547
548 case SRE_OP_LITERAL:
549 /* repeated literal */
550 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000551 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000552 while (ptr < end && (SRE_CODE) *ptr == chr)
553 ptr++;
554 break;
555
556 case SRE_OP_LITERAL_IGNORE:
557 /* repeated literal */
558 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000559 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000560 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
561 ptr++;
562 break;
563
564 case SRE_OP_NOT_LITERAL:
565 /* repeated non-literal */
566 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000567 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000568 while (ptr < end && (SRE_CODE) *ptr != chr)
569 ptr++;
570 break;
Tim Peters3d563502006-01-21 02:47:53 +0000571
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000572 case SRE_OP_NOT_LITERAL_IGNORE:
573 /* repeated non-literal */
574 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000575 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000576 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
577 ptr++;
578 break;
579
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000580 default:
581 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000582 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000583 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000584 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 if (i < 0)
586 return i;
587 if (!i)
588 break;
589 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000590 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
591 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000592 return (SRE_CHAR*) state->ptr - ptr;
593 }
594
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000595 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000596 return ptr - (SRE_CHAR*) state->ptr;
597}
598
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    SRE_CHAR* stop = state->end;
    SRE_CHAR* text = state->ptr;
    int k;

    /* not enough input left for the recorded minimal length */
    if (pattern[3] && (stop - text) < pattern[3])
        return 0;

    /* no usable prefix recorded: only the info block itself is skipped */
    if (!(pattern[2] & SRE_INFO_PREFIX) || !(pattern[5] > 1))
        return pattern[0];

    /* <length> <skip> <prefix data> <overlap data> */
    for (k = 0; k < pattern[5]; k++) {
        if ((SRE_CODE) text[k] != pattern[7 + k])
            return 0;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000626
/* LASTMARK_SAVE() / LASTMARK_RESTORE() guard state->lastmark and
 * state->lastindex around recursive SRE_MATCH() calls that *fail* without
 * the current SRE_MATCH() returning right away (in other words, calls
 * after which we backtrack).  The cases:
 *
 * - The recursive SRE_MATCH() returned true: normally that is a success
 *   (ASSERT_NOT and similar being the atypical exceptions), so there is
 *   no reason to restore lastmark.
 *
 * - The recursive SRE_MATCH() returned false and the current SRE_MATCH()
 *   is returning to its caller: if the current call is the top of the
 *   recursion, returning false is a matching failure and it does not
 *   matter where lastmark points.  If it is *not* the top, the current
 *   call is itself a failed recursive SRE_MATCH(), and its caller deals
 *   with that failure by these same rules (restoring lastmark itself if
 *   necessary).
 *
 * - The recursive SRE_MATCH() returned false and execution continues in
 *   the outer 'for' loop: restore when breaking, since the next OP could
 *   depend on lastmark.
 *
 * - The recursive SRE_MATCH() returned false and will be called again
 *   from a local for/while loop: restore between iterations, since the
 *   recursive SRE_MATCH() could do anything and could depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 *
 * Both macros expect `state` and `ctx` to be in scope; the saved values
 * are kept in ctx->lastmark and ctx->lastindex.
 */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastindex = state->lastindex; \
        ctx->lastmark = state->lastmark; \
    } while (0)

#define LASTMARK_RESTORE() \
    do { \
        state->lastindex = ctx->lastindex; \
        state->lastmark = ctx->lastmark; \
    } while (0)
665
/* Control-flow helpers for SRE_MATCH().  They rely on a local `ret`
 * variable and an `exit:` label in the expanding function.
 *
 *   RETURN_ERROR(i)  - leave the function at once, returning i
 *   RETURN_FAILURE   - set ret = 0 and jump to exit
 *   RETURN_SUCCESS   - set ret = 1 and jump to exit
 */
#define RETURN_ERROR(i) do { return (i); } while (0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while (0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while (0)

/* Conditional variants keyed on a result code i: negative means error,
 * zero means failure, positive means success.  The argument is
 * parenthesized so that expressions such as `flags & 2` are tested as a
 * whole; note that it is still evaluated more than once, so it should
 * not have side effects. */
#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
676
/* Stringify a macro argument (used to print type names in TRACE output). */
#define SFY(x) #x

/* Data-stack primitives.
 *
 * They operate on state->data_stack, a byte buffer with capacity
 * state->data_stack_size whose first state->data_stack_base bytes are in
 * use.  DATA_STACK_ALLOC and DATA_STACK_PUSH additionally rely on these
 * names in the expanding scope: alloc_pos (ALLOC only), ctx and ctx_pos.
 * When the buffer is too small they call data_stack_grow(); a negative
 * result is returned from the enclosing function, and otherwise `ctx` is
 * re-derived from `ctx_pos` (presumably because growing can move the
 * buffer) unless no context exists yet (ctx_pos == -1).
 *
 * All arguments are parenthesized on use, and the values handed to TRACE
 * are cast to match their %d / %p conversions.
 */

/* Reserve sizeof(type) bytes on top of the stack, point `ptr` at them and
 * leave their offset in alloc_pos. */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), alloc_pos, (int) sizeof(type))); \
    if ((state)->data_stack_size < alloc_pos + sizeof(type)) { \
        int grow_status_ = data_stack_grow((state), sizeof(type)); \
        if (grow_status_ < 0) \
            return grow_status_; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    (ptr) = (type *)((state)->data_stack + alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)

/* Point `ptr` at the object of the given type stored at offset `pos`. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), (int) (pos))); \
    (ptr) = (type *)((state)->data_stack + (pos)); \
} while (0)

/* Copy `size` bytes from `data` onto the top of the stack.  `data` is
 * read only after a possible grow, i.e. after ctx has been refreshed. */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           (void *) (data), (int) (state)->data_stack_base, \
           (int) (size))); \
    if ((state)->data_stack_size < (state)->data_stack_base + (size)) { \
        int grow_status_ = data_stack_grow((state), (size)); \
        if (grow_status_ < 0) \
            return grow_status_; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack + (state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)

/* Copy the top `size` bytes of the stack into `data`; drop them from the
 * stack only when `discard` is non-zero. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           (void *) (data), \
           (int) ((state)->data_stack_base - (size)), (int) (size))); \
    memcpy((data), \
           (state)->data_stack + (state)->data_stack_base - (size), \
           (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* Drop the top `size` bytes of the stack without copying them anywhere. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           (int) ((state)->data_stack_base - (size)), (int) (size))); \
    (state)->data_stack_base -= (size); \
} while (0)
729
/* Shorthands for the DATA_STACK_* primitives that always act on the
 * `state` variable of the expanding scope.  For PUSH/POP/POP_DISCARD,
 * x is a pointer and the amount moved is the size of what it points to. */
#define DATA_PUSH(x)            DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x)             DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x)     DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p)         DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) DATA_STACK_LOOKUP_AT(state,t,p,pos)
740
/* Save / restore the first lastmark+1 entries of state->mark on the data
 * stack.  Nothing is done unless lastmark is greater than zero.  The
 * argument is parenthesized so that any expression can be passed.
 *
 *   MARK_PUSH         - copy the marks onto the stack
 *   MARK_POP          - copy them back and drop them from the stack
 *   MARK_POP_KEEP     - copy them back but leave them on the stack
 *   MARK_POP_DISCARD  - drop them from the stack without copying
 */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            /* snapshot the count in the caller's `i` first: the      \
               argument (e.g. ctx->lastmark) may change if the stack  \
               is reallocated during the push */                      \
            i = (lastmark); \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
758
/* Resume points for the non-recursive matcher.  DO_JUMP() stores one of
 * these codes in the new context's `jump` field; when that context exits,
 * the code selects the label at which the parent context continues.
 * JUMP_NONE marks the outermost context, which has no parent. */
#define JUMP_NONE            0
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
773
/* "Call" the matcher on nextpattern without using C recursion.
 *
 * A fresh SRE_MATCH_CONTEXT is allocated on the data stack and filled in
 * with the pattern to run, the JUMP_* code to resume with (jumpvalue) and
 * the stack offset of the current context.  It then becomes the current
 * context and control goes to the `entrance` label.  When that context
 * exits, the exit code uses the stored jump value to get back to
 * jumplabel, which this macro defines right after the goto.
 *
 * This expands to several statements rather than a single one, so it
 * must not be the unbraced body of an if/else/loop.  It uses nextctx,
 * ctx, ctx_pos and alloc_pos from the expanding scope.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->pattern = nextpattern; \
    nextctx->jump = jumpvalue; \
    nextctx->last_ctx_pos = ctx_pos; \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
784
785typedef struct {
786 int last_ctx_pos;
787 int jump;
788 SRE_CHAR* ptr;
789 SRE_CODE* pattern;
790 int count;
791 int lastmark;
792 int lastindex;
793 union {
794 SRE_CODE chr;
795 SRE_REPEAT* rep;
796 } u;
797} SRE_MATCH_CONTEXT;
798
799/* check if string matches the given pattern. returns <0 for
800 error, 0 for failure, and 1 for success */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000801LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000802SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000803{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000804 SRE_CHAR* end = state->end;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000805 int alloc_pos, ctx_pos = -1;
806 int i, ret = 0;
807 int jump;
Guido van Rossumb700df92000-03-31 14:59:30 +0000808
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000809 SRE_MATCH_CONTEXT* ctx;
810 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000811
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000812 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000813
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000814 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
815 ctx->last_ctx_pos = -1;
816 ctx->jump = JUMP_NONE;
817 ctx->pattern = pattern;
818 ctx_pos = alloc_pos;
819
820entrance:
821
822 ctx->ptr = state->ptr;
823
824 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000825 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000826 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000827 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000828 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 (end - ctx->ptr), ctx->pattern[3]));
830 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000831 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000832 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000833 }
834
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000835 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000836
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000837 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000839 case SRE_OP_MARK:
840 /* set mark */
841 /* <MARK> <gid> */
842 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
843 ctx->ptr, ctx->pattern[0]));
844 i = ctx->pattern[0];
845 if (i & 1)
846 state->lastindex = i/2 + 1;
847 if (i > state->lastmark) {
848 /* state->lastmark is the highest valid index in the
849 state->mark array. If it is increased by more than 1,
850 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000851 that these marks have not been encountered. */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000852 int j = state->lastmark + 1;
853 while (j < i)
854 state->mark[j++] = NULL;
855 state->lastmark = i;
856 }
857 state->mark[i] = ctx->ptr;
858 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000859 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000860
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 case SRE_OP_LITERAL:
862 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000863 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000864 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
865 ctx->ptr, *ctx->pattern));
866 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
867 RETURN_FAILURE;
868 ctx->pattern++;
869 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000871
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 case SRE_OP_NOT_LITERAL:
873 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000874 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000875 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
876 ctx->ptr, *ctx->pattern));
877 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
878 RETURN_FAILURE;
879 ctx->pattern++;
880 ctx->ptr++;
881 break;
882
883 case SRE_OP_SUCCESS:
884 /* end of pattern */
885 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
886 state->ptr = ctx->ptr;
887 RETURN_SUCCESS;
888
889 case SRE_OP_AT:
890 /* match at given position */
891 /* <AT> <code> */
892 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
893 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
894 RETURN_FAILURE;
895 ctx->pattern++;
896 break;
897
898 case SRE_OP_CATEGORY:
899 /* match at given category */
900 /* <CATEGORY> <code> */
901 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
902 ctx->ptr, *ctx->pattern));
903 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
904 RETURN_FAILURE;
905 ctx->pattern++;
906 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000907 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000910 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000911 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000912 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
913 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
914 RETURN_FAILURE;
915 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000916 break;
917
918 case SRE_OP_ANY_ALL:
919 /* match anything */
920 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000921 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
922 if (ctx->ptr >= end)
923 RETURN_FAILURE;
924 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000925 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000927 case SRE_OP_IN:
928 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000929 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000930 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
931 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
932 RETURN_FAILURE;
933 ctx->pattern += ctx->pattern[0];
934 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000935 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000936
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000938 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
939 ctx->pattern, ctx->ptr, ctx->pattern[0]));
940 if (ctx->ptr >= end ||
941 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
942 RETURN_FAILURE;
943 ctx->pattern++;
944 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000945 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000946
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000948 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
949 ctx->pattern, ctx->ptr, *ctx->pattern));
950 if (ctx->ptr >= end ||
951 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
952 RETURN_FAILURE;
953 ctx->pattern++;
954 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000955 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000956
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000957 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000958 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
959 if (ctx->ptr >= end
960 || !SRE_CHARSET(ctx->pattern+1,
961 (SRE_CODE)state->lower(*ctx->ptr)))
962 RETURN_FAILURE;
963 ctx->pattern += ctx->pattern[0];
964 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000965 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000966
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000967 case SRE_OP_JUMP:
968 case SRE_OP_INFO:
969 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000970 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000971 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
972 ctx->ptr, ctx->pattern[0]));
973 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000974 break;
975
976 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000977 /* alternation */
978 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000979 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000980 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000981 ctx->u.rep = state->repeat;
982 if (ctx->u.rep)
983 MARK_PUSH(ctx->lastmark);
984 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
985 if (ctx->pattern[1] == SRE_OP_LITERAL &&
986 (ctx->ptr >= end ||
987 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000988 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000989 if (ctx->pattern[1] == SRE_OP_IN &&
990 (ctx->ptr >= end ||
991 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000992 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000993 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000994 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000995 if (ret) {
996 if (ctx->u.rep)
997 MARK_POP_DISCARD(ctx->lastmark);
998 RETURN_ON_ERROR(ret);
999 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001000 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001001 if (ctx->u.rep)
1002 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001003 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001004 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001005 if (ctx->u.rep)
1006 MARK_POP_DISCARD(ctx->lastmark);
1007 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001008
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001009 case SRE_OP_REPEAT_ONE:
1010 /* match repeated sequence (maximizing regexp) */
1011
1012 /* this operator only works if the repeated item is
1013 exactly one character wide, and we're not already
1014 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001015 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001016
1017 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1018
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1020 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001021
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001022 if (ctx->ptr + ctx->pattern[1] > end)
1023 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001024
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001025 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001026
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001027 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1028 RETURN_ON_ERROR(ret);
1029 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1030 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001031 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001032
1033 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035 string. check if the rest of the pattern matches,
1036 and backtrack if not. */
1037
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001038 if (ctx->count < (int) ctx->pattern[1])
1039 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001041 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 state->ptr = ctx->ptr;
1044 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001045 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001046
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001047 LASTMARK_SAVE();
1048
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001049 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001050 /* tail starts with a literal. skip positions where
1051 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001052 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001053 for (;;) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 while (ctx->count >= (int) ctx->pattern[1] &&
1055 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1056 ctx->ptr--;
1057 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001058 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001059 if (ctx->count < (int) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001062 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1063 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 if (ret) {
1065 RETURN_ON_ERROR(ret);
1066 RETURN_SUCCESS;
1067 }
Tim Peters3d563502006-01-21 02:47:53 +00001068
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001069 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001070
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001071 ctx->ptr--;
1072 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001073 }
1074
1075 } else {
1076 /* general case */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001077 while (ctx->count >= (int) ctx->pattern[1]) {
1078 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001079 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1080 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 if (ret) {
1082 RETURN_ON_ERROR(ret);
1083 RETURN_SUCCESS;
1084 }
1085 ctx->ptr--;
1086 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001087 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001088 }
1089 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001090 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001091
Guido van Rossum41c99e72003-04-14 17:59:34 +00001092 case SRE_OP_MIN_REPEAT_ONE:
1093 /* match repeated sequence (minimizing regexp) */
1094
1095 /* this operator only works if the repeated item is
1096 exactly one character wide, and we're not already
1097 collecting backtracking points. for other cases,
1098 use the MIN_REPEAT operator */
1099
1100 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1101
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001102 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1103 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001104
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001105 if (ctx->ptr + ctx->pattern[1] > end)
1106 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001107
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001108 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 if (ctx->pattern[1] == 0)
1111 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001112 else {
1113 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001114 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1115 RETURN_ON_ERROR(ret);
1116 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1117 if (ret < (int) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001118 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001119 RETURN_FAILURE;
1120 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001121 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001122 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001123 }
1124
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001125 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001126 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 state->ptr = ctx->ptr;
1128 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001129
1130 } else {
1131 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001132 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001133 while ((int)ctx->pattern[2] == 65535
1134 || ctx->count <= (int)ctx->pattern[2]) {
1135 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1137 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001138 if (ret) {
1139 RETURN_ON_ERROR(ret);
1140 RETURN_SUCCESS;
1141 }
1142 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001143 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001144 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001145 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001147 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 assert(ret == 1);
1149 ctx->ptr++;
1150 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001151 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001152 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001153 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001154 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001155
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001156 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001157 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001158 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001159 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001160 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1161 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001162
1163 /* install new repeat context */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001164 ctx->u.rep = (SRE_REPEAT*) malloc(sizeof(*ctx->u.rep));
1165 ctx->u.rep->count = -1;
1166 ctx->u.rep->pattern = ctx->pattern;
1167 ctx->u.rep->prev = state->repeat;
1168 ctx->u.rep->last_ptr = NULL;
1169 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001170
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001171 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001172 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001173 state->repeat = ctx->u.rep->prev;
1174 free(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001175
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001176 if (ret) {
1177 RETURN_ON_ERROR(ret);
1178 RETURN_SUCCESS;
1179 }
1180 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001181
1182 case SRE_OP_MAX_UNTIL:
1183 /* maximizing repeat */
1184 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1185
1186 /* FIXME: we probably need to deal with zero-width
1187 matches in here... */
1188
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001189 ctx->u.rep = state->repeat;
1190 if (!ctx->u.rep)
1191 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001192
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001193 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001194
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001196
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1198 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001199
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001200 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001201 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001203 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1204 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001205 if (ret) {
1206 RETURN_ON_ERROR(ret);
1207 RETURN_SUCCESS;
1208 }
1209 ctx->u.rep->count = ctx->count-1;
1210 state->ptr = ctx->ptr;
1211 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001212 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001213
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001214 if ((ctx->count < ctx->u.rep->pattern[2] ||
1215 ctx->u.rep->pattern[2] == 65535) &&
1216 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001217 /* we may have enough matches, but if we can
1218 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001219 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001220 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001221 MARK_PUSH(ctx->lastmark);
1222 /* zero-width match protection */
1223 DATA_PUSH(&ctx->u.rep->last_ptr);
1224 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1226 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 DATA_POP(&ctx->u.rep->last_ptr);
1228 if (ret) {
1229 MARK_POP_DISCARD(ctx->lastmark);
1230 RETURN_ON_ERROR(ret);
1231 RETURN_SUCCESS;
1232 }
1233 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001234 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001235 ctx->u.rep->count = ctx->count-1;
1236 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001237 }
1238
1239 /* cannot match more repeated items here. make sure the
1240 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001241 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001242 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001243 RETURN_ON_SUCCESS(ret);
1244 state->repeat = ctx->u.rep;
1245 state->ptr = ctx->ptr;
1246 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001247
1248 case SRE_OP_MIN_UNTIL:
1249 /* minimizing repeat */
1250 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1251
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001252 ctx->u.rep = state->repeat;
1253 if (!ctx->u.rep)
1254 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001255
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001256 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001257
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001258 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001259
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001260 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1261 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001262
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001263 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001264 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001266 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1267 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001268 if (ret) {
1269 RETURN_ON_ERROR(ret);
1270 RETURN_SUCCESS;
1271 }
1272 ctx->u.rep->count = ctx->count-1;
1273 state->ptr = ctx->ptr;
1274 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001275 }
1276
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001277 LASTMARK_SAVE();
1278
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001279 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001280 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001281 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001282 if (ret) {
1283 RETURN_ON_ERROR(ret);
1284 RETURN_SUCCESS;
1285 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001286
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 state->repeat = ctx->u.rep;
1288 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001289
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001290 LASTMARK_RESTORE();
1291
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001292 if (ctx->count >= ctx->u.rep->pattern[2]
1293 && ctx->u.rep->pattern[2] != 65535)
1294 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001295
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001296 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001297 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1298 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001299 if (ret) {
1300 RETURN_ON_ERROR(ret);
1301 RETURN_SUCCESS;
1302 }
1303 ctx->u.rep->count = ctx->count-1;
1304 state->ptr = ctx->ptr;
1305 RETURN_FAILURE;
1306
1307 case SRE_OP_GROUPREF:
1308 /* match backreference */
1309 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1310 ctx->ptr, ctx->pattern[0]));
1311 i = ctx->pattern[0];
1312 {
1313 int groupref = i+i;
1314 if (groupref >= state->lastmark) {
1315 RETURN_FAILURE;
1316 } else {
1317 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1318 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1319 if (!p || !e || e < p)
1320 RETURN_FAILURE;
1321 while (p < e) {
1322 if (ctx->ptr >= end || *ctx->ptr != *p)
1323 RETURN_FAILURE;
1324 p++; ctx->ptr++;
1325 }
1326 }
1327 }
1328 ctx->pattern++;
1329 break;
1330
1331 case SRE_OP_GROUPREF_IGNORE:
1332 /* match backreference */
1333 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1334 ctx->ptr, ctx->pattern[0]));
1335 i = ctx->pattern[0];
1336 {
1337 int groupref = i+i;
1338 if (groupref >= state->lastmark) {
1339 RETURN_FAILURE;
1340 } else {
1341 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1342 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1343 if (!p || !e || e < p)
1344 RETURN_FAILURE;
1345 while (p < e) {
1346 if (ctx->ptr >= end ||
1347 state->lower(*ctx->ptr) != state->lower(*p))
1348 RETURN_FAILURE;
1349 p++; ctx->ptr++;
1350 }
1351 }
1352 }
1353 ctx->pattern++;
1354 break;
1355
1356 case SRE_OP_GROUPREF_EXISTS:
1357 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1358 ctx->ptr, ctx->pattern[0]));
1359 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1360 i = ctx->pattern[0];
1361 {
1362 int groupref = i+i;
1363 if (groupref >= state->lastmark) {
1364 ctx->pattern += ctx->pattern[1];
1365 break;
1366 } else {
1367 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1368 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1369 if (!p || !e || e < p) {
1370 ctx->pattern += ctx->pattern[1];
1371 break;
1372 }
1373 }
1374 }
1375 ctx->pattern += 2;
1376 break;
1377
1378 case SRE_OP_ASSERT:
1379 /* assert subpattern */
1380 /* <ASSERT> <skip> <back> <pattern> */
1381 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1382 ctx->ptr, ctx->pattern[1]));
1383 state->ptr = ctx->ptr - ctx->pattern[1];
1384 if (state->ptr < state->beginning)
1385 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001386 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001387 RETURN_ON_FAILURE(ret);
1388 ctx->pattern += ctx->pattern[0];
1389 break;
1390
1391 case SRE_OP_ASSERT_NOT:
1392 /* assert not subpattern */
1393 /* <ASSERT_NOT> <skip> <back> <pattern> */
1394 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1395 ctx->ptr, ctx->pattern[1]));
1396 state->ptr = ctx->ptr - ctx->pattern[1];
1397 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001398 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001399 if (ret) {
1400 RETURN_ON_ERROR(ret);
1401 RETURN_FAILURE;
1402 }
1403 }
1404 ctx->pattern += ctx->pattern[0];
1405 break;
1406
1407 case SRE_OP_FAILURE:
1408 /* immediate failure */
1409 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1410 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001411
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001412 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001413 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1414 ctx->pattern[-1]));
1415 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001416 }
1417 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001418
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001419exit:
1420 ctx_pos = ctx->last_ctx_pos;
1421 jump = ctx->jump;
1422 DATA_POP_DISCARD(ctx);
1423 if (ctx_pos == -1)
1424 return ret;
1425 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1426
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001427 switch (jump) {
1428 case JUMP_MAX_UNTIL_2:
1429 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1430 goto jump_max_until_2;
1431 case JUMP_MAX_UNTIL_3:
1432 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1433 goto jump_max_until_3;
1434 case JUMP_MIN_UNTIL_2:
1435 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1436 goto jump_min_until_2;
1437 case JUMP_MIN_UNTIL_3:
1438 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1439 goto jump_min_until_3;
1440 case JUMP_BRANCH:
1441 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1442 goto jump_branch;
1443 case JUMP_MAX_UNTIL_1:
1444 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1445 goto jump_max_until_1;
1446 case JUMP_MIN_UNTIL_1:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1448 goto jump_min_until_1;
1449 case JUMP_REPEAT:
1450 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1451 goto jump_repeat;
1452 case JUMP_REPEAT_ONE_1:
1453 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_repeat_one_1;
1455 case JUMP_REPEAT_ONE_2:
1456 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1457 goto jump_repeat_one_2;
1458 case JUMP_MIN_REPEAT_ONE:
1459 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1460 goto jump_min_repeat_one;
1461 case JUMP_ASSERT:
1462 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1463 goto jump_assert;
1464 case JUMP_ASSERT_NOT:
1465 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1466 goto jump_assert_not;
1467 case JUMP_NONE:
1468 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1469 break;
1470 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001471
1472 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001473}
1474
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001475LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001476SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1477{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001478 SRE_CHAR* ptr = state->start;
1479 SRE_CHAR* end = state->end;
1480 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001481 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001482 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001483 SRE_CODE* prefix = NULL;
1484 SRE_CODE* charset = NULL;
1485 SRE_CODE* overlap = NULL;
1486 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001487
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001488 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001489 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001490 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001491
1492 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001493
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001494 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001495 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001496 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001497 end -= pattern[3]-1;
1498 if (end <= ptr)
1499 end = ptr+1;
1500 }
1501
Fredrik Lundh3562f112000-07-02 12:00:07 +00001502 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001503 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001504 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001505 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001506 prefix_skip = pattern[6];
1507 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001508 overlap = prefix + prefix_len - 1;
1509 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001510 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001511 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001512 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001513
1514 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001515 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001516
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001517 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1518 TRACE(("charset = %p\n", charset));
1519
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001520#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001521 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001522 /* pattern starts with a known prefix. use the overlap
1523 table to skip forward as fast as we possibly can */
1524 int i = 0;
1525 end = state->end;
1526 while (ptr < end) {
1527 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001528 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001529 if (!i)
1530 break;
1531 else
1532 i = overlap[i];
1533 } else {
1534 if (++i == prefix_len) {
1535 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001536 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1537 state->start = ptr + 1 - prefix_len;
1538 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001539 if (flags & SRE_INFO_LITERAL)
1540 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001541 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001542 if (status != 0)
1543 return status;
1544 /* close but no cigar -- try again */
1545 i = overlap[i];
1546 }
1547 break;
1548 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001549 }
1550 ptr++;
1551 }
1552 return 0;
1553 }
1554#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001555
Fredrik Lundh3562f112000-07-02 12:00:07 +00001556 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001557 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001558 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001559 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001560 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001561 for (;;) {
1562 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1563 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001564 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001565 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001566 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001567 state->start = ptr;
1568 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001569 if (flags & SRE_INFO_LITERAL)
1570 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001571 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001572 if (status != 0)
1573 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001574 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001575 } else if (charset) {
1576 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001577 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001578 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001579 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001581 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001582 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001583 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 state->start = ptr;
1585 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001586 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001587 if (status != 0)
1588 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001589 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001590 }
1591 } else
1592 /* general case */
1593 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001594 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001595 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001596 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001597 if (status != 0)
1598 break;
1599 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001600
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001601 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001602}
Tim Peters3d563502006-01-21 02:47:53 +00001603
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001604LOCAL(int)
1605SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1606{
1607 /* check if given string is a literal template (i.e. no escapes) */
1608 while (len-- > 0)
1609 if (*ptr++ == '\\')
1610 return 0;
1611 return 1;
1612}
Guido van Rossumb700df92000-03-31 14:59:30 +00001613
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001614#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001615
1616/* -------------------------------------------------------------------- */
1617/* factories and destructors */
1618
1619/* see sre.h for object declarations */
1620
Jeremy Hylton938ace62002-07-17 16:30:39 +00001621static PyTypeObject Pattern_Type;
1622static PyTypeObject Match_Type;
1623static PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001624
1625static PyObject *
1626_compile(PyObject* self_, PyObject* args)
1627{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001628 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001629
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001630 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001631 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001632
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001633 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001634 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001635 PyObject* code;
1636 int groups = 0;
1637 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001638 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001639 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1640 &PyList_Type, &code, &groups,
1641 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001642 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001643
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001644 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001645
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001646 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001647 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001648 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001649
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001650 self->codesize = n;
1651
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001652 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001653 PyObject *o = PyList_GET_ITEM(code, i);
Tim Peters3d563502006-01-21 02:47:53 +00001654 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
1655 : PyLong_AsUnsignedLong(o);
1656 self->code[i] = (SRE_CODE) value;
1657 if ((unsigned long) self->code[i] != value) {
1658 PyErr_SetString(PyExc_OverflowError,
1659 "regular expression code size limit exceeded");
1660 break;
1661 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001662 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001663
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001664 if (PyErr_Occurred()) {
1665 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001666 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001667 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001668
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001669 Py_INCREF(pattern);
1670 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001671
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001672 self->flags = flags;
1673
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001674 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001675
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 Py_XINCREF(groupindex);
1677 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001678
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001679 Py_XINCREF(indexgroup);
1680 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001681
Raymond Hettinger027bb632004-05-31 03:09:25 +00001682 self->weakreflist = NULL;
1683
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001684 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001685}
1686
1687static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001688sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001689{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001690 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001691}
1692
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001693static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001694sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001695{
1696 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001697 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001698 return NULL;
1699 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001700 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001701 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001702#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001703 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001704#else
1705 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001706#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001707 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001708}
1709
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001710LOCAL(void)
1711state_reset(SRE_STATE* state)
1712{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001713 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001714 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001715
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001716 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001717 state->lastindex = -1;
1718
1719 state->repeat = NULL;
1720
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001721 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001722}
1723
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001724static void*
1725getstring(PyObject* string, int* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001726{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001727 /* given a python object, return a data pointer, a length (in
1728 characters), and a character size. return NULL if the object
1729 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001730
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001731 PyBufferProcs *buffer;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001732 int size, bytes, charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001733 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001734
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001735#if defined(HAVE_UNICODE)
1736 if (PyUnicode_Check(string)) {
1737 /* unicode strings doesn't always support the buffer interface */
1738 ptr = (void*) PyUnicode_AS_DATA(string);
1739 bytes = PyUnicode_GET_DATA_SIZE(string);
1740 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001741 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001742
1743 } else {
1744#endif
1745
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001746 /* get pointer to string buffer */
1747 buffer = string->ob_type->tp_as_buffer;
1748 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1749 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001750 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001751 return NULL;
1752 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001753
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001754 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001755 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1756 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001757 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1758 return NULL;
1759 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001760
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001761 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001762#if PY_VERSION_HEX >= 0x01060000
1763 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001764#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001765 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001766#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001767
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001768 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001769 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001770#if defined(HAVE_UNICODE)
1771 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001772 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001773#endif
1774 else {
1775 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1776 return NULL;
1777 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001778
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001779#if defined(HAVE_UNICODE)
1780 }
1781#endif
1782
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001783 *p_length = size;
1784 *p_charsize = charsize;
1785
1786 return ptr;
1787}
1788
1789LOCAL(PyObject*)
1790state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1791 int start, int end)
1792{
1793 /* prepare state object */
1794
1795 int length;
1796 int charsize;
1797 void* ptr;
1798
1799 memset(state, 0, sizeof(SRE_STATE));
1800
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001801 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001802 state->lastindex = -1;
1803
1804 ptr = getstring(string, &length, &charsize);
1805 if (!ptr)
1806 return NULL;
1807
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001808 /* adjust boundaries */
1809 if (start < 0)
1810 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001811 else if (start > length)
1812 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001813
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001814 if (end < 0)
1815 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001816 else if (end > length)
1817 end = length;
1818
1819 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001820
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001821 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001822
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001823 state->start = (void*) ((char*) ptr + start * state->charsize);
1824 state->end = (void*) ((char*) ptr + end * state->charsize);
1825
1826 Py_INCREF(string);
1827 state->string = string;
1828 state->pos = start;
1829 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001830
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001831 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001832 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001833 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001834#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001835 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001836#else
1837 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001838#endif
1839 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001840 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001841
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001842 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001843}
1844
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001845LOCAL(void)
1846state_fini(SRE_STATE* state)
1847{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001848 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001849 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001850}
1851
/* convert a pointer into the subject string to a character index,
   counted from state->beginning in units of state->charsize bytes */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1855
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001856LOCAL(PyObject*)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001857state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001858{
Fredrik Lundh58100642000-08-09 09:14:35 +00001859 int i, j;
1860
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001861 index = (index - 1) * 2;
1862
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001863 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001864 if (empty)
1865 /* want empty string */
1866 i = j = 0;
1867 else {
1868 Py_INCREF(Py_None);
1869 return Py_None;
1870 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001871 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001872 i = STATE_OFFSET(state, state->mark[index]);
1873 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001874 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001875
Fredrik Lundh58100642000-08-09 09:14:35 +00001876 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001877}
1878
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001879static void
1880pattern_error(int status)
1881{
1882 switch (status) {
1883 case SRE_ERROR_RECURSION_LIMIT:
1884 PyErr_SetString(
1885 PyExc_RuntimeError,
1886 "maximum recursion limit exceeded"
1887 );
1888 break;
1889 case SRE_ERROR_MEMORY:
1890 PyErr_NoMemory();
1891 break;
1892 default:
1893 /* other error codes indicate compiler/engine bugs */
1894 PyErr_SetString(
1895 PyExc_RuntimeError,
1896 "internal error in regular expression engine"
1897 );
1898 }
1899}
1900
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001901static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001902pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001903{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001904 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001905
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001906 MatchObject* match;
1907 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001908 char* base;
1909 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001911 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001912
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001913 /* create match object (with room for extra group marks) */
1914 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001915 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 if (!match)
1917 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001918
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001919 Py_INCREF(pattern);
1920 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001921
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001922 Py_INCREF(state->string);
1923 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001924
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001925 match->regs = NULL;
1926 match->groups = pattern->groups+1;
1927
1928 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001929
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001930 base = (char*) state->beginning;
1931 n = state->charsize;
1932
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001933 match->mark[0] = ((char*) state->start - base) / n;
1934 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001935
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001936 for (i = j = 0; i < pattern->groups; i++, j+=2)
1937 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1938 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1939 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1940 } else
1941 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1942
1943 match->pos = state->pos;
1944 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001945
Fredrik Lundh6f013982000-07-03 18:44:21 +00001946 match->lastindex = state->lastindex;
1947
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001948 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001949
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001950 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001951
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001952 /* no match */
1953 Py_INCREF(Py_None);
1954 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001955
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001956 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001958 /* internal error */
1959 pattern_error(status);
1960 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001961}
1962
1963static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001964pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001965{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001966 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001967
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001968 ScannerObject* self;
1969
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001970 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001971 int start = 0;
1972 int end = INT_MAX;
1973 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1974 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001975
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001976 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001977 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001978 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001979 return NULL;
1980
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001981 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001982 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001983 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001984 return NULL;
1985 }
1986
1987 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001988 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001989
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001990 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001991}
1992
Guido van Rossumb700df92000-03-31 14:59:30 +00001993static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001994pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001995{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001996 if (self->weakreflist != NULL)
1997 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001998 Py_XDECREF(self->pattern);
1999 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00002000 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002001 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002002}
2003
2004static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002005pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002006{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002007 SRE_STATE state;
2008 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002009
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002010 PyObject* string;
2011 int start = 0;
2012 int end = INT_MAX;
Jeremy Hyltonaf68c872005-12-10 18:50:16 +00002013 static const char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002014 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
2015 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002016 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002017
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002018 string = state_init(&state, self, string, start, end);
2019 if (!string)
2020 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002021
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002022 state.ptr = state.start;
2023
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002024 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
2025
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002026 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002027 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002028 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002029#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002030 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002031#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002032 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002033
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002034 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2035
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002036 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002037
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002038 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002039}
2040
2041static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002042pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002043{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002044 SRE_STATE state;
2045 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002046
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002047 PyObject* string;
2048 int start = 0;
2049 int end = INT_MAX;
Jeremy Hyltonaf68c872005-12-10 18:50:16 +00002050 static const char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002051 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
2052 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002053 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002054
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 string = state_init(&state, self, string, start, end);
2056 if (!string)
2057 return NULL;
2058
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002059 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
2060
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002061 if (state.charsize == 1) {
2062 status = sre_search(&state, PatternObject_GetCode(self));
2063 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002064#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002065 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002066#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002067 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002068
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002069 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2070
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002071 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002072
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002074}
2075
2076static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002077call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002078{
2079 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002080 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002081 PyObject* func;
2082 PyObject* result;
2083
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002084 if (!args)
2085 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002086 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002087 if (!name)
2088 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002089 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002090 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002091 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002092 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002093 func = PyObject_GetAttrString(mod, function);
2094 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002095 if (!func)
2096 return NULL;
2097 result = PyObject_CallObject(func, args);
2098 Py_DECREF(func);
2099 Py_DECREF(args);
2100 return result;
2101}
2102
#ifdef USE_BUILTIN_COPY
/* Replace *object, in place, with copy.deepcopy(*object, memo).

   On success the old reference held in *object is released, the new
   one is stored, and 1 is returned.  On failure *object is left
   untouched and 0 is returned with an exception set.

   A NULL slot is treated as "nothing to copy" and reported as
   success: optional pattern fields such as groupindex may be NULL,
   and passing NULL to PyTuple_Pack would crash. */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* copy;

    if (!*object)
        return 1; /* empty slot; nothing to copy */

    copy = call(
        "copy", "deepcopy",
        PyTuple_Pack(2, *object, memo)
        );
    if (!copy)
        return 0;

    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
2122
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002123static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002124join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002125{
2126 /* join list elements */
2127
2128 PyObject* joiner;
2129#if PY_VERSION_HEX >= 0x01060000
2130 PyObject* function;
2131 PyObject* args;
2132#endif
2133 PyObject* result;
2134
2135 switch (PyList_GET_SIZE(list)) {
2136 case 0:
2137 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00002138 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002139 case 1:
2140 result = PyList_GET_ITEM(list, 0);
2141 Py_INCREF(result);
2142 Py_DECREF(list);
2143 return result;
2144 }
2145
2146 /* two or more elements: slice out a suitable separator from the
2147 first member, and use that to join the entire list */
2148
2149 joiner = PySequence_GetSlice(pattern, 0, 0);
2150 if (!joiner)
2151 return NULL;
2152
2153#if PY_VERSION_HEX >= 0x01060000
2154 function = PyObject_GetAttrString(joiner, "join");
2155 if (!function) {
2156 Py_DECREF(joiner);
2157 return NULL;
2158 }
2159 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002160 if (!args) {
2161 Py_DECREF(function);
2162 Py_DECREF(joiner);
2163 return NULL;
2164 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002165 PyTuple_SET_ITEM(args, 0, list);
2166 result = PyObject_CallObject(function, args);
2167 Py_DECREF(args); /* also removes list */
2168 Py_DECREF(function);
2169#else
2170 result = call(
2171 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002172 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002173 );
2174#endif
2175 Py_DECREF(joiner);
2176
2177 return result;
2178}
2179
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002180static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002181pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002182{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002183 SRE_STATE state;
2184 PyObject* list;
2185 int status;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002186 int i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002187
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002188 PyObject* string;
2189 int start = 0;
2190 int end = INT_MAX;
Jeremy Hyltonaf68c872005-12-10 18:50:16 +00002191 static const char* kwlist[] = { "source", "pos", "endpos", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002192 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2193 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002194 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002195
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002196 string = state_init(&state, self, string, start, end);
2197 if (!string)
2198 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002199
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002200 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002201 if (!list) {
2202 state_fini(&state);
2203 return NULL;
2204 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002205
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002206 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002207
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002208 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002209
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002210 state_reset(&state);
2211
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002212 state.ptr = state.start;
2213
2214 if (state.charsize == 1) {
2215 status = sre_search(&state, PatternObject_GetCode(self));
2216 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002217#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002218 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002219#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002220 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002221
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002222 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002223 if (status == 0)
2224 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002225 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002226 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002227 }
Tim Peters3d563502006-01-21 02:47:53 +00002228
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002229 /* don't bother to build a match object */
2230 switch (self->groups) {
2231 case 0:
2232 b = STATE_OFFSET(&state, state.start);
2233 e = STATE_OFFSET(&state, state.ptr);
2234 item = PySequence_GetSlice(string, b, e);
2235 if (!item)
2236 goto error;
2237 break;
2238 case 1:
2239 item = state_getslice(&state, 1, string, 1);
2240 if (!item)
2241 goto error;
2242 break;
2243 default:
2244 item = PyTuple_New(self->groups);
2245 if (!item)
2246 goto error;
2247 for (i = 0; i < self->groups; i++) {
2248 PyObject* o = state_getslice(&state, i+1, string, 1);
2249 if (!o) {
2250 Py_DECREF(item);
2251 goto error;
2252 }
2253 PyTuple_SET_ITEM(item, i, o);
2254 }
2255 break;
2256 }
2257
2258 status = PyList_Append(list, item);
2259 Py_DECREF(item);
2260 if (status < 0)
2261 goto error;
2262
2263 if (state.ptr == state.start)
2264 state.start = (void*) ((char*) state.ptr + state.charsize);
2265 else
2266 state.start = state.ptr;
2267
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002268 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002269
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002270 state_fini(&state);
2271 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002272
2273error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002274 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002275 state_fini(&state);
2276 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002277
Guido van Rossumb700df92000-03-31 14:59:30 +00002278}
2279
#if PY_VERSION_HEX >= 0x02020000
/* pattern.finditer(...)

   Builds a scanner for the given arguments, fetches its bound
   "search" attribute and wraps it in a callable-iterator that stops
   when the call returns None.  Returns the iterator, or NULL with an
   exception set if any step fails. */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    PyObject* result = NULL;
    PyObject* scan;

    scan = pattern_scanner(pattern, args);
    if (scan != NULL) {
        PyObject* bound = PyObject_GetAttrString(scan, "search");
        /* the attribute lookup result is all we need from here on */
        Py_DECREF(scan);
        if (bound != NULL) {
            result = PyCallIter_New(bound, Py_None);
            Py_DECREF(bound);
        }
    }

    return result;
}
#endif
2303
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002304static PyObject*
2305pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2306{
2307 SRE_STATE state;
2308 PyObject* list;
2309 PyObject* item;
2310 int status;
2311 int n;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002312 int i;
2313 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002314
2315 PyObject* string;
2316 int maxsplit = 0;
Jeremy Hyltonaf68c872005-12-10 18:50:16 +00002317 static const char* kwlist[] = { "source", "maxsplit", NULL };
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002318 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2319 &string, &maxsplit))
2320 return NULL;
2321
2322 string = state_init(&state, self, string, 0, INT_MAX);
2323 if (!string)
2324 return NULL;
2325
2326 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002327 if (!list) {
2328 state_fini(&state);
2329 return NULL;
2330 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002331
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002332 n = 0;
2333 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002334
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002335 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002336
2337 state_reset(&state);
2338
2339 state.ptr = state.start;
2340
2341 if (state.charsize == 1) {
2342 status = sre_search(&state, PatternObject_GetCode(self));
2343 } else {
2344#if defined(HAVE_UNICODE)
2345 status = sre_usearch(&state, PatternObject_GetCode(self));
2346#endif
2347 }
2348
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002349 if (status <= 0) {
2350 if (status == 0)
2351 break;
2352 pattern_error(status);
2353 goto error;
2354 }
Tim Peters3d563502006-01-21 02:47:53 +00002355
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002356 if (state.start == state.ptr) {
2357 if (last == state.end)
2358 break;
2359 /* skip one character */
2360 state.start = (void*) ((char*) state.ptr + state.charsize);
2361 continue;
2362 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002363
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002364 /* get segment before this match */
2365 item = PySequence_GetSlice(
2366 string, STATE_OFFSET(&state, last),
2367 STATE_OFFSET(&state, state.start)
2368 );
2369 if (!item)
2370 goto error;
2371 status = PyList_Append(list, item);
2372 Py_DECREF(item);
2373 if (status < 0)
2374 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002375
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002376 /* add groups (if any) */
2377 for (i = 0; i < self->groups; i++) {
2378 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002379 if (!item)
2380 goto error;
2381 status = PyList_Append(list, item);
2382 Py_DECREF(item);
2383 if (status < 0)
2384 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002385 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002386
2387 n = n + 1;
2388
2389 last = state.start = state.ptr;
2390
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002391 }
2392
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002393 /* get segment following last match (even if empty) */
2394 item = PySequence_GetSlice(
2395 string, STATE_OFFSET(&state, last), state.endpos
2396 );
2397 if (!item)
2398 goto error;
2399 status = PyList_Append(list, item);
2400 Py_DECREF(item);
2401 if (status < 0)
2402 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002403
2404 state_fini(&state);
2405 return list;
2406
2407error:
2408 Py_DECREF(list);
2409 state_fini(&state);
2410 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002411
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002412}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002413
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002414static PyObject*
2415pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2416 int count, int subn)
2417{
2418 SRE_STATE state;
2419 PyObject* list;
2420 PyObject* item;
2421 PyObject* filter;
2422 PyObject* args;
2423 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002424 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002425 int status;
2426 int n;
2427 int i, b, e;
2428 int filter_is_callable;
2429
Fredrik Lundhdac58492001-10-21 21:48:30 +00002430 if (PyCallable_Check(template)) {
2431 /* sub/subn takes either a function or a template */
2432 filter = template;
2433 Py_INCREF(filter);
2434 filter_is_callable = 1;
2435 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002436 /* if not callable, check if it's a literal string */
2437 int literal;
2438 ptr = getstring(template, &n, &b);
2439 if (ptr) {
2440 if (b == 1) {
2441 literal = sre_literal_template(ptr, n);
2442 } else {
2443#if defined(HAVE_UNICODE)
2444 literal = sre_uliteral_template(ptr, n);
2445#endif
2446 }
2447 } else {
2448 PyErr_Clear();
2449 literal = 0;
2450 }
2451 if (literal) {
2452 filter = template;
2453 Py_INCREF(filter);
2454 filter_is_callable = 0;
2455 } else {
2456 /* not a literal; hand it over to the template compiler */
2457 filter = call(
2458 SRE_MODULE, "_subx",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002459 PyTuple_Pack(2, self, template)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002460 );
2461 if (!filter)
2462 return NULL;
2463 filter_is_callable = PyCallable_Check(filter);
2464 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002465 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466
2467 string = state_init(&state, self, string, 0, INT_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002468 if (!string) {
2469 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002470 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002471 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002472
2473 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002474 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002475 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002476 state_fini(&state);
2477 return NULL;
2478 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002479
2480 n = i = 0;
2481
2482 while (!count || n < count) {
2483
2484 state_reset(&state);
2485
2486 state.ptr = state.start;
2487
2488 if (state.charsize == 1) {
2489 status = sre_search(&state, PatternObject_GetCode(self));
2490 } else {
2491#if defined(HAVE_UNICODE)
2492 status = sre_usearch(&state, PatternObject_GetCode(self));
2493#endif
2494 }
2495
2496 if (status <= 0) {
2497 if (status == 0)
2498 break;
2499 pattern_error(status);
2500 goto error;
2501 }
Tim Peters3d563502006-01-21 02:47:53 +00002502
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002503 b = STATE_OFFSET(&state, state.start);
2504 e = STATE_OFFSET(&state, state.ptr);
2505
2506 if (i < b) {
2507 /* get segment before this match */
2508 item = PySequence_GetSlice(string, i, b);
2509 if (!item)
2510 goto error;
2511 status = PyList_Append(list, item);
2512 Py_DECREF(item);
2513 if (status < 0)
2514 goto error;
2515
2516 } else if (i == b && i == e && n > 0)
2517 /* ignore empty match on latest position */
2518 goto next;
2519
2520 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002521 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002522 match = pattern_new_match(self, &state, 1);
2523 if (!match)
2524 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002525 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002526 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002527 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002528 goto error;
2529 }
2530 item = PyObject_CallObject(filter, args);
2531 Py_DECREF(args);
2532 Py_DECREF(match);
2533 if (!item)
2534 goto error;
2535 } else {
2536 /* filter is literal string */
2537 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002538 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002539 }
2540
2541 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002542 if (item != Py_None) {
2543 status = PyList_Append(list, item);
2544 Py_DECREF(item);
2545 if (status < 0)
2546 goto error;
2547 }
Tim Peters3d563502006-01-21 02:47:53 +00002548
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002549 i = e;
2550 n = n + 1;
2551
2552next:
2553 /* move on */
2554 if (state.ptr == state.start)
2555 state.start = (void*) ((char*) state.ptr + state.charsize);
2556 else
2557 state.start = state.ptr;
2558
2559 }
2560
2561 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002562 if (i < state.endpos) {
2563 item = PySequence_GetSlice(string, i, state.endpos);
2564 if (!item)
2565 goto error;
2566 status = PyList_Append(list, item);
2567 Py_DECREF(item);
2568 if (status < 0)
2569 goto error;
2570 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002571
2572 state_fini(&state);
2573
Guido van Rossum4e173842001-12-07 04:25:10 +00002574 Py_DECREF(filter);
2575
Fredrik Lundhdac58492001-10-21 21:48:30 +00002576 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002577 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002578
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002579 if (!item)
2580 return NULL;
2581
2582 if (subn)
2583 return Py_BuildValue("Ni", item, n);
2584
2585 return item;
2586
2587error:
2588 Py_DECREF(list);
2589 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002590 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002591 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002592
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002593}
2594
2595static PyObject*
2596pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2597{
2598 PyObject* template;
2599 PyObject* string;
2600 int count = 0;
Jeremy Hyltonaf68c872005-12-10 18:50:16 +00002601 static const char* kwlist[] = { "repl", "string", "count", NULL };
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002602 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2603 &template, &string, &count))
2604 return NULL;
2605
2606 return pattern_subx(self, template, string, count, 0);
2607}
2608
2609static PyObject*
2610pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2611{
2612 PyObject* template;
2613 PyObject* string;
2614 int count = 0;
Jeremy Hyltonaf68c872005-12-10 18:50:16 +00002615 static const char* kwlist[] = { "repl", "string", "count", NULL };
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002616 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2617 &template, &string, &count))
2618 return NULL;
2619
2620 return pattern_subx(self, template, string, count, 1);
2621}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002622
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002623static PyObject*
2624pattern_copy(PatternObject* self, PyObject* args)
2625{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002626#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002627 PatternObject* copy;
2628 int offset;
2629
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002630 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2631 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002632
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002633 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2634 if (!copy)
2635 return NULL;
2636
2637 offset = offsetof(PatternObject, groups);
2638
2639 Py_XINCREF(self->groupindex);
2640 Py_XINCREF(self->indexgroup);
2641 Py_XINCREF(self->pattern);
2642
2643 memcpy((char*) copy + offset, (char*) self + offset,
2644 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002645 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002646
2647 return (PyObject*) copy;
2648#else
2649 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2650 return NULL;
2651#endif
2652}
2653
2654static PyObject*
2655pattern_deepcopy(PatternObject* self, PyObject* args)
2656{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002657#ifdef USE_BUILTIN_COPY
2658 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002659
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002660 PyObject* memo;
2661 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2662 return NULL;
2663
2664 copy = (PatternObject*) pattern_copy(self, Py_None);
2665 if (!copy)
2666 return NULL;
2667
2668 if (!deepcopy(&copy->groupindex, memo) ||
2669 !deepcopy(&copy->indexgroup, memo) ||
2670 !deepcopy(&copy->pattern, memo)) {
2671 Py_DECREF(copy);
2672 return NULL;
2673 }
2674
2675#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002676 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2677 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002678#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002679}
2680
Raymond Hettinger94478742004-09-24 04:31:19 +00002681PyDoc_STRVAR(pattern_match_doc,
2682"match(string[, pos[, endpos]]) --> match object or None.\n\
2683 Matches zero or more characters at the beginning of the string");
2684
2685PyDoc_STRVAR(pattern_search_doc,
2686"search(string[, pos[, endpos]]) --> match object or None.\n\
2687 Scan through string looking for a match, and return a corresponding\n\
2688 MatchObject instance. Return None if no position in the string matches.");
2689
2690PyDoc_STRVAR(pattern_split_doc,
2691"split(string[, maxsplit = 0]) --> list.\n\
2692 Split string by the occurrences of pattern.");
2693
2694PyDoc_STRVAR(pattern_findall_doc,
2695"findall(string[, pos[, endpos]]) --> list.\n\
2696 Return a list of all non-overlapping matches of pattern in string.");
2697
2698PyDoc_STRVAR(pattern_finditer_doc,
2699"finditer(string[, pos[, endpos]]) --> iterator.\n\
2700 Return an iterator over all non-overlapping matches for the \n\
2701 RE pattern in string. For each match, the iterator returns a\n\
2702 match object.");
2703
2704PyDoc_STRVAR(pattern_sub_doc,
2705"sub(repl, string[, count = 0]) --> newstring\n\
2706 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002707 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002708
2709PyDoc_STRVAR(pattern_subn_doc,
2710"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2711 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2712 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002713 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002714
2715PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2716
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002717static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002718 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002719 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002720 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002721 pattern_search_doc},
2722 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2723 pattern_sub_doc},
2724 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2725 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002726 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002727 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002728 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002729 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002730#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002731 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2732 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002733#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002734 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002735 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2736 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002737 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002738};
2739
Tim Peters3d563502006-01-21 02:47:53 +00002740static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002741pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002742{
2743 PyObject* res;
2744
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002745 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002746
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002747 if (res)
2748 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002749
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002750 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002751
2752 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002753 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002754 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002755 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002756 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002757
2758 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002759 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002760
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002761 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002762 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002763
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002764 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002765 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002766 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002767 }
2768
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002769 PyErr_SetString(PyExc_AttributeError, name);
2770 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002771}
2772
2773statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002774 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002775 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002776 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002777 (destructor)pattern_dealloc, /*tp_dealloc*/
2778 0, /*tp_print*/
Raymond Hettinger027bb632004-05-31 03:09:25 +00002779 (getattrfunc)pattern_getattr, /*tp_getattr*/
2780 0, /* tp_setattr */
2781 0, /* tp_compare */
2782 0, /* tp_repr */
2783 0, /* tp_as_number */
2784 0, /* tp_as_sequence */
2785 0, /* tp_as_mapping */
2786 0, /* tp_hash */
2787 0, /* tp_call */
2788 0, /* tp_str */
2789 0, /* tp_getattro */
2790 0, /* tp_setattro */
2791 0, /* tp_as_buffer */
2792 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002793 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002794 0, /* tp_traverse */
2795 0, /* tp_clear */
2796 0, /* tp_richcompare */
2797 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002798};
2799
2800/* -------------------------------------------------------------------- */
2801/* match methods */
2802
2803static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002804match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002805{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002806 Py_XDECREF(self->regs);
2807 Py_XDECREF(self->string);
2808 Py_DECREF(self->pattern);
2809 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002810}
2811
2812static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002813match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002814{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002815 if (index < 0 || index >= self->groups) {
2816 /* raise IndexError if we were given a bad group number */
2817 PyErr_SetString(
2818 PyExc_IndexError,
2819 "no such group"
2820 );
2821 return NULL;
2822 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002823
Fredrik Lundh6f013982000-07-03 18:44:21 +00002824 index *= 2;
2825
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002826 if (self->string == Py_None || self->mark[index] < 0) {
2827 /* return default value if the string or group is undefined */
2828 Py_INCREF(def);
2829 return def;
2830 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002831
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002832 return PySequence_GetSlice(
2833 self->string, self->mark[index], self->mark[index+1]
2834 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002835}
2836
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002837static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002838match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002839{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002840 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002841
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002842 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002843 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002844
Fredrik Lundh6f013982000-07-03 18:44:21 +00002845 i = -1;
2846
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002847 if (self->pattern->groupindex) {
2848 index = PyObject_GetItem(self->pattern->groupindex, index);
2849 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002850 if (PyInt_Check(index))
2851 i = (int) PyInt_AS_LONG(index);
2852 Py_DECREF(index);
2853 } else
2854 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002855 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002856
2857 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002858}
2859
2860static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002861match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002862{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002863 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002864}
2865
2866static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002867match_expand(MatchObject* self, PyObject* args)
2868{
2869 PyObject* template;
2870 if (!PyArg_ParseTuple(args, "O:expand", &template))
2871 return NULL;
2872
2873 /* delegate to Python code */
2874 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002875 SRE_MODULE, "_expand",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002876 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002877 );
2878}
2879
2880static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002881match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002882{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002883 PyObject* result;
2884 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002885
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002886 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002887
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002888 switch (size) {
2889 case 0:
2890 result = match_getslice(self, Py_False, Py_None);
2891 break;
2892 case 1:
2893 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2894 break;
2895 default:
2896 /* fetch multiple items */
2897 result = PyTuple_New(size);
2898 if (!result)
2899 return NULL;
2900 for (i = 0; i < size; i++) {
2901 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002902 self, PyTuple_GET_ITEM(args, i), Py_None
2903 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002904 if (!item) {
2905 Py_DECREF(result);
2906 return NULL;
2907 }
2908 PyTuple_SET_ITEM(result, i, item);
2909 }
2910 break;
2911 }
2912 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002913}
2914
2915static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002916match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002917{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002918 PyObject* result;
2919 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002920
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002921 PyObject* def = Py_None;
Jeremy Hyltonaf68c872005-12-10 18:50:16 +00002922 static const char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002923 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002924 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002925
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002926 result = PyTuple_New(self->groups-1);
2927 if (!result)
2928 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002929
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002930 for (index = 1; index < self->groups; index++) {
2931 PyObject* item;
2932 item = match_getslice_by_index(self, index, def);
2933 if (!item) {
2934 Py_DECREF(result);
2935 return NULL;
2936 }
2937 PyTuple_SET_ITEM(result, index-1, item);
2938 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002939
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002940 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002941}
2942
2943static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002944match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002945{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002946 PyObject* result;
2947 PyObject* keys;
2948 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002949
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002950 PyObject* def = Py_None;
Jeremy Hyltonaf68c872005-12-10 18:50:16 +00002951 static const char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002952 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002953 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002954
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002955 result = PyDict_New();
2956 if (!result || !self->pattern->groupindex)
2957 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002958
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002959 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002960 if (!keys)
2961 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002962
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002963 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002964 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002965 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002966 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002967 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002968 if (!key)
2969 goto failed;
2970 value = match_getslice(self, key, def);
2971 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002972 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002973 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002974 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002975 status = PyDict_SetItem(result, key, value);
2976 Py_DECREF(value);
2977 if (status < 0)
2978 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002979 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002980
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002981 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002982
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002983 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002984
2985failed:
2986 Py_DECREF(keys);
2987 Py_DECREF(result);
2988 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002989}
2990
2991static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002992match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002993{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002994 int index;
2995
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002996 PyObject* index_ = Py_False; /* zero */
2997 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2998 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002999
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003000 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003001
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003002 if (index < 0 || index >= self->groups) {
3003 PyErr_SetString(
3004 PyExc_IndexError,
3005 "no such group"
3006 );
3007 return NULL;
3008 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003009
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003010 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003011 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003012}
3013
3014static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003015match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003016{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003017 int index;
3018
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003019 PyObject* index_ = Py_False; /* zero */
3020 if (!PyArg_ParseTuple(args, "|O:end", &index_))
3021 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003022
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003023 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003024
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003025 if (index < 0 || index >= self->groups) {
3026 PyErr_SetString(
3027 PyExc_IndexError,
3028 "no such group"
3029 );
3030 return NULL;
3031 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003032
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003033 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003034 return Py_BuildValue("i", self->mark[index*2+1]);
3035}
3036
3037LOCAL(PyObject*)
3038_pair(int i1, int i2)
3039{
3040 PyObject* pair;
3041 PyObject* item;
3042
3043 pair = PyTuple_New(2);
3044 if (!pair)
3045 return NULL;
3046
3047 item = PyInt_FromLong(i1);
3048 if (!item)
3049 goto error;
3050 PyTuple_SET_ITEM(pair, 0, item);
3051
3052 item = PyInt_FromLong(i2);
3053 if (!item)
3054 goto error;
3055 PyTuple_SET_ITEM(pair, 1, item);
3056
3057 return pair;
3058
3059 error:
3060 Py_DECREF(pair);
3061 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003062}
3063
3064static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003065match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003066{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003067 int index;
3068
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003069 PyObject* index_ = Py_False; /* zero */
3070 if (!PyArg_ParseTuple(args, "|O:span", &index_))
3071 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003072
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003073 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003074
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003075 if (index < 0 || index >= self->groups) {
3076 PyErr_SetString(
3077 PyExc_IndexError,
3078 "no such group"
3079 );
3080 return NULL;
3081 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003082
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003083 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003084 return _pair(self->mark[index*2], self->mark[index*2+1]);
3085}
3086
3087static PyObject*
3088match_regs(MatchObject* self)
3089{
3090 PyObject* regs;
3091 PyObject* item;
3092 int index;
3093
3094 regs = PyTuple_New(self->groups);
3095 if (!regs)
3096 return NULL;
3097
3098 for (index = 0; index < self->groups; index++) {
3099 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3100 if (!item) {
3101 Py_DECREF(regs);
3102 return NULL;
3103 }
3104 PyTuple_SET_ITEM(regs, index, item);
3105 }
3106
3107 Py_INCREF(regs);
3108 self->regs = regs;
3109
3110 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003111}
3112
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003113static PyObject*
3114match_copy(MatchObject* self, PyObject* args)
3115{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003116#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003117 MatchObject* copy;
3118 int slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003119
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003120 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
3121 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003122
3123 slots = 2 * (self->pattern->groups+1);
3124
3125 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3126 if (!copy)
3127 return NULL;
3128
3129 /* this value a constant, but any compiler should be able to
3130 figure that out all by itself */
3131 offset = offsetof(MatchObject, string);
3132
3133 Py_XINCREF(self->pattern);
3134 Py_XINCREF(self->string);
3135 Py_XINCREF(self->regs);
3136
3137 memcpy((char*) copy + offset, (char*) self + offset,
3138 sizeof(MatchObject) + slots * sizeof(int) - offset);
3139
3140 return (PyObject*) copy;
3141#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003142 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003143 return NULL;
3144#endif
3145}
3146
3147static PyObject*
3148match_deepcopy(MatchObject* self, PyObject* args)
3149{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003150#ifdef USE_BUILTIN_COPY
3151 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003152
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003153 PyObject* memo;
3154 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
3155 return NULL;
3156
3157 copy = (MatchObject*) match_copy(self, Py_None);
3158 if (!copy)
3159 return NULL;
3160
3161 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3162 !deepcopy(&copy->string, memo) ||
3163 !deepcopy(&copy->regs, memo)) {
3164 Py_DECREF(copy);
3165 return NULL;
3166 }
3167
3168#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003169 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3170 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003171#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003172}
3173
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003174static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003175 {"group", (PyCFunction) match_group, METH_VARARGS},
3176 {"start", (PyCFunction) match_start, METH_VARARGS},
3177 {"end", (PyCFunction) match_end, METH_VARARGS},
3178 {"span", (PyCFunction) match_span, METH_VARARGS},
3179 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3180 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3181 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003182 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
3183 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003184 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003185};
3186
Tim Peters3d563502006-01-21 02:47:53 +00003187static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003188match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003189{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003190 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003191
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003192 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3193 if (res)
3194 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003195
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003196 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003197
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003198 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003199 if (self->lastindex >= 0)
3200 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003201 Py_INCREF(Py_None);
3202 return Py_None;
3203 }
3204
3205 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003206 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003207 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003208 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003209 );
3210 if (result)
3211 return result;
3212 PyErr_Clear();
3213 }
3214 Py_INCREF(Py_None);
3215 return Py_None;
3216 }
3217
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003218 if (!strcmp(name, "string")) {
3219 if (self->string) {
3220 Py_INCREF(self->string);
3221 return self->string;
3222 } else {
3223 Py_INCREF(Py_None);
3224 return Py_None;
3225 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003226 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003227
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003228 if (!strcmp(name, "regs")) {
3229 if (self->regs) {
3230 Py_INCREF(self->regs);
3231 return self->regs;
3232 } else
3233 return match_regs(self);
3234 }
3235
3236 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003237 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003238 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003239 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003240
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003241 if (!strcmp(name, "pos"))
3242 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003243
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003244 if (!strcmp(name, "endpos"))
3245 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003246
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003247 PyErr_SetString(PyExc_AttributeError, name);
3248 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003249}
3250
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3253
3254statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003255 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003256 0, "_" SRE_MODULE ".SRE_Match",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003257 sizeof(MatchObject), sizeof(int),
3258 (destructor)match_dealloc, /*tp_dealloc*/
3259 0, /*tp_print*/
3260 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003261};
3262
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003263/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003264/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003265
3266static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003267scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003268{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003269 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003270 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003271 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003272}
3273
3274static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003275scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003276{
3277 SRE_STATE* state = &self->state;
3278 PyObject* match;
3279 int status;
3280
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003281 state_reset(state);
3282
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003283 state->ptr = state->start;
3284
3285 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003286 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003287 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003288#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003289 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003290#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003291 }
3292
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003293 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003294 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003295
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003296 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003297 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003298 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003299 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003300
3301 return match;
3302}
3303
3304
3305static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003306scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003307{
3308 SRE_STATE* state = &self->state;
3309 PyObject* match;
3310 int status;
3311
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003312 state_reset(state);
3313
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003314 state->ptr = state->start;
3315
3316 if (state->charsize == 1) {
3317 status = sre_search(state, PatternObject_GetCode(self->pattern));
3318 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003319#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003320 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003321#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003322 }
3323
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003324 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003325 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003326
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003327 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003328 state->start = (void*) ((char*) state->ptr + state->charsize);
3329 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003330 state->start = state->ptr;
3331
3332 return match;
3333}
3334
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003335static PyMethodDef scanner_methods[] = {
Neal Norwitzbb2769f2002-03-31 15:46:00 +00003336 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3337 /* METH_OLDARGS is not in Python 1.5.2 */
3338 {"match", (PyCFunction) scanner_match, 0},
3339 {"search", (PyCFunction) scanner_search, 0},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003340 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003341};
3342
Tim Peters3d563502006-01-21 02:47:53 +00003343static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003344scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003345{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003346 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003348 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3349 if (res)
3350 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003351
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003352 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003353
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003354 /* attributes */
3355 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003356 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003357 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003358 }
3359
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003360 PyErr_SetString(PyExc_AttributeError, name);
3361 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003362}
3363
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003364statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003365 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003366 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003367 sizeof(ScannerObject), 0,
3368 (destructor)scanner_dealloc, /*tp_dealloc*/
3369 0, /*tp_print*/
3370 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003371};
3372
Guido van Rossumb700df92000-03-31 14:59:30 +00003373static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003374 {"compile", _compile, METH_VARARGS},
3375 {"getcodesize", sre_codesize, METH_VARARGS},
3376 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003377 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003378};
3379
Tim Peters3d563502006-01-21 02:47:53 +00003380#if PY_VERSION_HEX < 0x02030000
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003381DL_EXPORT(void) init_sre(void)
3382#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003383PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003384#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003385{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003386 PyObject* m;
3387 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003388 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003389
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003390 /* Patch object types */
3391 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003392 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00003393
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003394 m = Py_InitModule("_" SRE_MODULE, _functions);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003395 if (m == NULL)
3396 return;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003397 d = PyModule_GetDict(m);
3398
Fredrik Lundh21009b92001-09-18 18:47:09 +00003399 x = PyInt_FromLong(SRE_MAGIC);
3400 if (x) {
3401 PyDict_SetItemString(d, "MAGIC", x);
3402 Py_DECREF(x);
3403 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003404
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003405 x = PyInt_FromLong(sizeof(SRE_CODE));
3406 if (x) {
3407 PyDict_SetItemString(d, "CODESIZE", x);
3408 Py_DECREF(x);
3409 }
3410
Fredrik Lundh21009b92001-09-18 18:47:09 +00003411 x = PyString_FromString(copyright);
3412 if (x) {
3413 PyDict_SetItemString(d, "copyright", x);
3414 Py_DECREF(x);
3415 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003416}
3417
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003418#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003419
3420/* vim:ts=4:sw=4:et
3421*/