blob: ee073426b5ea6456ef1e276c4d1726acc66c0493 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
42#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000043#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000044
45#include "sre.h"
46
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000047#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000048
Fredrik Lundh436c3d582000-06-29 08:58:44 +000049/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000050#if !defined(SRE_MODULE)
51#define SRE_MODULE "sre"
52#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000053
Guido van Rossumb700df92000-03-31 14:59:30 +000054/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000055#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000056
Fredrik Lundh971e78b2001-10-20 17:48:46 +000057#if PY_VERSION_HEX >= 0x01060000
58#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000059/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000060#define HAVE_UNICODE
61#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000062#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000071#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000072
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000073/* enables copy/deepcopy handling (work in progress) */
74#undef USE_BUILTIN_COPY
75
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000076#if PY_VERSION_HEX < 0x01060000
77#define PyObject_DEL(op) PyMem_DEL((op))
78#endif
79
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000080/* -------------------------------------------------------------------- */
81
Fredrik Lundh80946112000-06-29 18:03:25 +000082#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000083#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000084#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000085/* fastest possible local call under MSVC */
86#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000087#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000088#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089#else
90#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000091#endif
92
93/* error codes */
94#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000095#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000096#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000097#define SRE_ERROR_MEMORY -9 /* out of memory */
98
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000099#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000100#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#else
102#define TRACE(v)
103#endif
104
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000105/* -------------------------------------------------------------------- */
106/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000107
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000108/* default character predicates (run sre_chars.py to regenerate tables) */
109
110#define SRE_DIGIT_MASK 1
111#define SRE_SPACE_MASK 2
112#define SRE_LINEBREAK_MASK 4
113#define SRE_ALNUM_MASK 8
114#define SRE_WORD_MASK 16
115
Fredrik Lundh21009b92001-09-18 18:47:09 +0000116/* FIXME: this assumes ASCII. create tables in init_sre() instead */
117
/* Character class bits for the 128 ASCII code points, 16 entries per row.
   Every entry is an OR of the SRE_*_MASK values defined above:
   2 = space, 6 = space|linebreak, 16 = word, 24 = alnum|word,
   25 = digit|alnum|word. */
static char sre_char_info[128] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x20 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /* 0x60 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};
125
/* ASCII lower-case mapping, 16 entries per row: 'A'..'Z' (65..90) map to
   'a'..'z' (97..122), every other code point maps to itself. */
static char sre_char_lower[128] = {
    /* 0x00 */   0,   1,   2,   3,   4,   5,   6,   7,
                 8,   9,  10,  11,  12,  13,  14,  15,
    /* 0x10 */  16,  17,  18,  19,  20,  21,  22,  23,
                24,  25,  26,  27,  28,  29,  30,  31,
    /* 0x20 */  32,  33,  34,  35,  36,  37,  38,  39,
                40,  41,  42,  43,  44,  45,  46,  47,
    /* 0x30 */  48,  49,  50,  51,  52,  53,  54,  55,
                56,  57,  58,  59,  60,  61,  62,  63,
    /* 0x40 */  64,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122,  91,  92,  93,  94,  95,
    /* 0x60 */  96,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122, 123, 124, 125, 126, 127
};
135
/* Test one class bit of `ch` in sre_char_info.  Code points of 128 and
   above are outside the table and never match.  The result is the raw
   mask bit (or 0), not a normalised 0/1.  `ch` is evaluated twice when
   it is below 128. */
#define SRE_CHAR_INFO_BIT(ch, mask)\
    ((ch) < 128 ? (sre_char_info[(ch)] & (mask)) : 0)

#define SRE_IS_DIGIT(ch)     SRE_CHAR_INFO_BIT((ch), SRE_DIGIT_MASK)
#define SRE_IS_SPACE(ch)     SRE_CHAR_INFO_BIT((ch), SRE_SPACE_MASK)
#define SRE_IS_LINEBREAK(ch) SRE_CHAR_INFO_BIT((ch), SRE_LINEBREAK_MASK)
#define SRE_IS_ALNUM(ch)     SRE_CHAR_INFO_BIT((ch), SRE_ALNUM_MASK)
#define SRE_IS_WORD(ch)      SRE_CHAR_INFO_BIT((ch), SRE_WORD_MASK)
Guido van Rossumb700df92000-03-31 14:59:30 +0000146
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000147static unsigned int sre_lower(unsigned int ch)
148{
149 return ((ch) < 128 ? sre_char_lower[ch] : ch);
150}
151
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000152/* locale-specific character predicates */
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000153
/* Apply a <ctype.h> predicate to `ch`; code points of 256 and above
   never match.  `ch` is evaluated twice when it is below 256. */
#define SRE_LOC_CTYPE(pred, ch) ((ch) < 256 ? pred((ch)) : 0)

#define SRE_LOC_IS_DIGIT(ch) SRE_LOC_CTYPE(isdigit, (ch))
#define SRE_LOC_IS_SPACE(ch) SRE_LOC_CTYPE(isspace, (ch))
/* only '\n' counts as a line break here, whatever the locale */
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) SRE_LOC_CTYPE(isalnum, (ch))
/* a word character is anything alphanumeric, or an underscore */
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
159
/* Lower-case `ch` with the C library's tolower(); code points of 256
   and above are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256)
        return tolower((ch));
    return ch;
}
164
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000165/* unicode-specific character predicates */
166
167#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000168
/* Every predicate below first converts `ch` to Py_UNICODE and then
   defers to the matching Py_UNICODE_* macro. */
#define SRE_UNI_CHAR(ch) ((Py_UNICODE)(ch))

#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT(SRE_UNI_CHAR(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(SRE_UNI_CHAR(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(SRE_UNI_CHAR(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(SRE_UNI_CHAR(ch))
/* a word character is anything alphanumeric, or an underscore */
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000174
175static unsigned int sre_lower_unicode(unsigned int ch)
176{
177 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
178}
179
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000180#endif
181
Guido van Rossumb700df92000-03-31 14:59:30 +0000182LOCAL(int)
183sre_category(SRE_CODE category, unsigned int ch)
184{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000185 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000186
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000187 case SRE_CATEGORY_DIGIT:
188 return SRE_IS_DIGIT(ch);
189 case SRE_CATEGORY_NOT_DIGIT:
190 return !SRE_IS_DIGIT(ch);
191 case SRE_CATEGORY_SPACE:
192 return SRE_IS_SPACE(ch);
193 case SRE_CATEGORY_NOT_SPACE:
194 return !SRE_IS_SPACE(ch);
195 case SRE_CATEGORY_WORD:
196 return SRE_IS_WORD(ch);
197 case SRE_CATEGORY_NOT_WORD:
198 return !SRE_IS_WORD(ch);
199 case SRE_CATEGORY_LINEBREAK:
200 return SRE_IS_LINEBREAK(ch);
201 case SRE_CATEGORY_NOT_LINEBREAK:
202 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000203
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000204 case SRE_CATEGORY_LOC_WORD:
205 return SRE_LOC_IS_WORD(ch);
206 case SRE_CATEGORY_LOC_NOT_WORD:
207 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000208
209#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000210 case SRE_CATEGORY_UNI_DIGIT:
211 return SRE_UNI_IS_DIGIT(ch);
212 case SRE_CATEGORY_UNI_NOT_DIGIT:
213 return !SRE_UNI_IS_DIGIT(ch);
214 case SRE_CATEGORY_UNI_SPACE:
215 return SRE_UNI_IS_SPACE(ch);
216 case SRE_CATEGORY_UNI_NOT_SPACE:
217 return !SRE_UNI_IS_SPACE(ch);
218 case SRE_CATEGORY_UNI_WORD:
219 return SRE_UNI_IS_WORD(ch);
220 case SRE_CATEGORY_UNI_NOT_WORD:
221 return !SRE_UNI_IS_WORD(ch);
222 case SRE_CATEGORY_UNI_LINEBREAK:
223 return SRE_UNI_IS_LINEBREAK(ch);
224 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
225 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000226#else
227 case SRE_CATEGORY_UNI_DIGIT:
228 return SRE_IS_DIGIT(ch);
229 case SRE_CATEGORY_UNI_NOT_DIGIT:
230 return !SRE_IS_DIGIT(ch);
231 case SRE_CATEGORY_UNI_SPACE:
232 return SRE_IS_SPACE(ch);
233 case SRE_CATEGORY_UNI_NOT_SPACE:
234 return !SRE_IS_SPACE(ch);
235 case SRE_CATEGORY_UNI_WORD:
236 return SRE_LOC_IS_WORD(ch);
237 case SRE_CATEGORY_UNI_NOT_WORD:
238 return !SRE_LOC_IS_WORD(ch);
239 case SRE_CATEGORY_UNI_LINEBREAK:
240 return SRE_IS_LINEBREAK(ch);
241 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
242 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000243#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000244 }
245 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000246}
247
248/* helpers */
249
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000250static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000251data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000252{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000253 if (state->data_stack) {
254 free(state->data_stack);
255 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000256 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000257 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000258}
259
260static int
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000261data_stack_grow(SRE_STATE* state, int size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000262{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000263 int minsize, cursize;
264 minsize = state->data_stack_base+size;
265 cursize = state->data_stack_size;
266 if (cursize < minsize) {
267 void* stack;
268 cursize = minsize+minsize/4+1024;
269 TRACE(("allocate/grow stack %d\n", cursize));
270 stack = realloc(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000271 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000272 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000273 return SRE_ERROR_MEMORY;
274 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000275 state->data_stack = stack;
276 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000277 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000278 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000279}
280
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000281/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000282
283#define SRE_CHAR unsigned char
284#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000285#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000286#define SRE_CHARSET sre_charset
287#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000288#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000289#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000290#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000291#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000292
293#if defined(HAVE_UNICODE)
294
Guido van Rossumb700df92000-03-31 14:59:30 +0000295#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000297#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000298
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000299#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000300#undef SRE_SEARCH
301#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000302#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000303#undef SRE_INFO
304#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000305#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000306#undef SRE_AT
307#undef SRE_CHAR
308
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000309/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000310
311#define SRE_CHAR Py_UNICODE
312#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000313#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000314#define SRE_CHARSET sre_ucharset
315#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000316#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000317#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000318#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000319#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000320#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000321
322#endif /* SRE_RECURSIVE */
323
324/* -------------------------------------------------------------------- */
325/* String matching engine */
326
327/* the following section is compiled twice, with different character
328 settings */
329
330LOCAL(int)
331SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
332{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000333 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000334
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000335 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000336
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000337 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000338
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000339 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000340 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000341 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000342
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000343 case SRE_AT_BEGINNING_LINE:
344 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000345 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000346
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000347 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000348 return (((void*) (ptr+1) == state->end &&
349 SRE_IS_LINEBREAK((int) ptr[0])) ||
350 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000351
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000352 case SRE_AT_END_LINE:
353 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000354 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000355
Fredrik Lundh770617b2001-01-14 15:06:11 +0000356 case SRE_AT_END_STRING:
357 return ((void*) ptr == state->end);
358
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000359 case SRE_AT_BOUNDARY:
360 if (state->beginning == state->end)
361 return 0;
362 that = ((void*) ptr > state->beginning) ?
363 SRE_IS_WORD((int) ptr[-1]) : 0;
364 this = ((void*) ptr < state->end) ?
365 SRE_IS_WORD((int) ptr[0]) : 0;
366 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000367
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000368 case SRE_AT_NON_BOUNDARY:
369 if (state->beginning == state->end)
370 return 0;
371 that = ((void*) ptr > state->beginning) ?
372 SRE_IS_WORD((int) ptr[-1]) : 0;
373 this = ((void*) ptr < state->end) ?
374 SRE_IS_WORD((int) ptr[0]) : 0;
375 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000376
377 case SRE_AT_LOC_BOUNDARY:
378 if (state->beginning == state->end)
379 return 0;
380 that = ((void*) ptr > state->beginning) ?
381 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
382 this = ((void*) ptr < state->end) ?
383 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
384 return this != that;
385
386 case SRE_AT_LOC_NON_BOUNDARY:
387 if (state->beginning == state->end)
388 return 0;
389 that = ((void*) ptr > state->beginning) ?
390 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
391 this = ((void*) ptr < state->end) ?
392 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
393 return this == that;
394
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000395#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000396 case SRE_AT_UNI_BOUNDARY:
397 if (state->beginning == state->end)
398 return 0;
399 that = ((void*) ptr > state->beginning) ?
400 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
401 this = ((void*) ptr < state->end) ?
402 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
403 return this != that;
404
405 case SRE_AT_UNI_NON_BOUNDARY:
406 if (state->beginning == state->end)
407 return 0;
408 that = ((void*) ptr > state->beginning) ?
409 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
410 this = ((void*) ptr < state->end) ?
411 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
412 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000413#endif
414
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000415 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000416
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000417 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000418}
419
420LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000421SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000422{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000423 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000424
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000425 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000426
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000427 for (;;) {
428 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000429
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000430 case SRE_OP_FAILURE:
431 return !ok;
432
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000433 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000434 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000435 if (ch == set[0])
436 return ok;
437 set++;
438 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000439
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000440 case SRE_OP_CATEGORY:
441 /* <CATEGORY> <code> */
442 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000443 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000444 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000445 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000446
Fredrik Lundh3562f112000-07-02 12:00:07 +0000447 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000448 if (sizeof(SRE_CODE) == 2) {
449 /* <CHARSET> <bitmap> (16 bits per code word) */
450 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
451 return ok;
452 set += 16;
453 }
454 else {
455 /* <CHARSET> <bitmap> (32 bits per code word) */
456 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
457 return ok;
458 set += 8;
459 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000460 break;
461
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000462 case SRE_OP_RANGE:
463 /* <RANGE> <lower> <upper> */
464 if (set[0] <= ch && ch <= set[1])
465 return ok;
466 set += 2;
467 break;
468
469 case SRE_OP_NEGATE:
470 ok = !ok;
471 break;
472
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000473 case SRE_OP_BIGCHARSET:
474 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
475 {
476 int count, block;
477 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000478
479 if (sizeof(SRE_CODE) == 2) {
480 block = ((unsigned char*)set)[ch >> 8];
481 set += 128;
482 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
483 return ok;
484 set += count*16;
485 }
486 else {
487 if (ch < 65536)
488 block = ((unsigned char*)set)[ch >> 8];
489 else
490 block = -1;
491 set += 64;
492 if (block >=0 &&
493 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
494 return ok;
495 set += count*8;
496 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000497 break;
498 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000499
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000500 default:
501 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000502 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000503 return 0;
504 }
505 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000506}
507
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000508LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000509
510LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000511SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000512{
513 SRE_CODE chr;
514 SRE_CHAR* ptr = state->ptr;
515 SRE_CHAR* end = state->end;
516 int i;
517
518 /* adjust end */
519 if (maxcount < end - ptr && maxcount != 65535)
520 end = ptr + maxcount;
521
522 switch (pattern[0]) {
523
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000524 case SRE_OP_IN:
525 /* repeated set */
526 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
527 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
528 ptr++;
529 break;
530
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000531 case SRE_OP_ANY:
532 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000533 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000534 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
535 ptr++;
536 break;
537
538 case SRE_OP_ANY_ALL:
539 /* repeated dot wildcare. skip to the end of the target
540 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000541 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000542 ptr = end;
543 break;
544
545 case SRE_OP_LITERAL:
546 /* repeated literal */
547 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000548 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000549 while (ptr < end && (SRE_CODE) *ptr == chr)
550 ptr++;
551 break;
552
553 case SRE_OP_LITERAL_IGNORE:
554 /* repeated literal */
555 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000556 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000557 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
558 ptr++;
559 break;
560
561 case SRE_OP_NOT_LITERAL:
562 /* repeated non-literal */
563 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000564 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000565 while (ptr < end && (SRE_CODE) *ptr != chr)
566 ptr++;
567 break;
568
569 case SRE_OP_NOT_LITERAL_IGNORE:
570 /* repeated non-literal */
571 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000572 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000573 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
574 ptr++;
575 break;
576
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000577 default:
578 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000579 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000580 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000581 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000582 if (i < 0)
583 return i;
584 if (!i)
585 break;
586 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000587 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
588 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000589 return (SRE_CHAR*) state->ptr - ptr;
590 }
591
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000592 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000593 return ptr - (SRE_CHAR*) state->ptr;
594}
595
Fredrik Lundh33accc12000-08-27 20:59:47 +0000596#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000597LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000598SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
599{
600 /* check if an SRE_OP_INFO block matches at the current position.
601 returns the number of SRE_CODE objects to skip if successful, 0
602 if no match */
603
604 SRE_CHAR* end = state->end;
605 SRE_CHAR* ptr = state->ptr;
606 int i;
607
608 /* check minimal length */
609 if (pattern[3] && (end - ptr) < pattern[3])
610 return 0;
611
612 /* check known prefix */
613 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
614 /* <length> <skip> <prefix data> <overlap data> */
615 for (i = 0; i < pattern[5]; i++)
616 if ((SRE_CODE) ptr[i] != pattern[7 + i])
617 return 0;
618 return pattern[0] + 2 * pattern[6];
619 }
620 return pattern[0];
621}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000622#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000623
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000624/* The macros below should be used to protect recursive SRE_MATCH()
625 * calls that *failed* and do *not* return immediately (IOW, those
626 * that will backtrack). Explaining:
627 *
628 * - Recursive SRE_MATCH() returned true: that's usually a success
629 * (besides atypical cases like ASSERT_NOT), therefore there's no
630 * reason to restore lastmark;
631 *
632 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
633 * is returning to the caller: If the current SRE_MATCH() is the
634 * top function of the recursion, returning false will be a matching
635 * failure, and it doesn't matter where lastmark is pointing to.
636 * If it's *not* the top function, it will be a recursive SRE_MATCH()
637 * failure by itself, and the calling SRE_MATCH() will have to deal
638 * with the failure by the same rules explained here (it will restore
639 * lastmark by itself if necessary);
640 *
641 * - Recursive SRE_MATCH() returned false, and will continue the
642 * outside 'for' loop: must be protected when breaking, since the next
643 * OP could potentially depend on lastmark;
644 *
645 * - Recursive SRE_MATCH() returned false, and will be called again
646 * inside a local for/while loop: must be protected between each
647 * loop iteration, since the recursive SRE_MATCH() could do anything,
648 * and could potentially depend on lastmark.
649 *
650 * For more information, check the discussion at SF patch #712900.
651 */
/* Both macros expect variables named `state` and `ctx` to be in scope
   at the point of use. */

/* remember state's lastmark/lastindex in the match context */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastindex = state->lastindex; \
        ctx->lastmark = state->lastmark; \
    } while (0)

/* put the remembered lastmark/lastindex back into state */
#define LASTMARK_RESTORE() \
    do { \
        state->lastindex = ctx->lastindex; \
        state->lastmark = ctx->lastmark; \
    } while (0)
662
/* Exit helpers for the matching function.  RETURN_FAILURE and
   RETURN_SUCCESS need a variable `ret` and a label `exit` in the
   enclosing function; RETURN_ERROR returns from it directly.

   The argument `i` is parenthesized wherever it is compared, so that
   an expression such as `a & 1` is tested as a whole.  It is still
   evaluated more than once, so it must not have side effects. */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

/* leave with `i` if it is a (negative) error code */
#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
/* leave on error, or with success if `i` is positive */
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
/* leave on error, or with failure if `i` is zero */
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
673
674#define SFY(x) #x
675
/* Reserve sizeof(type) bytes on top of the state's data stack and make
   `ptr` point at them (the memory is not initialized).

   Expects `alloc_pos`, `ctx` and `ctx_pos` in the enclosing scope;
   alloc_pos receives the byte offset of the new item.  When the stack is
   too small, data_stack_grow() is called: a negative result is returned
   from the *enclosing function*, otherwise `ctx` is looked up again from
   ctx_pos (unless ctx_pos is -1).
   NOTE(review): the re-lookup is presumably needed because growing can
   move the stack -- confirm against data_stack_grow(). */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = state->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), alloc_pos, (int)sizeof(type))); \
    if (state->data_stack_size < alloc_pos+sizeof(type)) { \
        int grow_status = data_stack_grow(state, sizeof(type)); \
        if (grow_status < 0) return grow_status; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    ptr = (type*)(state->data_stack+alloc_pos); \
    state->data_stack_base += sizeof(type); \
} while (0)
690
/* Make `ptr` point at the item of the given type located at byte offset
   `pos` of the state's data stack.  Nothing is allocated or copied. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), (int)(pos))); \
    ptr = (type*)(state->data_stack+(pos)); \
} while (0)
696
/* Copy `size` bytes starting at `data` onto the top of the state's data
   stack and advance data_stack_base by `size`.

   Expects `ctx` and `ctx_pos` in the enclosing scope.  When the stack is
   too small, data_stack_grow() is called: a negative result is returned
   from the *enclosing function*, otherwise `ctx` is looked up again from
   ctx_pos (unless ctx_pos is -1).  `data` and `size` are evaluated more
   than once, so they must not have side effects. */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           (void*)(data), state->data_stack_base, (int)(size))); \
    if (state->data_stack_size < state->data_stack_base+(size)) { \
        int grow_status = data_stack_grow(state, (size)); \
        if (grow_status < 0) return grow_status; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy(state->data_stack+state->data_stack_base, (data), (size)); \
    state->data_stack_base += (size); \
} while (0)
710
/* Copy the topmost `size` bytes of the state's data stack into `data`.
   If `discard` is non-zero the bytes are also removed from the stack
   (data_stack_base is lowered by `size`); otherwise they stay in place.
   `size` is evaluated more than once, so it must not have side effects. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           (void*)(data), (int)(state->data_stack_base-(size)), \
           (int)(size))); \
    memcpy((data), state->data_stack+state->data_stack_base-(size), (size)); \
    if (discard) \
        state->data_stack_base -= (size); \
} while (0)
719
/* Drop the topmost `size` bytes of the state's data stack without
   copying them anywhere: only data_stack_base is lowered. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           (int)(state->data_stack_base-(size)), (int)(size))); \
    state->data_stack_base -= (size); \
} while(0)
726
/* Shorthands for the DATA_STACK_* macros that operate on the local
   variable `state`.  For the push/pop forms `x` is a pointer to the
   object, and the number of bytes transferred is sizeof(*x). */
#define DATA_PUSH(x) \
    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
/* pop into *x and remove the bytes from the stack */
#define DATA_POP(x) \
    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
/* remove sizeof(*x) bytes from the stack; x itself is not written */
#define DATA_POP_DISCARD(x) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
/* reserve room for an object of type t and point p at it */
#define DATA_ALLOC(t,p) \
    DATA_STACK_ALLOC(state, t, p)
/* point p at the object of type t stored at stack offset pos */
#define DATA_LOOKUP_AT(t,p,pos) \
    DATA_STACK_LOOKUP_AT(state,t,p,pos)
737
/* Save / restore the first lastmark+1 entries of state->mark on the data
   stack.  All four macros do nothing unless lastmark > 0.  They expect
   `state` in scope; MARK_PUSH additionally overwrites the local `i`. */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
/* copy the saved marks back into state->mark and drop them from the stack */
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
/* copy the saved marks back into state->mark but leave them on the stack */
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
/* drop the saved marks from the stack without touching state->mark */
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
755
/* Resume-point identifiers.  DO_JUMP stores one of these in the `jump`
   field of the context it creates; when that context finishes, the
   dispatch after the `exit:` label of SRE_MATCH() uses the value to
   continue at the matching label.  JUMP_NONE is what the outermost
   context is given. */
#define JUMP_NONE            0
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
770
/* Start matching `nextpattern` in a fresh context without a C-level
   recursive call: a new SRE_MATCH_CONTEXT is allocated on the data
   stack, linked to the current one through last_ctx_pos, tagged with
   `jumpvalue`, made current, and control jumps to the `entrance` label.
   `jumplabel` is defined right after that jump, so that the code
   following the macro can be resumed later with a `goto jumplabel`.

   Expects `ctx`, `nextctx`, `ctx_pos` and `alloc_pos` in scope.  The
   expansion is a plain statement sequence (not wrapped in do/while), so
   it must be used as a full statement inside a braced block and be
   followed by `;`. */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = (jumpvalue); \
    nextctx->pattern = (nextpattern); \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */

782typedef struct {
783 int last_ctx_pos;
784 int jump;
785 SRE_CHAR* ptr;
786 SRE_CODE* pattern;
787 int count;
788 int lastmark;
789 int lastindex;
790 union {
791 SRE_CODE chr;
792 SRE_REPEAT* rep;
793 } u;
794} SRE_MATCH_CONTEXT;
795
796/* check if string matches the given pattern. returns <0 for
797 error, 0 for failure, and 1 for success */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000798LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000799SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000800{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000801 SRE_CHAR* end = state->end;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000802 int alloc_pos, ctx_pos = -1;
803 int i, ret = 0;
804 int jump;
Guido van Rossumb700df92000-03-31 14:59:30 +0000805
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000806 SRE_MATCH_CONTEXT* ctx;
807 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000808
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000809 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000810
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000811 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
812 ctx->last_ctx_pos = -1;
813 ctx->jump = JUMP_NONE;
814 ctx->pattern = pattern;
815 ctx_pos = alloc_pos;
816
817entrance:
818
819 ctx->ptr = state->ptr;
820
821 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000822 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000823 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000824 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000825 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000826 (end - ctx->ptr), ctx->pattern[3]));
827 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000828 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000830 }
831
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000832 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000833
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000834 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000835
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000836 case SRE_OP_MARK:
837 /* set mark */
838 /* <MARK> <gid> */
839 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
840 ctx->ptr, ctx->pattern[0]));
841 i = ctx->pattern[0];
842 if (i & 1)
843 state->lastindex = i/2 + 1;
844 if (i > state->lastmark) {
845 /* state->lastmark is the highest valid index in the
846 state->mark array. If it is increased by more than 1,
847 the intervening marks must be set to NULL to signal
848 that these marks have not been encountered. */
849 int j = state->lastmark + 1;
850 while (j < i)
851 state->mark[j++] = NULL;
852 state->lastmark = i;
853 }
854 state->mark[i] = ctx->ptr;
855 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000856 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000857
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000858 case SRE_OP_LITERAL:
859 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000860 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000861 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
862 ctx->ptr, *ctx->pattern));
863 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
864 RETURN_FAILURE;
865 ctx->pattern++;
866 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000867 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000868
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000869 case SRE_OP_NOT_LITERAL:
870 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000871 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000872 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
873 ctx->ptr, *ctx->pattern));
874 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
875 RETURN_FAILURE;
876 ctx->pattern++;
877 ctx->ptr++;
878 break;
879
880 case SRE_OP_SUCCESS:
881 /* end of pattern */
882 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
883 state->ptr = ctx->ptr;
884 RETURN_SUCCESS;
885
886 case SRE_OP_AT:
887 /* match at given position */
888 /* <AT> <code> */
889 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
890 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
891 RETURN_FAILURE;
892 ctx->pattern++;
893 break;
894
895 case SRE_OP_CATEGORY:
896 /* match at given category */
897 /* <CATEGORY> <code> */
898 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
899 ctx->ptr, *ctx->pattern));
900 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
901 RETURN_FAILURE;
902 ctx->pattern++;
903 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000904 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000905
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000906 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000907 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000908 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000909 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
910 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
911 RETURN_FAILURE;
912 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000913 break;
914
915 case SRE_OP_ANY_ALL:
916 /* match anything */
917 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000918 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
919 if (ctx->ptr >= end)
920 RETURN_FAILURE;
921 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000922 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000923
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000924 case SRE_OP_IN:
925 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000926 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000927 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
928 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
929 RETURN_FAILURE;
930 ctx->pattern += ctx->pattern[0];
931 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000932 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000933
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000934 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000935 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
936 ctx->pattern, ctx->ptr, ctx->pattern[0]));
937 if (ctx->ptr >= end ||
938 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
939 RETURN_FAILURE;
940 ctx->pattern++;
941 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000942 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000943
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000944 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000945 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
946 ctx->pattern, ctx->ptr, *ctx->pattern));
947 if (ctx->ptr >= end ||
948 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
949 RETURN_FAILURE;
950 ctx->pattern++;
951 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000952 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000953
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000954 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000955 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
956 if (ctx->ptr >= end
957 || !SRE_CHARSET(ctx->pattern+1,
958 (SRE_CODE)state->lower(*ctx->ptr)))
959 RETURN_FAILURE;
960 ctx->pattern += ctx->pattern[0];
961 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000962 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000963
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000964 case SRE_OP_JUMP:
965 case SRE_OP_INFO:
966 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000967 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000968 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
969 ctx->ptr, ctx->pattern[0]));
970 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000971 break;
972
973 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000974 /* alternation */
975 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000976 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000977 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000978 ctx->u.rep = state->repeat;
979 if (ctx->u.rep)
980 MARK_PUSH(ctx->lastmark);
981 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
982 if (ctx->pattern[1] == SRE_OP_LITERAL &&
983 (ctx->ptr >= end ||
984 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000985 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000986 if (ctx->pattern[1] == SRE_OP_IN &&
987 (ctx->ptr >= end ||
988 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000989 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000990 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000991 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000992 if (ret) {
993 if (ctx->u.rep)
994 MARK_POP_DISCARD(ctx->lastmark);
995 RETURN_ON_ERROR(ret);
996 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000997 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000998 if (ctx->u.rep)
999 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001000 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001001 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001002 if (ctx->u.rep)
1003 MARK_POP_DISCARD(ctx->lastmark);
1004 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001005
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001006 case SRE_OP_REPEAT_ONE:
1007 /* match repeated sequence (maximizing regexp) */
1008
1009 /* this operator only works if the repeated item is
1010 exactly one character wide, and we're not already
1011 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001012 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001013
1014 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1015
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001016 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1017 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 if (ctx->ptr + ctx->pattern[1] > end)
1020 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001021
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001022 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001023
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001024 ctx->count = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001025 RETURN_ON_ERROR(ctx->count);
Fredrik Lundhe1869832000-08-01 22:47:49 +00001026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001028
1029 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001030 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031 string. check if the rest of the pattern matches,
1032 and backtrack if not. */
1033
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 if (ctx->count < (int) ctx->pattern[1])
1035 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001036
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001037 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001038 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001039 state->ptr = ctx->ptr;
1040 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001041 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001043 LASTMARK_SAVE();
1044
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001045 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001046 /* tail starts with a literal. skip positions where
1047 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001048 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001049 for (;;) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001050 while (ctx->count >= (int) ctx->pattern[1] &&
1051 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1052 ctx->ptr--;
1053 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001054 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001055 if (ctx->count < (int) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001056 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001057 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001058 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1059 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001060 if (ret) {
1061 RETURN_ON_ERROR(ret);
1062 RETURN_SUCCESS;
1063 }
1064
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001065 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001066
1067 ctx->ptr--;
1068 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001069 }
1070
1071 } else {
1072 /* general case */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073 while (ctx->count >= (int) ctx->pattern[1]) {
1074 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001075 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1076 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001077 if (ret) {
1078 RETURN_ON_ERROR(ret);
1079 RETURN_SUCCESS;
1080 }
1081 ctx->ptr--;
1082 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001083 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001084 }
1085 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001086 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001087
Guido van Rossum41c99e72003-04-14 17:59:34 +00001088 case SRE_OP_MIN_REPEAT_ONE:
1089 /* match repeated sequence (minimizing regexp) */
1090
1091 /* this operator only works if the repeated item is
1092 exactly one character wide, and we're not already
1093 collecting backtracking points. for other cases,
1094 use the MIN_REPEAT operator */
1095
1096 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1097
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001098 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1099 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001100
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001101 if (ctx->ptr + ctx->pattern[1] > end)
1102 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001103
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001104 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001105
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001106 if (ctx->pattern[1] == 0)
1107 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001108 else {
1109 /* count using pattern min as the maximum */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 ctx->count = SRE_COUNT(state, ctx->pattern+3,
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001111 ctx->pattern[1]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001112 RETURN_ON_ERROR(ctx->count);
1113 if (ctx->count < (int) ctx->pattern[1])
1114 /* didn't match minimum number of times */
1115 RETURN_FAILURE;
1116 /* advance past minimum matches of repeat */
1117 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001118 }
1119
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001120 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001121 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001122 state->ptr = ctx->ptr;
1123 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001124
1125 } else {
1126 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001127 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001128 while ((int)ctx->pattern[2] == 65535
1129 || ctx->count <= (int)ctx->pattern[2]) {
1130 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001131 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1132 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001133 if (ret) {
1134 RETURN_ON_ERROR(ret);
1135 RETURN_SUCCESS;
1136 }
1137 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001138 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001139 RETURN_ON_ERROR(ret);
1140 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001141 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001142 assert(ret == 1);
1143 ctx->ptr++;
1144 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001145 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001146 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001147 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001149
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001150 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001151 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001152 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001153 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001154 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1155 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001156
1157 /* install new repeat context */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001158 ctx->u.rep = (SRE_REPEAT*) malloc(sizeof(*ctx->u.rep));
1159 ctx->u.rep->count = -1;
1160 ctx->u.rep->pattern = ctx->pattern;
1161 ctx->u.rep->prev = state->repeat;
1162 ctx->u.rep->last_ptr = NULL;
1163 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001165 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001166 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001167 state->repeat = ctx->u.rep->prev;
1168 free(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001169
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001170 if (ret) {
1171 RETURN_ON_ERROR(ret);
1172 RETURN_SUCCESS;
1173 }
1174 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001175
1176 case SRE_OP_MAX_UNTIL:
1177 /* maximizing repeat */
1178 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1179
1180 /* FIXME: we probably need to deal with zero-width
1181 matches in here... */
1182
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001183 ctx->u.rep = state->repeat;
1184 if (!ctx->u.rep)
1185 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001186
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001187 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001188
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001189 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001190
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001191 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1192 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001193
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001194 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001195 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001196 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1198 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001199 if (ret) {
1200 RETURN_ON_ERROR(ret);
1201 RETURN_SUCCESS;
1202 }
1203 ctx->u.rep->count = ctx->count-1;
1204 state->ptr = ctx->ptr;
1205 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001206 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001207
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001208 if ((ctx->count < ctx->u.rep->pattern[2] ||
1209 ctx->u.rep->pattern[2] == 65535) &&
1210 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001211 /* we may have enough matches, but if we can
1212 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001213 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001214 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001215 MARK_PUSH(ctx->lastmark);
1216 /* zero-width match protection */
1217 DATA_PUSH(&ctx->u.rep->last_ptr);
1218 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001219 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1220 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001221 DATA_POP(&ctx->u.rep->last_ptr);
1222 if (ret) {
1223 MARK_POP_DISCARD(ctx->lastmark);
1224 RETURN_ON_ERROR(ret);
1225 RETURN_SUCCESS;
1226 }
1227 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001228 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001229 ctx->u.rep->count = ctx->count-1;
1230 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001231 }
1232
1233 /* cannot match more repeated items here. make sure the
1234 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001235 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001236 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001237 RETURN_ON_SUCCESS(ret);
1238 state->repeat = ctx->u.rep;
1239 state->ptr = ctx->ptr;
1240 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001241
1242 case SRE_OP_MIN_UNTIL:
1243 /* minimizing repeat */
1244 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1245
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001246 ctx->u.rep = state->repeat;
1247 if (!ctx->u.rep)
1248 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001249
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001250 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001251
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001252 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001253
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001254 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1255 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001256
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001257 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001258 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001259 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001260 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1261 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001262 if (ret) {
1263 RETURN_ON_ERROR(ret);
1264 RETURN_SUCCESS;
1265 }
1266 ctx->u.rep->count = ctx->count-1;
1267 state->ptr = ctx->ptr;
1268 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001269 }
1270
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001271 LASTMARK_SAVE();
1272
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001273 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001274 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001275 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001276 if (ret) {
1277 RETURN_ON_ERROR(ret);
1278 RETURN_SUCCESS;
1279 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001280
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001281 state->repeat = ctx->u.rep;
1282 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001283
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001284 LASTMARK_RESTORE();
1285
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001286 if (ctx->count >= ctx->u.rep->pattern[2]
1287 && ctx->u.rep->pattern[2] != 65535)
1288 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001289
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001290 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001291 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1292 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001293 if (ret) {
1294 RETURN_ON_ERROR(ret);
1295 RETURN_SUCCESS;
1296 }
1297 ctx->u.rep->count = ctx->count-1;
1298 state->ptr = ctx->ptr;
1299 RETURN_FAILURE;
1300
1301 case SRE_OP_GROUPREF:
1302 /* match backreference */
1303 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1304 ctx->ptr, ctx->pattern[0]));
1305 i = ctx->pattern[0];
1306 {
1307 int groupref = i+i;
1308 if (groupref >= state->lastmark) {
1309 RETURN_FAILURE;
1310 } else {
1311 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1312 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1313 if (!p || !e || e < p)
1314 RETURN_FAILURE;
1315 while (p < e) {
1316 if (ctx->ptr >= end || *ctx->ptr != *p)
1317 RETURN_FAILURE;
1318 p++; ctx->ptr++;
1319 }
1320 }
1321 }
1322 ctx->pattern++;
1323 break;
1324
1325 case SRE_OP_GROUPREF_IGNORE:
1326 /* match backreference */
1327 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1328 ctx->ptr, ctx->pattern[0]));
1329 i = ctx->pattern[0];
1330 {
1331 int groupref = i+i;
1332 if (groupref >= state->lastmark) {
1333 RETURN_FAILURE;
1334 } else {
1335 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1336 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1337 if (!p || !e || e < p)
1338 RETURN_FAILURE;
1339 while (p < e) {
1340 if (ctx->ptr >= end ||
1341 state->lower(*ctx->ptr) != state->lower(*p))
1342 RETURN_FAILURE;
1343 p++; ctx->ptr++;
1344 }
1345 }
1346 }
1347 ctx->pattern++;
1348 break;
1349
1350 case SRE_OP_GROUPREF_EXISTS:
1351 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1352 ctx->ptr, ctx->pattern[0]));
1353 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1354 i = ctx->pattern[0];
1355 {
1356 int groupref = i+i;
1357 if (groupref >= state->lastmark) {
1358 ctx->pattern += ctx->pattern[1];
1359 break;
1360 } else {
1361 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1362 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1363 if (!p || !e || e < p) {
1364 ctx->pattern += ctx->pattern[1];
1365 break;
1366 }
1367 }
1368 }
1369 ctx->pattern += 2;
1370 break;
1371
1372 case SRE_OP_ASSERT:
1373 /* assert subpattern */
1374 /* <ASSERT> <skip> <back> <pattern> */
1375 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1376 ctx->ptr, ctx->pattern[1]));
1377 state->ptr = ctx->ptr - ctx->pattern[1];
1378 if (state->ptr < state->beginning)
1379 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001380 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001381 RETURN_ON_FAILURE(ret);
1382 ctx->pattern += ctx->pattern[0];
1383 break;
1384
1385 case SRE_OP_ASSERT_NOT:
1386 /* assert not subpattern */
1387 /* <ASSERT_NOT> <skip> <back> <pattern> */
1388 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1389 ctx->ptr, ctx->pattern[1]));
1390 state->ptr = ctx->ptr - ctx->pattern[1];
1391 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001392 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001393 if (ret) {
1394 RETURN_ON_ERROR(ret);
1395 RETURN_FAILURE;
1396 }
1397 }
1398 ctx->pattern += ctx->pattern[0];
1399 break;
1400
1401 case SRE_OP_FAILURE:
1402 /* immediate failure */
1403 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1404 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001405
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001406 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001407 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1408 ctx->pattern[-1]));
1409 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001410 }
1411 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001412
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001413exit:
1414 ctx_pos = ctx->last_ctx_pos;
1415 jump = ctx->jump;
1416 DATA_POP_DISCARD(ctx);
1417 if (ctx_pos == -1)
1418 return ret;
1419 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1420
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001421 switch (jump) {
1422 case JUMP_MAX_UNTIL_2:
1423 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1424 goto jump_max_until_2;
1425 case JUMP_MAX_UNTIL_3:
1426 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1427 goto jump_max_until_3;
1428 case JUMP_MIN_UNTIL_2:
1429 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1430 goto jump_min_until_2;
1431 case JUMP_MIN_UNTIL_3:
1432 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1433 goto jump_min_until_3;
1434 case JUMP_BRANCH:
1435 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1436 goto jump_branch;
1437 case JUMP_MAX_UNTIL_1:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1439 goto jump_max_until_1;
1440 case JUMP_MIN_UNTIL_1:
1441 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1442 goto jump_min_until_1;
1443 case JUMP_REPEAT:
1444 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1445 goto jump_repeat;
1446 case JUMP_REPEAT_ONE_1:
1447 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1448 goto jump_repeat_one_1;
1449 case JUMP_REPEAT_ONE_2:
1450 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1451 goto jump_repeat_one_2;
1452 case JUMP_MIN_REPEAT_ONE:
1453 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1454 goto jump_min_repeat_one;
1455 case JUMP_ASSERT:
1456 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1457 goto jump_assert;
1458 case JUMP_ASSERT_NOT:
1459 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1460 goto jump_assert_not;
1461 case JUMP_NONE:
1462 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1463 break;
1464 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001465
1466 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001467}
1468
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001469LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001470SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1471{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001472 SRE_CHAR* ptr = state->start;
1473 SRE_CHAR* end = state->end;
1474 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001475 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001476 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001477 SRE_CODE* prefix = NULL;
1478 SRE_CODE* charset = NULL;
1479 SRE_CODE* overlap = NULL;
1480 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001481
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001482 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001483 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001484 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001485
1486 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001487
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001488 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001489 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001490 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001491 end -= pattern[3]-1;
1492 if (end <= ptr)
1493 end = ptr+1;
1494 }
1495
Fredrik Lundh3562f112000-07-02 12:00:07 +00001496 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001497 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001498 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001499 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001500 prefix_skip = pattern[6];
1501 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001502 overlap = prefix + prefix_len - 1;
1503 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001504 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001505 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001506 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001507
1508 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001509 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001510
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001511 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1512 TRACE(("charset = %p\n", charset));
1513
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001514#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001515 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001516 /* pattern starts with a known prefix. use the overlap
1517 table to skip forward as fast as we possibly can */
1518 int i = 0;
1519 end = state->end;
1520 while (ptr < end) {
1521 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001522 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001523 if (!i)
1524 break;
1525 else
1526 i = overlap[i];
1527 } else {
1528 if (++i == prefix_len) {
1529 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001530 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1531 state->start = ptr + 1 - prefix_len;
1532 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001533 if (flags & SRE_INFO_LITERAL)
1534 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001535 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001536 if (status != 0)
1537 return status;
1538 /* close but no cigar -- try again */
1539 i = overlap[i];
1540 }
1541 break;
1542 }
1543
1544 }
1545 ptr++;
1546 }
1547 return 0;
1548 }
1549#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001550
Fredrik Lundh3562f112000-07-02 12:00:07 +00001551 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001552 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001553 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001554 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001555 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001556 for (;;) {
1557 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1558 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001559 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001560 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001561 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001562 state->start = ptr;
1563 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001564 if (flags & SRE_INFO_LITERAL)
1565 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001566 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001567 if (status != 0)
1568 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001569 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001570 } else if (charset) {
1571 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001572 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001573 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001574 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001575 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001576 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001577 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001578 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001579 state->start = ptr;
1580 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001581 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001582 if (status != 0)
1583 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001584 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001585 }
1586 } else
1587 /* general case */
1588 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001589 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001590 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001591 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001592 if (status != 0)
1593 break;
1594 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001595
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001597}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001598
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001599LOCAL(int)
1600SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1601{
1602 /* check if given string is a literal template (i.e. no escapes) */
1603 while (len-- > 0)
1604 if (*ptr++ == '\\')
1605 return 0;
1606 return 1;
1607}
Guido van Rossumb700df92000-03-31 14:59:30 +00001608
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001609#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001610
1611/* -------------------------------------------------------------------- */
1612/* factories and destructors */
1613
1614/* see sre.h for object declarations */
1615
Jeremy Hylton938ace62002-07-17 16:30:39 +00001616static PyTypeObject Pattern_Type;
1617static PyTypeObject Match_Type;
1618static PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001619
1620static PyObject *
1621_compile(PyObject* self_, PyObject* args)
1622{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001623 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001624
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001625 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001626 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001627
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001628 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001629 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001630 PyObject* code;
1631 int groups = 0;
1632 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001633 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001634 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1635 &PyList_Type, &code, &groups,
1636 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001637 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001638
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001639 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001640
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001641 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001642 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001643 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001644
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001645 self->codesize = n;
1646
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001647 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001648 PyObject *o = PyList_GET_ITEM(code, i);
Martin v. Löwis78e2f062003-04-19 12:56:08 +00001649 if (PyInt_Check(o))
1650 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1651 else
1652 self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001653 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001654
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001655 if (PyErr_Occurred()) {
1656 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001657 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001658 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001659
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001660 Py_INCREF(pattern);
1661 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001662
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001663 self->flags = flags;
1664
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001665 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001666
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001667 Py_XINCREF(groupindex);
1668 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001669
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001670 Py_XINCREF(indexgroup);
1671 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001672
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001673 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001674}
1675
1676static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001677sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001678{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001679 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001680}
1681
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001682static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001683sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001684{
1685 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001686 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001687 return NULL;
1688 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001689 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001690 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001691#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001692 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001693#else
1694 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001695#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001696 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001697}
1698
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001699LOCAL(void)
1700state_reset(SRE_STATE* state)
1701{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001702 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001703 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001704
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001705 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001706 state->lastindex = -1;
1707
1708 state->repeat = NULL;
1709
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001710 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001711}
1712
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001713static void*
1714getstring(PyObject* string, int* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001715{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001716 /* given a python object, return a data pointer, a length (in
1717 characters), and a character size. return NULL if the object
1718 is not a string (or not compatible) */
1719
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001720 PyBufferProcs *buffer;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001721 int size, bytes, charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001722 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001723
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001724#if defined(HAVE_UNICODE)
1725 if (PyUnicode_Check(string)) {
1726 /* unicode strings doesn't always support the buffer interface */
1727 ptr = (void*) PyUnicode_AS_DATA(string);
1728 bytes = PyUnicode_GET_DATA_SIZE(string);
1729 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001730 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001731
1732 } else {
1733#endif
1734
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001735 /* get pointer to string buffer */
1736 buffer = string->ob_type->tp_as_buffer;
1737 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1738 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001739 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001740 return NULL;
1741 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001742
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001743 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001744 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1745 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001746 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1747 return NULL;
1748 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001749
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001750 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001751#if PY_VERSION_HEX >= 0x01060000
1752 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001753#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001754 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001755#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001756
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001757 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001758 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001759#if defined(HAVE_UNICODE)
1760 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001761 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001762#endif
1763 else {
1764 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1765 return NULL;
1766 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001767
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001768#if defined(HAVE_UNICODE)
1769 }
1770#endif
1771
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001772 *p_length = size;
1773 *p_charsize = charsize;
1774
1775 return ptr;
1776}
1777
1778LOCAL(PyObject*)
1779state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1780 int start, int end)
1781{
1782 /* prepare state object */
1783
1784 int length;
1785 int charsize;
1786 void* ptr;
1787
1788 memset(state, 0, sizeof(SRE_STATE));
1789
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001790 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001791 state->lastindex = -1;
1792
1793 ptr = getstring(string, &length, &charsize);
1794 if (!ptr)
1795 return NULL;
1796
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001797 /* adjust boundaries */
1798 if (start < 0)
1799 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001800 else if (start > length)
1801 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001802
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001803 if (end < 0)
1804 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001805 else if (end > length)
1806 end = length;
1807
1808 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001809
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001810 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001811
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001812 state->start = (void*) ((char*) ptr + start * state->charsize);
1813 state->end = (void*) ((char*) ptr + end * state->charsize);
1814
1815 Py_INCREF(string);
1816 state->string = string;
1817 state->pos = start;
1818 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001819
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001820 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001821 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001822 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001823#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001824 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001825#else
1826 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001827#endif
1828 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001829 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001830
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001831 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001832}
1833
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001834LOCAL(void)
1835state_fini(SRE_STATE* state)
1836{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001837 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001838 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001839}
1840
/* convert a pointer into the subject string to a character index,
   counted from the start of the string */
#define STATE_OFFSET(state, member)\
    (((char*) (member) - (char*) (state)->beginning) / (state)->charsize)
1844
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001845LOCAL(PyObject*)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001846state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001847{
Fredrik Lundh58100642000-08-09 09:14:35 +00001848 int i, j;
1849
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001850 index = (index - 1) * 2;
1851
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001852 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001853 if (empty)
1854 /* want empty string */
1855 i = j = 0;
1856 else {
1857 Py_INCREF(Py_None);
1858 return Py_None;
1859 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001860 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001861 i = STATE_OFFSET(state, state->mark[index]);
1862 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001863 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001864
Fredrik Lundh58100642000-08-09 09:14:35 +00001865 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001866}
1867
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001868static void
1869pattern_error(int status)
1870{
1871 switch (status) {
1872 case SRE_ERROR_RECURSION_LIMIT:
1873 PyErr_SetString(
1874 PyExc_RuntimeError,
1875 "maximum recursion limit exceeded"
1876 );
1877 break;
1878 case SRE_ERROR_MEMORY:
1879 PyErr_NoMemory();
1880 break;
1881 default:
1882 /* other error codes indicate compiler/engine bugs */
1883 PyErr_SetString(
1884 PyExc_RuntimeError,
1885 "internal error in regular expression engine"
1886 );
1887 }
1888}
1889
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001890static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001891pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001892{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001893 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001894
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001895 MatchObject* match;
1896 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001897 char* base;
1898 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001899
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001900 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001901
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001902 /* create match object (with room for extra group marks) */
1903 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001904 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001905 if (!match)
1906 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001907
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001908 Py_INCREF(pattern);
1909 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001911 Py_INCREF(state->string);
1912 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001913
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001914 match->regs = NULL;
1915 match->groups = pattern->groups+1;
1916
1917 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001918
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001919 base = (char*) state->beginning;
1920 n = state->charsize;
1921
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001922 match->mark[0] = ((char*) state->start - base) / n;
1923 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001924
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001925 for (i = j = 0; i < pattern->groups; i++, j+=2)
1926 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1927 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1928 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1929 } else
1930 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1931
1932 match->pos = state->pos;
1933 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001934
Fredrik Lundh6f013982000-07-03 18:44:21 +00001935 match->lastindex = state->lastindex;
1936
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001937 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001938
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001939 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001940
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001941 /* no match */
1942 Py_INCREF(Py_None);
1943 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001944
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001945 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001946
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001947 /* internal error */
1948 pattern_error(status);
1949 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001950}
1951
1952static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001953pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001954{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001955 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001956
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001957 ScannerObject* self;
1958
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001959 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001960 int start = 0;
1961 int end = INT_MAX;
1962 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1963 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001964
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001965 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001966 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001967 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001968 return NULL;
1969
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001970 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001971 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001972 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001973 return NULL;
1974 }
1975
1976 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001977 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001978
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001979 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001980}
1981
Guido van Rossumb700df92000-03-31 14:59:30 +00001982static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001983pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001984{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001985 Py_XDECREF(self->pattern);
1986 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001987 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001988 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001989}
1990
1991static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001992pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001993{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001994 SRE_STATE state;
1995 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001996
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001997 PyObject* string;
1998 int start = 0;
1999 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002000 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2001 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
2002 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002003 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002004
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002005 string = state_init(&state, self, string, start, end);
2006 if (!string)
2007 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002008
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002009 state.ptr = state.start;
2010
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002011 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
2012
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002013 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002014 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002015 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002016#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002017 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002018#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002019 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002020
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002021 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2022
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002023 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002024
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002025 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002026}
2027
2028static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002029pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002030{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002031 SRE_STATE state;
2032 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002033
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002034 PyObject* string;
2035 int start = 0;
2036 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002037 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2038 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
2039 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002040 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002041
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002042 string = state_init(&state, self, string, start, end);
2043 if (!string)
2044 return NULL;
2045
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002046 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
2047
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002048 if (state.charsize == 1) {
2049 status = sre_search(&state, PatternObject_GetCode(self));
2050 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002051#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002052 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002053#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002054 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002055
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002056 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2057
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002058 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002059
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002061}
2062
2063static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002064call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002065{
2066 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002067 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002068 PyObject* func;
2069 PyObject* result;
2070
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002071 if (!args)
2072 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002073 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002074 if (!name)
2075 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002076 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002077 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002078 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002079 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002080 func = PyObject_GetAttrString(mod, function);
2081 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002082 if (!func)
2083 return NULL;
2084 result = PyObject_CallObject(func, args);
2085 Py_DECREF(func);
2086 Py_DECREF(args);
2087 return result;
2088}
2089
#ifdef USE_BUILTIN_COPY
/* Replace *object with the result of copy.deepcopy(*object, memo).

   Returns 1 on success (the old reference has been released and *object
   now holds the copy), or 0 on failure (*object is left untouched). */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* packed = PyTuple_Pack(2, *object, memo);
    PyObject* duplicate = call("copy", "deepcopy", packed);

    if (duplicate) {
        Py_DECREF(*object);
        *object = duplicate;
        return 1; /* success */
    }

    return 0;
}
#endif
2109
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002110static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002111join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002112{
2113 /* join list elements */
2114
2115 PyObject* joiner;
2116#if PY_VERSION_HEX >= 0x01060000
2117 PyObject* function;
2118 PyObject* args;
2119#endif
2120 PyObject* result;
2121
2122 switch (PyList_GET_SIZE(list)) {
2123 case 0:
2124 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00002125 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002126 case 1:
2127 result = PyList_GET_ITEM(list, 0);
2128 Py_INCREF(result);
2129 Py_DECREF(list);
2130 return result;
2131 }
2132
2133 /* two or more elements: slice out a suitable separator from the
2134 first member, and use that to join the entire list */
2135
2136 joiner = PySequence_GetSlice(pattern, 0, 0);
2137 if (!joiner)
2138 return NULL;
2139
2140#if PY_VERSION_HEX >= 0x01060000
2141 function = PyObject_GetAttrString(joiner, "join");
2142 if (!function) {
2143 Py_DECREF(joiner);
2144 return NULL;
2145 }
2146 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002147 if (!args) {
2148 Py_DECREF(function);
2149 Py_DECREF(joiner);
2150 return NULL;
2151 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002152 PyTuple_SET_ITEM(args, 0, list);
2153 result = PyObject_CallObject(function, args);
2154 Py_DECREF(args); /* also removes list */
2155 Py_DECREF(function);
2156#else
2157 result = call(
2158 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002159 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002160 );
2161#endif
2162 Py_DECREF(joiner);
2163
2164 return result;
2165}
2166
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002167static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002168pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002169{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002170 SRE_STATE state;
2171 PyObject* list;
2172 int status;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002173 int i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002174
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002175 PyObject* string;
2176 int start = 0;
2177 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002178 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2179 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2180 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002181 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002182
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002183 string = state_init(&state, self, string, start, end);
2184 if (!string)
2185 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002186
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002187 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002188 if (!list) {
2189 state_fini(&state);
2190 return NULL;
2191 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002192
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002193 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002195 PyObject* item;
2196
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002197 state_reset(&state);
2198
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002199 state.ptr = state.start;
2200
2201 if (state.charsize == 1) {
2202 status = sre_search(&state, PatternObject_GetCode(self));
2203 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002204#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002205 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002206#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002207 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002208
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002209 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002210 if (status == 0)
2211 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002212 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002213 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002214 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002215
2216 /* don't bother to build a match object */
2217 switch (self->groups) {
2218 case 0:
2219 b = STATE_OFFSET(&state, state.start);
2220 e = STATE_OFFSET(&state, state.ptr);
2221 item = PySequence_GetSlice(string, b, e);
2222 if (!item)
2223 goto error;
2224 break;
2225 case 1:
2226 item = state_getslice(&state, 1, string, 1);
2227 if (!item)
2228 goto error;
2229 break;
2230 default:
2231 item = PyTuple_New(self->groups);
2232 if (!item)
2233 goto error;
2234 for (i = 0; i < self->groups; i++) {
2235 PyObject* o = state_getslice(&state, i+1, string, 1);
2236 if (!o) {
2237 Py_DECREF(item);
2238 goto error;
2239 }
2240 PyTuple_SET_ITEM(item, i, o);
2241 }
2242 break;
2243 }
2244
2245 status = PyList_Append(list, item);
2246 Py_DECREF(item);
2247 if (status < 0)
2248 goto error;
2249
2250 if (state.ptr == state.start)
2251 state.start = (void*) ((char*) state.ptr + state.charsize);
2252 else
2253 state.start = state.ptr;
2254
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002255 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002256
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002257 state_fini(&state);
2258 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002259
2260error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002261 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002262 state_fini(&state);
2263 return NULL;
2264
Guido van Rossumb700df92000-03-31 14:59:30 +00002265}
2266
#if PY_VERSION_HEX >= 0x02020000
/* pattern.finditer(...): build a scanner from the given arguments and
   wrap its bound "search" method in a callable-iterator that stops when
   the call returns None.  Returns NULL on failure. */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    PyObject* result = NULL;
    PyObject* bound_search;
    PyObject* scanner = pattern_scanner(pattern, args);

    if (scanner) {
        bound_search = PyObject_GetAttrString(scanner, "search");
        /* the scanner reference is no longer needed here */
        Py_DECREF(scanner);
        if (bound_search) {
            result = PyCallIter_New(bound_search, Py_None);
            Py_DECREF(bound_search);
        }
    }

    return result;
}
#endif
2290
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002291static PyObject*
2292pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2293{
2294 SRE_STATE state;
2295 PyObject* list;
2296 PyObject* item;
2297 int status;
2298 int n;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002299 int i;
2300 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002301
2302 PyObject* string;
2303 int maxsplit = 0;
2304 static char* kwlist[] = { "source", "maxsplit", NULL };
2305 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2306 &string, &maxsplit))
2307 return NULL;
2308
2309 string = state_init(&state, self, string, 0, INT_MAX);
2310 if (!string)
2311 return NULL;
2312
2313 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002314 if (!list) {
2315 state_fini(&state);
2316 return NULL;
2317 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002318
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002319 n = 0;
2320 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002321
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002322 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002323
2324 state_reset(&state);
2325
2326 state.ptr = state.start;
2327
2328 if (state.charsize == 1) {
2329 status = sre_search(&state, PatternObject_GetCode(self));
2330 } else {
2331#if defined(HAVE_UNICODE)
2332 status = sre_usearch(&state, PatternObject_GetCode(self));
2333#endif
2334 }
2335
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002336 if (status <= 0) {
2337 if (status == 0)
2338 break;
2339 pattern_error(status);
2340 goto error;
2341 }
2342
2343 if (state.start == state.ptr) {
2344 if (last == state.end)
2345 break;
2346 /* skip one character */
2347 state.start = (void*) ((char*) state.ptr + state.charsize);
2348 continue;
2349 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002350
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002351 /* get segment before this match */
2352 item = PySequence_GetSlice(
2353 string, STATE_OFFSET(&state, last),
2354 STATE_OFFSET(&state, state.start)
2355 );
2356 if (!item)
2357 goto error;
2358 status = PyList_Append(list, item);
2359 Py_DECREF(item);
2360 if (status < 0)
2361 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002362
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002363 /* add groups (if any) */
2364 for (i = 0; i < self->groups; i++) {
2365 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002366 if (!item)
2367 goto error;
2368 status = PyList_Append(list, item);
2369 Py_DECREF(item);
2370 if (status < 0)
2371 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002372 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002373
2374 n = n + 1;
2375
2376 last = state.start = state.ptr;
2377
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002378 }
2379
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002380 /* get segment following last match (even if empty) */
2381 item = PySequence_GetSlice(
2382 string, STATE_OFFSET(&state, last), state.endpos
2383 );
2384 if (!item)
2385 goto error;
2386 status = PyList_Append(list, item);
2387 Py_DECREF(item);
2388 if (status < 0)
2389 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002390
2391 state_fini(&state);
2392 return list;
2393
2394error:
2395 Py_DECREF(list);
2396 state_fini(&state);
2397 return NULL;
2398
2399}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002400
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002401static PyObject*
2402pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2403 int count, int subn)
2404{
2405 SRE_STATE state;
2406 PyObject* list;
2407 PyObject* item;
2408 PyObject* filter;
2409 PyObject* args;
2410 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002411 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002412 int status;
2413 int n;
2414 int i, b, e;
2415 int filter_is_callable;
2416
Fredrik Lundhdac58492001-10-21 21:48:30 +00002417 if (PyCallable_Check(template)) {
2418 /* sub/subn takes either a function or a template */
2419 filter = template;
2420 Py_INCREF(filter);
2421 filter_is_callable = 1;
2422 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002423 /* if not callable, check if it's a literal string */
2424 int literal;
2425 ptr = getstring(template, &n, &b);
2426 if (ptr) {
2427 if (b == 1) {
2428 literal = sre_literal_template(ptr, n);
2429 } else {
2430#if defined(HAVE_UNICODE)
2431 literal = sre_uliteral_template(ptr, n);
2432#endif
2433 }
2434 } else {
2435 PyErr_Clear();
2436 literal = 0;
2437 }
2438 if (literal) {
2439 filter = template;
2440 Py_INCREF(filter);
2441 filter_is_callable = 0;
2442 } else {
2443 /* not a literal; hand it over to the template compiler */
2444 filter = call(
2445 SRE_MODULE, "_subx",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002446 PyTuple_Pack(2, self, template)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002447 );
2448 if (!filter)
2449 return NULL;
2450 filter_is_callable = PyCallable_Check(filter);
2451 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002452 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002453
2454 string = state_init(&state, self, string, 0, INT_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002455 if (!string) {
2456 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002457 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002458 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002459
2460 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002461 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002462 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002463 state_fini(&state);
2464 return NULL;
2465 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466
2467 n = i = 0;
2468
2469 while (!count || n < count) {
2470
2471 state_reset(&state);
2472
2473 state.ptr = state.start;
2474
2475 if (state.charsize == 1) {
2476 status = sre_search(&state, PatternObject_GetCode(self));
2477 } else {
2478#if defined(HAVE_UNICODE)
2479 status = sre_usearch(&state, PatternObject_GetCode(self));
2480#endif
2481 }
2482
2483 if (status <= 0) {
2484 if (status == 0)
2485 break;
2486 pattern_error(status);
2487 goto error;
2488 }
2489
2490 b = STATE_OFFSET(&state, state.start);
2491 e = STATE_OFFSET(&state, state.ptr);
2492
2493 if (i < b) {
2494 /* get segment before this match */
2495 item = PySequence_GetSlice(string, i, b);
2496 if (!item)
2497 goto error;
2498 status = PyList_Append(list, item);
2499 Py_DECREF(item);
2500 if (status < 0)
2501 goto error;
2502
2503 } else if (i == b && i == e && n > 0)
2504 /* ignore empty match on latest position */
2505 goto next;
2506
2507 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002508 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002509 match = pattern_new_match(self, &state, 1);
2510 if (!match)
2511 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002512 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002513 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002514 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002515 goto error;
2516 }
2517 item = PyObject_CallObject(filter, args);
2518 Py_DECREF(args);
2519 Py_DECREF(match);
2520 if (!item)
2521 goto error;
2522 } else {
2523 /* filter is literal string */
2524 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002525 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002526 }
2527
2528 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002529 if (item != Py_None) {
2530 status = PyList_Append(list, item);
2531 Py_DECREF(item);
2532 if (status < 0)
2533 goto error;
2534 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002535
2536 i = e;
2537 n = n + 1;
2538
2539next:
2540 /* move on */
2541 if (state.ptr == state.start)
2542 state.start = (void*) ((char*) state.ptr + state.charsize);
2543 else
2544 state.start = state.ptr;
2545
2546 }
2547
2548 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002549 if (i < state.endpos) {
2550 item = PySequence_GetSlice(string, i, state.endpos);
2551 if (!item)
2552 goto error;
2553 status = PyList_Append(list, item);
2554 Py_DECREF(item);
2555 if (status < 0)
2556 goto error;
2557 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002558
2559 state_fini(&state);
2560
Guido van Rossum4e173842001-12-07 04:25:10 +00002561 Py_DECREF(filter);
2562
Fredrik Lundhdac58492001-10-21 21:48:30 +00002563 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002564 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002565
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002566 if (!item)
2567 return NULL;
2568
2569 if (subn)
2570 return Py_BuildValue("Ni", item, n);
2571
2572 return item;
2573
2574error:
2575 Py_DECREF(list);
2576 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002577 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002578 return NULL;
2579
2580}
2581
2582static PyObject*
2583pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2584{
2585 PyObject* template;
2586 PyObject* string;
2587 int count = 0;
2588 static char* kwlist[] = { "repl", "string", "count", NULL };
2589 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2590 &template, &string, &count))
2591 return NULL;
2592
2593 return pattern_subx(self, template, string, count, 0);
2594}
2595
2596static PyObject*
2597pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2598{
2599 PyObject* template;
2600 PyObject* string;
2601 int count = 0;
2602 static char* kwlist[] = { "repl", "string", "count", NULL };
2603 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2604 &template, &string, &count))
2605 return NULL;
2606
2607 return pattern_subx(self, template, string, count, 1);
2608}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002609
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002610static PyObject*
2611pattern_copy(PatternObject* self, PyObject* args)
2612{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002613#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002614 PatternObject* copy;
2615 int offset;
2616
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002617 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2618 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002619
2620 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2621 if (!copy)
2622 return NULL;
2623
2624 offset = offsetof(PatternObject, groups);
2625
2626 Py_XINCREF(self->groupindex);
2627 Py_XINCREF(self->indexgroup);
2628 Py_XINCREF(self->pattern);
2629
2630 memcpy((char*) copy + offset, (char*) self + offset,
2631 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2632
2633 return (PyObject*) copy;
2634#else
2635 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2636 return NULL;
2637#endif
2638}
2639
2640static PyObject*
2641pattern_deepcopy(PatternObject* self, PyObject* args)
2642{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002643#ifdef USE_BUILTIN_COPY
2644 PatternObject* copy;
2645
2646 PyObject* memo;
2647 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2648 return NULL;
2649
2650 copy = (PatternObject*) pattern_copy(self, Py_None);
2651 if (!copy)
2652 return NULL;
2653
2654 if (!deepcopy(&copy->groupindex, memo) ||
2655 !deepcopy(&copy->indexgroup, memo) ||
2656 !deepcopy(&copy->pattern, memo)) {
2657 Py_DECREF(copy);
2658 return NULL;
2659 }
2660
2661#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002662 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2663 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002664#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002665}
2666
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002667static PyMethodDef pattern_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002668 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
2669 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
2670 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
2671 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
2672 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
2673 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002674#if PY_VERSION_HEX >= 0x02020000
2675 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS},
2676#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002677 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002678 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2679 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002680 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002681};
2682
2683static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002684pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002685{
2686 PyObject* res;
2687
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002688 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002689
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002690 if (res)
2691 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002692
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002693 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002694
2695 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002696 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002697 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002698 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002699 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002700
2701 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002702 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002703
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002704 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002705 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002706
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002707 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002708 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002709 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002710 }
2711
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002712 PyErr_SetString(PyExc_AttributeError, name);
2713 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002714}
2715
2716statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002717 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002718 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002719 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002720 (destructor)pattern_dealloc, /*tp_dealloc*/
2721 0, /*tp_print*/
2722 (getattrfunc)pattern_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002723};
2724
2725/* -------------------------------------------------------------------- */
2726/* match methods */
2727
2728static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002729match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002730{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002731 Py_XDECREF(self->regs);
2732 Py_XDECREF(self->string);
2733 Py_DECREF(self->pattern);
2734 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002735}
2736
2737static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002738match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002739{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002740 if (index < 0 || index >= self->groups) {
2741 /* raise IndexError if we were given a bad group number */
2742 PyErr_SetString(
2743 PyExc_IndexError,
2744 "no such group"
2745 );
2746 return NULL;
2747 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002748
Fredrik Lundh6f013982000-07-03 18:44:21 +00002749 index *= 2;
2750
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002751 if (self->string == Py_None || self->mark[index] < 0) {
2752 /* return default value if the string or group is undefined */
2753 Py_INCREF(def);
2754 return def;
2755 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002756
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002757 return PySequence_GetSlice(
2758 self->string, self->mark[index], self->mark[index+1]
2759 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002760}
2761
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002762static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002763match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002764{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002765 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002766
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002767 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002768 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002769
Fredrik Lundh6f013982000-07-03 18:44:21 +00002770 i = -1;
2771
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002772 if (self->pattern->groupindex) {
2773 index = PyObject_GetItem(self->pattern->groupindex, index);
2774 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002775 if (PyInt_Check(index))
2776 i = (int) PyInt_AS_LONG(index);
2777 Py_DECREF(index);
2778 } else
2779 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002780 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002781
2782 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002783}
2784
2785static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002786match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002787{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002788 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002789}
2790
2791static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002792match_expand(MatchObject* self, PyObject* args)
2793{
2794 PyObject* template;
2795 if (!PyArg_ParseTuple(args, "O:expand", &template))
2796 return NULL;
2797
2798 /* delegate to Python code */
2799 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002800 SRE_MODULE, "_expand",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002801 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002802 );
2803}
2804
2805static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002806match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002807{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002808 PyObject* result;
2809 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002810
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002811 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002812
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002813 switch (size) {
2814 case 0:
2815 result = match_getslice(self, Py_False, Py_None);
2816 break;
2817 case 1:
2818 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2819 break;
2820 default:
2821 /* fetch multiple items */
2822 result = PyTuple_New(size);
2823 if (!result)
2824 return NULL;
2825 for (i = 0; i < size; i++) {
2826 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002827 self, PyTuple_GET_ITEM(args, i), Py_None
2828 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002829 if (!item) {
2830 Py_DECREF(result);
2831 return NULL;
2832 }
2833 PyTuple_SET_ITEM(result, i, item);
2834 }
2835 break;
2836 }
2837 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002838}
2839
2840static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002841match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002842{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002843 PyObject* result;
2844 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002845
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002846 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002847 static char* kwlist[] = { "default", NULL };
2848 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002849 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002850
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002851 result = PyTuple_New(self->groups-1);
2852 if (!result)
2853 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002854
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002855 for (index = 1; index < self->groups; index++) {
2856 PyObject* item;
2857 item = match_getslice_by_index(self, index, def);
2858 if (!item) {
2859 Py_DECREF(result);
2860 return NULL;
2861 }
2862 PyTuple_SET_ITEM(result, index-1, item);
2863 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002864
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002865 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002866}
2867
2868static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002869match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002870{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002871 PyObject* result;
2872 PyObject* keys;
2873 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002874
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002875 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002876 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002877 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002878 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002879
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002880 result = PyDict_New();
2881 if (!result || !self->pattern->groupindex)
2882 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002883
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002884 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002885 if (!keys)
2886 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002887
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002888 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002889 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002890 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002891 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002892 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002893 if (!key)
2894 goto failed;
2895 value = match_getslice(self, key, def);
2896 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002897 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002898 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002899 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002900 status = PyDict_SetItem(result, key, value);
2901 Py_DECREF(value);
2902 if (status < 0)
2903 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002904 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002905
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002906 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002907
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002908 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002909
2910failed:
2911 Py_DECREF(keys);
2912 Py_DECREF(result);
2913 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002914}
2915
2916static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002917match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002918{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002919 int index;
2920
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002921 PyObject* index_ = Py_False; /* zero */
2922 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2923 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002924
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002925 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002927 if (index < 0 || index >= self->groups) {
2928 PyErr_SetString(
2929 PyExc_IndexError,
2930 "no such group"
2931 );
2932 return NULL;
2933 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002934
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002935 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002936 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002937}
2938
2939static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002940match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002941{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002942 int index;
2943
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002944 PyObject* index_ = Py_False; /* zero */
2945 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2946 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002947
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002948 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002949
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002950 if (index < 0 || index >= self->groups) {
2951 PyErr_SetString(
2952 PyExc_IndexError,
2953 "no such group"
2954 );
2955 return NULL;
2956 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002957
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002958 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002959 return Py_BuildValue("i", self->mark[index*2+1]);
2960}
2961
2962LOCAL(PyObject*)
2963_pair(int i1, int i2)
2964{
2965 PyObject* pair;
2966 PyObject* item;
2967
2968 pair = PyTuple_New(2);
2969 if (!pair)
2970 return NULL;
2971
2972 item = PyInt_FromLong(i1);
2973 if (!item)
2974 goto error;
2975 PyTuple_SET_ITEM(pair, 0, item);
2976
2977 item = PyInt_FromLong(i2);
2978 if (!item)
2979 goto error;
2980 PyTuple_SET_ITEM(pair, 1, item);
2981
2982 return pair;
2983
2984 error:
2985 Py_DECREF(pair);
2986 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002987}
2988
2989static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002990match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002991{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002992 int index;
2993
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002994 PyObject* index_ = Py_False; /* zero */
2995 if (!PyArg_ParseTuple(args, "|O:span", &index_))
2996 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002997
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002998 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002999
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003000 if (index < 0 || index >= self->groups) {
3001 PyErr_SetString(
3002 PyExc_IndexError,
3003 "no such group"
3004 );
3005 return NULL;
3006 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003007
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003008 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003009 return _pair(self->mark[index*2], self->mark[index*2+1]);
3010}
3011
3012static PyObject*
3013match_regs(MatchObject* self)
3014{
3015 PyObject* regs;
3016 PyObject* item;
3017 int index;
3018
3019 regs = PyTuple_New(self->groups);
3020 if (!regs)
3021 return NULL;
3022
3023 for (index = 0; index < self->groups; index++) {
3024 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3025 if (!item) {
3026 Py_DECREF(regs);
3027 return NULL;
3028 }
3029 PyTuple_SET_ITEM(regs, index, item);
3030 }
3031
3032 Py_INCREF(regs);
3033 self->regs = regs;
3034
3035 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003036}
3037
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003038static PyObject*
3039match_copy(MatchObject* self, PyObject* args)
3040{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003041#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003042 MatchObject* copy;
3043 int slots, offset;
3044
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003045 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
3046 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003047
3048 slots = 2 * (self->pattern->groups+1);
3049
3050 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3051 if (!copy)
3052 return NULL;
3053
3054 /* this value a constant, but any compiler should be able to
3055 figure that out all by itself */
3056 offset = offsetof(MatchObject, string);
3057
3058 Py_XINCREF(self->pattern);
3059 Py_XINCREF(self->string);
3060 Py_XINCREF(self->regs);
3061
3062 memcpy((char*) copy + offset, (char*) self + offset,
3063 sizeof(MatchObject) + slots * sizeof(int) - offset);
3064
3065 return (PyObject*) copy;
3066#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003067 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003068 return NULL;
3069#endif
3070}
3071
3072static PyObject*
3073match_deepcopy(MatchObject* self, PyObject* args)
3074{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003075#ifdef USE_BUILTIN_COPY
3076 MatchObject* copy;
3077
3078 PyObject* memo;
3079 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
3080 return NULL;
3081
3082 copy = (MatchObject*) match_copy(self, Py_None);
3083 if (!copy)
3084 return NULL;
3085
3086 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3087 !deepcopy(&copy->string, memo) ||
3088 !deepcopy(&copy->regs, memo)) {
3089 Py_DECREF(copy);
3090 return NULL;
3091 }
3092
3093#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003094 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3095 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003096#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003097}
3098
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003099static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003100 {"group", (PyCFunction) match_group, METH_VARARGS},
3101 {"start", (PyCFunction) match_start, METH_VARARGS},
3102 {"end", (PyCFunction) match_end, METH_VARARGS},
3103 {"span", (PyCFunction) match_span, METH_VARARGS},
3104 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3105 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3106 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003107 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
3108 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003109 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003110};
3111
3112static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003113match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003114{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003115 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003116
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003117 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3118 if (res)
3119 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003120
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003121 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003122
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003123 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003124 if (self->lastindex >= 0)
3125 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003126 Py_INCREF(Py_None);
3127 return Py_None;
3128 }
3129
3130 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003131 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003132 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003133 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003134 );
3135 if (result)
3136 return result;
3137 PyErr_Clear();
3138 }
3139 Py_INCREF(Py_None);
3140 return Py_None;
3141 }
3142
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003143 if (!strcmp(name, "string")) {
3144 if (self->string) {
3145 Py_INCREF(self->string);
3146 return self->string;
3147 } else {
3148 Py_INCREF(Py_None);
3149 return Py_None;
3150 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003151 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003152
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003153 if (!strcmp(name, "regs")) {
3154 if (self->regs) {
3155 Py_INCREF(self->regs);
3156 return self->regs;
3157 } else
3158 return match_regs(self);
3159 }
3160
3161 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003162 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003163 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003164 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003165
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003166 if (!strcmp(name, "pos"))
3167 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003168
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003169 if (!strcmp(name, "endpos"))
3170 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003171
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003172 PyErr_SetString(PyExc_AttributeError, name);
3173 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003174}
3175
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3178
3179statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003180 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003181 0, "_" SRE_MODULE ".SRE_Match",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003182 sizeof(MatchObject), sizeof(int),
3183 (destructor)match_dealloc, /*tp_dealloc*/
3184 0, /*tp_print*/
3185 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003186};
3187
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003188/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003189/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003190
3191static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003192scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003193{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003194 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003195 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003196 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003197}
3198
3199static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003200scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003201{
3202 SRE_STATE* state = &self->state;
3203 PyObject* match;
3204 int status;
3205
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003206 state_reset(state);
3207
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003208 state->ptr = state->start;
3209
3210 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003211 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003212 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003213#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003214 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003215#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003216 }
3217
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003218 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003219 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003220
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003221 if ((status == 0 || state->ptr == state->start) &&
3222 state->ptr < state->end)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003223 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003224 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003225 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003226
3227 return match;
3228}
3229
3230
3231static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003232scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003233{
3234 SRE_STATE* state = &self->state;
3235 PyObject* match;
3236 int status;
3237
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003238 state_reset(state);
3239
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003240 state->ptr = state->start;
3241
3242 if (state->charsize == 1) {
3243 status = sre_search(state, PatternObject_GetCode(self->pattern));
3244 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003245#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003246 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003247#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003248 }
3249
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003250 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003251 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003252
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003253 if ((status == 0 || state->ptr == state->start) &&
3254 state->ptr < state->end)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003255 state->start = (void*) ((char*) state->ptr + state->charsize);
3256 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003257 state->start = state->ptr;
3258
3259 return match;
3260}
3261
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003262static PyMethodDef scanner_methods[] = {
Neal Norwitzbb2769f2002-03-31 15:46:00 +00003263 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3264 /* METH_OLDARGS is not in Python 1.5.2 */
3265 {"match", (PyCFunction) scanner_match, 0},
3266 {"search", (PyCFunction) scanner_search, 0},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003267 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003268};
3269
3270static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003271scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003272{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003273 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003274
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003275 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3276 if (res)
3277 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003278
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003279 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003280
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003281 /* attributes */
3282 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003283 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003284 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003285 }
3286
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003287 PyErr_SetString(PyExc_AttributeError, name);
3288 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003289}
3290
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003291statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003292 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003293 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003294 sizeof(ScannerObject), 0,
3295 (destructor)scanner_dealloc, /*tp_dealloc*/
3296 0, /*tp_print*/
3297 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003298};
3299
Guido van Rossumb700df92000-03-31 14:59:30 +00003300static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003301 {"compile", _compile, METH_VARARGS},
3302 {"getcodesize", sre_codesize, METH_VARARGS},
3303 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003304 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003305};
3306
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003307#if PY_VERSION_HEX < 0x02030000
3308DL_EXPORT(void) init_sre(void)
3309#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003310PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003311#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003312{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003313 PyObject* m;
3314 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003315 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003316
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003317 /* Patch object types */
3318 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003319 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00003320
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003321 m = Py_InitModule("_" SRE_MODULE, _functions);
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003322 d = PyModule_GetDict(m);
3323
Fredrik Lundh21009b92001-09-18 18:47:09 +00003324 x = PyInt_FromLong(SRE_MAGIC);
3325 if (x) {
3326 PyDict_SetItemString(d, "MAGIC", x);
3327 Py_DECREF(x);
3328 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003329
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003330 x = PyInt_FromLong(sizeof(SRE_CODE));
3331 if (x) {
3332 PyDict_SetItemString(d, "CODESIZE", x);
3333 Py_DECREF(x);
3334 }
3335
Fredrik Lundh21009b92001-09-18 18:47:09 +00003336 x = PyString_FromString(copyright);
3337 if (x) {
3338 PyDict_SetItemString(d, "copyright", x);
3339 Py_DECREF(x);
3340 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003341}
3342
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003343#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003344
3345/* vim:ts=4:sw=4:et
3346*/