blob: 6ee0bb82d3b07338c335a4c083aa655f34a3c10f [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
Nicholas Bastin1ce9e4c2004-06-17 18:27:18 +000037#ifdef __SUNPRO_C
38#pragma error_messages (off,E_END_OF_LOOP_CODE_NOT_REACHED)
39#endif
40
Guido van Rossumb700df92000-03-31 14:59:30 +000041#ifndef SRE_RECURSIVE
42
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000043static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000044 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000045
#include "Python.h"
#include "structmember.h" /* offsetof */

#include "sre.h"

#include <ctype.h>
#include <limits.h> /* INT_MAX */
Guido van Rossumb700df92000-03-31 14:59:30 +000052
Fredrik Lundh436c3d582000-06-29 08:58:44 +000053/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000054#if !defined(SRE_MODULE)
55#define SRE_MODULE "sre"
56#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000057
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh971e78b2001-10-20 17:48:46 +000061#if PY_VERSION_HEX >= 0x01060000
62#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000063/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000064#define HAVE_UNICODE
65#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000066#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000068/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000069/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070
71/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000072#define USE_FAST_SEARCH
73
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000074/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000075#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000076
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000077/* enables copy/deepcopy handling (work in progress) */
78#undef USE_BUILTIN_COPY
79
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000080#if PY_VERSION_HEX < 0x01060000
81#define PyObject_DEL(op) PyMem_DEL((op))
82#endif
83
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000084/* -------------------------------------------------------------------- */
85
Fredrik Lundh80946112000-06-29 18:03:25 +000086#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000087#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000088#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000089/* fastest possible local call under MSVC */
90#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000091#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000092#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000093#else
94#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000095#endif
96
97/* error codes */
98#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000099#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000100#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#define SRE_ERROR_MEMORY -9 /* out of memory */
102
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000103#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000104#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000105#else
106#define TRACE(v)
107#endif
108
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000109/* -------------------------------------------------------------------- */
110/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000111
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000112/* default character predicates (run sre_chars.py to regenerate tables) */
113
/* bit flags stored in sre_char_info[]; an entry may combine several */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* Category flags for the 128 ASCII code points (one row per 16 codes).
   The table is read-only, and unsigned so that entries never depend on
   the signedness of plain char. */
static const unsigned char sre_char_info[128] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x20 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /* 0x60 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};

/* Lower-case mapping for the 128 ASCII code points: identity everywhere
   except 'A'..'Z' (65..90), which map to 'a'..'z' (97..122). */
static const unsigned char sre_char_lower[128] = {
    /* 0x00 */   0,   1,   2,   3,   4,   5,   6,   7,
                 8,   9,  10,  11,  12,  13,  14,  15,
    /* 0x10 */  16,  17,  18,  19,  20,  21,  22,  23,
                24,  25,  26,  27,  28,  29,  30,  31,
    /* 0x20 */  32,  33,  34,  35,  36,  37,  38,  39,
                40,  41,  42,  43,  44,  45,  46,  47,
    /* 0x30 */  48,  49,  50,  51,  52,  53,  54,  55,
                56,  57,  58,  59,  60,  61,  62,  63,
    /* 0x40 */  64,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122,  91,  92,  93,  94,  95,
    /* 0x60 */  96,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122, 123, 124, 125, 126, 127
};

/* ASCII-only predicates.  Each yields the (non-zero) mask bit when the
   character belongs to the category and 0 otherwise.

   The range test is written as !((ch) & ~127): for non-negative values
   this is the same as (ch) < 128, but it is also false for negative
   values of a signed argument, so the table is never indexed out of
   bounds. */
#define SRE_IS_DIGIT(ch)\
    (!((ch) & ~127) ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    (!((ch) & ~127) ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    (!((ch) & ~127) ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    (!((ch) & ~127) ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    (!((ch) & ~127) ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

/* Return the ASCII lower-case form of ch; code points of 128 and above
   are returned unchanged. */
static unsigned int sre_lower(unsigned int ch)
{
    return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
}
155
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000156/* locale-specific character predicates */
/* Range guard: ((ch) & ~255) is zero exactly when an unsigned ch is
 * below 256.  The mask form is used instead of a "<" comparison so that
 * argument types too narrow to hold 256 do not trigger compiler
 * warnings.  Values outside the range are never passed to <ctype.h>. */
#define SRE_LOC_IS_DIGIT(ch) (((ch) & ~255) ? 0 : isdigit((ch)))
#define SRE_LOC_IS_SPACE(ch) (((ch) & ~255) ? 0 : isspace((ch)))
#define SRE_LOC_IS_LINEBREAK(ch) ('\n' == (ch))
#define SRE_LOC_IS_ALNUM(ch) (((ch) & ~255) ? 0 : isalnum((ch)))
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || '_' == (ch))
164
/* Lower-case ch using the C library's tolower().  Only values below 256
   are handed to tolower(); anything larger is returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int) tolower(ch);
}
169
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000170/* unicode-specific character predicates */
171
172#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000173
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000174#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
175#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
176#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000177#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000178#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000179
180static unsigned int sre_lower_unicode(unsigned int ch)
181{
182 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
183}
184
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000185#endif
186
Guido van Rossumb700df92000-03-31 14:59:30 +0000187LOCAL(int)
188sre_category(SRE_CODE category, unsigned int ch)
189{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000190 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000191
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000192 case SRE_CATEGORY_DIGIT:
193 return SRE_IS_DIGIT(ch);
194 case SRE_CATEGORY_NOT_DIGIT:
195 return !SRE_IS_DIGIT(ch);
196 case SRE_CATEGORY_SPACE:
197 return SRE_IS_SPACE(ch);
198 case SRE_CATEGORY_NOT_SPACE:
199 return !SRE_IS_SPACE(ch);
200 case SRE_CATEGORY_WORD:
201 return SRE_IS_WORD(ch);
202 case SRE_CATEGORY_NOT_WORD:
203 return !SRE_IS_WORD(ch);
204 case SRE_CATEGORY_LINEBREAK:
205 return SRE_IS_LINEBREAK(ch);
206 case SRE_CATEGORY_NOT_LINEBREAK:
207 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000208
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000209 case SRE_CATEGORY_LOC_WORD:
210 return SRE_LOC_IS_WORD(ch);
211 case SRE_CATEGORY_LOC_NOT_WORD:
212 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000213
214#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000215 case SRE_CATEGORY_UNI_DIGIT:
216 return SRE_UNI_IS_DIGIT(ch);
217 case SRE_CATEGORY_UNI_NOT_DIGIT:
218 return !SRE_UNI_IS_DIGIT(ch);
219 case SRE_CATEGORY_UNI_SPACE:
220 return SRE_UNI_IS_SPACE(ch);
221 case SRE_CATEGORY_UNI_NOT_SPACE:
222 return !SRE_UNI_IS_SPACE(ch);
223 case SRE_CATEGORY_UNI_WORD:
224 return SRE_UNI_IS_WORD(ch);
225 case SRE_CATEGORY_UNI_NOT_WORD:
226 return !SRE_UNI_IS_WORD(ch);
227 case SRE_CATEGORY_UNI_LINEBREAK:
228 return SRE_UNI_IS_LINEBREAK(ch);
229 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
230 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000231#else
232 case SRE_CATEGORY_UNI_DIGIT:
233 return SRE_IS_DIGIT(ch);
234 case SRE_CATEGORY_UNI_NOT_DIGIT:
235 return !SRE_IS_DIGIT(ch);
236 case SRE_CATEGORY_UNI_SPACE:
237 return SRE_IS_SPACE(ch);
238 case SRE_CATEGORY_UNI_NOT_SPACE:
239 return !SRE_IS_SPACE(ch);
240 case SRE_CATEGORY_UNI_WORD:
241 return SRE_LOC_IS_WORD(ch);
242 case SRE_CATEGORY_UNI_NOT_WORD:
243 return !SRE_LOC_IS_WORD(ch);
244 case SRE_CATEGORY_UNI_LINEBREAK:
245 return SRE_IS_LINEBREAK(ch);
246 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
247 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000248#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000249 }
250 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000251}
252
253/* helpers */
254
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000255static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000256data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000257{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000258 if (state->data_stack) {
259 free(state->data_stack);
260 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000261 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000262 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000263}
264
265static int
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000266data_stack_grow(SRE_STATE* state, int size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000267{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000268 int minsize, cursize;
269 minsize = state->data_stack_base+size;
270 cursize = state->data_stack_size;
271 if (cursize < minsize) {
272 void* stack;
273 cursize = minsize+minsize/4+1024;
274 TRACE(("allocate/grow stack %d\n", cursize));
275 stack = realloc(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000276 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000277 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000278 return SRE_ERROR_MEMORY;
279 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000280 state->data_stack = stack;
281 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000282 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000283 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000284}
285
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000286/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000287
288#define SRE_CHAR unsigned char
289#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000290#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000291#define SRE_CHARSET sre_charset
292#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000293#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000294#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000295#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000296#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000297
298#if defined(HAVE_UNICODE)
299
Guido van Rossumb700df92000-03-31 14:59:30 +0000300#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000301#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000302#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000303
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000304#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000305#undef SRE_SEARCH
306#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000307#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000308#undef SRE_INFO
309#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000310#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000311#undef SRE_AT
312#undef SRE_CHAR
313
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000314/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000315
316#define SRE_CHAR Py_UNICODE
317#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000318#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000319#define SRE_CHARSET sre_ucharset
320#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000321#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000322#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000323#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000324#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000325#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000326
327#endif /* SRE_RECURSIVE */
328
329/* -------------------------------------------------------------------- */
330/* String matching engine */
331
332/* the following section is compiled twice, with different character
333 settings */
334
335LOCAL(int)
336SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
337{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000338 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 int this, that;
Guido van Rossumb700df92000-03-31 14:59:30 +0000341
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000345 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000346 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000348 case SRE_AT_BEGINNING_LINE:
349 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000350 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000351
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000352 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000353 return (((void*) (ptr+1) == state->end &&
354 SRE_IS_LINEBREAK((int) ptr[0])) ||
355 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000356
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000357 case SRE_AT_END_LINE:
358 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000359 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000360
Fredrik Lundh770617b2001-01-14 15:06:11 +0000361 case SRE_AT_END_STRING:
362 return ((void*) ptr == state->end);
363
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000364 case SRE_AT_BOUNDARY:
365 if (state->beginning == state->end)
366 return 0;
367 that = ((void*) ptr > state->beginning) ?
368 SRE_IS_WORD((int) ptr[-1]) : 0;
369 this = ((void*) ptr < state->end) ?
370 SRE_IS_WORD((int) ptr[0]) : 0;
371 return this != that;
Fredrik Lundh80946112000-06-29 18:03:25 +0000372
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000373 case SRE_AT_NON_BOUNDARY:
374 if (state->beginning == state->end)
375 return 0;
376 that = ((void*) ptr > state->beginning) ?
377 SRE_IS_WORD((int) ptr[-1]) : 0;
378 this = ((void*) ptr < state->end) ?
379 SRE_IS_WORD((int) ptr[0]) : 0;
380 return this == that;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000381
382 case SRE_AT_LOC_BOUNDARY:
383 if (state->beginning == state->end)
384 return 0;
385 that = ((void*) ptr > state->beginning) ?
386 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
387 this = ((void*) ptr < state->end) ?
388 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
389 return this != that;
390
391 case SRE_AT_LOC_NON_BOUNDARY:
392 if (state->beginning == state->end)
393 return 0;
394 that = ((void*) ptr > state->beginning) ?
395 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
396 this = ((void*) ptr < state->end) ?
397 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
398 return this == that;
399
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000400#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000401 case SRE_AT_UNI_BOUNDARY:
402 if (state->beginning == state->end)
403 return 0;
404 that = ((void*) ptr > state->beginning) ?
405 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
406 this = ((void*) ptr < state->end) ?
407 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
408 return this != that;
409
410 case SRE_AT_UNI_NON_BOUNDARY:
411 if (state->beginning == state->end)
412 return 0;
413 that = ((void*) ptr > state->beginning) ?
414 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
415 this = ((void*) ptr < state->end) ?
416 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
417 return this == that;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000418#endif
419
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000420 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000421
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000422 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000423}
424
425LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000426SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000427{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000428 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000429
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000430 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000431
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000432 for (;;) {
433 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000434
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000435 case SRE_OP_FAILURE:
436 return !ok;
437
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000438 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000439 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000440 if (ch == set[0])
441 return ok;
442 set++;
443 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000444
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000445 case SRE_OP_CATEGORY:
446 /* <CATEGORY> <code> */
447 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000448 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000449 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000450 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000451
Fredrik Lundh3562f112000-07-02 12:00:07 +0000452 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000453 if (sizeof(SRE_CODE) == 2) {
454 /* <CHARSET> <bitmap> (16 bits per code word) */
455 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
456 return ok;
457 set += 16;
458 }
459 else {
460 /* <CHARSET> <bitmap> (32 bits per code word) */
461 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
462 return ok;
463 set += 8;
464 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000465 break;
466
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000467 case SRE_OP_RANGE:
468 /* <RANGE> <lower> <upper> */
469 if (set[0] <= ch && ch <= set[1])
470 return ok;
471 set += 2;
472 break;
473
474 case SRE_OP_NEGATE:
475 ok = !ok;
476 break;
477
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000478 case SRE_OP_BIGCHARSET:
479 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
480 {
481 int count, block;
482 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000483
484 if (sizeof(SRE_CODE) == 2) {
485 block = ((unsigned char*)set)[ch >> 8];
486 set += 128;
487 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
488 return ok;
489 set += count*16;
490 }
491 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000492 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
493 * warnings when c's type supports only numbers < N+1 */
494 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000495 block = ((unsigned char*)set)[ch >> 8];
496 else
497 block = -1;
498 set += 64;
499 if (block >=0 &&
500 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
501 return ok;
502 set += count*8;
503 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000504 break;
505 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000506
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000507 default:
508 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000509 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000510 return 0;
511 }
512 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000513}
514
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000515LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000516
517LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000518SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000519{
520 SRE_CODE chr;
521 SRE_CHAR* ptr = state->ptr;
522 SRE_CHAR* end = state->end;
523 int i;
524
525 /* adjust end */
526 if (maxcount < end - ptr && maxcount != 65535)
527 end = ptr + maxcount;
528
529 switch (pattern[0]) {
530
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000531 case SRE_OP_IN:
532 /* repeated set */
533 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
534 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
535 ptr++;
536 break;
537
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000538 case SRE_OP_ANY:
539 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000540 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000541 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
542 ptr++;
543 break;
544
545 case SRE_OP_ANY_ALL:
546 /* repeated dot wildcare. skip to the end of the target
547 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000548 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000549 ptr = end;
550 break;
551
552 case SRE_OP_LITERAL:
553 /* repeated literal */
554 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000555 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000556 while (ptr < end && (SRE_CODE) *ptr == chr)
557 ptr++;
558 break;
559
560 case SRE_OP_LITERAL_IGNORE:
561 /* repeated literal */
562 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000563 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000564 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
565 ptr++;
566 break;
567
568 case SRE_OP_NOT_LITERAL:
569 /* repeated non-literal */
570 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000571 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000572 while (ptr < end && (SRE_CODE) *ptr != chr)
573 ptr++;
574 break;
575
576 case SRE_OP_NOT_LITERAL_IGNORE:
577 /* repeated non-literal */
578 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000579 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000580 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
581 ptr++;
582 break;
583
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000584 default:
585 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000586 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000587 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000588 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000589 if (i < 0)
590 return i;
591 if (!i)
592 break;
593 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000594 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
595 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000596 return (SRE_CHAR*) state->ptr - ptr;
597 }
598
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000599 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000600 return ptr - (SRE_CHAR*) state->ptr;
601}
602
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* Test an SRE_OP_INFO block against the current position.  The
       result is the number of SRE_CODE objects to skip when the block
       matches, and 0 when it does not. */

    SRE_CHAR* ptr = state->ptr;
    SRE_CHAR* end = state->end;
    int k;

    /* not enough text left for the minimal length? */
    if (pattern[3] && (end - ptr) < pattern[3])
        return 0;

    /* without a usable literal prefix there is nothing more to check */
    if (!(pattern[2] & SRE_INFO_PREFIX) || pattern[5] <= 1)
        return pattern[0];

    /* <length> <skip> <prefix data> <overlap data> */
    k = 0;
    while (k < pattern[5]) {
        if ((SRE_CODE) ptr[k] != pattern[7 + k])
            return 0;
        k++;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000630
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000631/* The macros below should be used to protect recursive SRE_MATCH()
632 * calls that *failed* and do *not* return immediately (IOW, those
633 * that will backtrack). Explaining:
634 *
635 * - Recursive SRE_MATCH() returned true: that's usually a success
636 * (besides atypical cases like ASSERT_NOT), therefore there's no
637 * reason to restore lastmark;
638 *
639 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
640 * is returning to the caller: If the current SRE_MATCH() is the
641 * top function of the recursion, returning false will be a matching
642 * failure, and it doesn't matter where lastmark is pointing to.
643 * If it's *not* the top function, it will be a recursive SRE_MATCH()
644 * failure by itself, and the calling SRE_MATCH() will have to deal
645 * with the failure by the same rules explained here (it will restore
646 * lastmark by itself if necessary);
647 *
648 * - Recursive SRE_MATCH() returned false, and will continue the
649 * outside 'for' loop: must be protected when breaking, since the next
650 * OP could potentially depend on lastmark;
651 *
652 * - Recursive SRE_MATCH() returned false, and will be called again
653 * inside a local for/while loop: must be protected between each
654 * loop iteration, since the recursive SRE_MATCH() could do anything,
655 * and could potentially depend on lastmark.
656 *
657 * For more information, check the discussion at SF patch #712900.
658 */
/* Copy the current state->lastmark and state->lastindex into the active
 * match context `ctx`, so they can be put back after a failed attempt.
 * Expects `ctx` and `state` to be in scope at the point of use. */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000659#define LASTMARK_SAVE() \
660 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000661 ctx->lastmark = state->lastmark; \
662 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000663 } while (0)
/* Write the values stored in `ctx` by LASTMARK_SAVE() back into
 * state->lastmark and state->lastindex.  Only these two integers are
 * restored; the contents of state->mark[] are not touched here. */
664#define LASTMARK_RESTORE() \
665 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000666 state->lastmark = ctx->lastmark; \
667 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000668 } while (0)
669
/* Control-flow helpers for the matching function.
 *
 * They expand inside a function that returns int and that provides a
 * local variable `ret` and a label `exit:`.
 *
 *   RETURN_ERROR(i)       return the value i from the enclosing function
 *                         right away (does NOT go through the exit label)
 *   RETURN_FAILURE        set ret to 0 and jump to the exit label
 *   RETURN_SUCCESS        set ret to 1 and jump to the exit label
 *   RETURN_ON_ERROR(i)    RETURN_ERROR(i) if i is negative
 *   RETURN_ON_SUCCESS(i)  RETURN_ERROR(i) if i is negative, RETURN_SUCCESS
 *                         if i is positive, otherwise fall through
 *   RETURN_ON_FAILURE(i)  RETURN_ERROR(i) if i is negative, RETURN_FAILURE
 *                         if i is zero, otherwise fall through
 *
 * The argument is parenthesized at every use so that a compound
 * expression (for example `c ? 1 : 0`) is tested as a whole.  It can
 * still be evaluated more than once, so pass an expression without side
 * effects.
 */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
680
/* Stringify a macro argument; used to print type names in TRACE output. */
#define SFY(x) #x

/*
 * Data stack primitives.
 *
 * All of them operate on the byte buffer state->data_stack, whose used
 * part ends at offset state->data_stack_base and whose capacity is
 * state->data_stack_size.  Macro arguments are parenthesized at every
 * use, but `size` and `state` are evaluated several times: pass
 * expressions without side effects.
 *
 * DATA_STACK_ALLOC and DATA_STACK_PUSH additionally rely on these names
 * being in scope where they are expanded: `alloc_pos` (ALLOC only),
 * `ctx_pos`, `ctx`, the type SRE_MATCH_CONTEXT and data_stack_grow().
 * When the buffer is too small they call data_stack_grow(); a negative
 * result is returned from the enclosing function as-is.  After a
 * successful grow, `ctx` is looked up again at `ctx_pos` (unless ctx_pos
 * is -1) -- presumably because growing may move the buffer; confirm
 * against data_stack_grow().
 */

/* Reserve sizeof(type) bytes on top of the stack.  The offset of the new
   item is left in `alloc_pos` and its address, cast to type*, in `ptr`. */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), alloc_pos, (int) sizeof(type))); \
    if ((state)->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    ptr = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)

/* Set `ptr` to the item of the given type stored at byte offset `pos`. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), (pos))); \
    ptr = (type*)((state)->data_stack+(pos)); \
} while (0)

/* Append `size` bytes read from `data` to the top of the stack. */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           (void *)(data), (state)->data_stack_base, (int)(size))); \
    if ((state)->data_stack_size < (state)->data_stack_base+(size)) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)

/* Copy the topmost `size` bytes of the stack into `data`.  The bytes are
   removed from the stack only when `discard` is non-zero. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           (void *)(data), (int)((state)->data_stack_base-(size)), \
           (int)(size))); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), \
           (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* Remove the topmost `size` bytes of the stack without copying them. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           (int)((state)->data_stack_base-(size)), (int)(size))); \
    (state)->data_stack_base -= (size); \
} while(0)
733
/* Shorthands for the DATA_STACK_* macros that always use the variable
 * named `state` found at the point of use.
 *
 *   DATA_PUSH(x)          push the object x points to (sizeof(*(x)) bytes)
 *   DATA_POP(x)           copy the top sizeof(*(x)) bytes into *x and
 *                         remove them from the stack (discard flag is 1)
 *   DATA_POP_DISCARD(x)   remove the top sizeof(*(x)) bytes, no copy;
 *                         x is only used to compute the size
 *   DATA_ALLOC(t,p)       reserve room for one object of type t, leaving
 *                         its address in p
 *   DATA_LOOKUP_AT(t,p,pos)  point p at the object of type t stored at
 *                         stack offset pos
 */
734#define DATA_PUSH(x) \
735 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
736#define DATA_POP(x) \
737 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000738#define DATA_POP_DISCARD(x) \
739 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
740#define DATA_ALLOC(t,p) \
741 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000742#define DATA_LOOKUP_AT(t,p,pos) \
743 DATA_STACK_LOOKUP_AT(state,t,p,pos)
744
/* Save / restore the state->mark array on the data stack.
 *
 * Each macro handles (lastmark+1) pointer-sized entries starting at
 * state->mark, and does nothing at all when lastmark is not greater
 * than 0.  A push must be paired with a pop that is given the same
 * lastmark value, since the byte count is recomputed from the argument.
 *
 *   MARK_PUSH(lastmark)         copy the marks onto the data stack.  The
 *                               count is first copied into the caller's
 *                               local `i` (which is overwritten); see the
 *                               inline comment for the reason.
 *   MARK_POP(lastmark)          copy the saved marks back into
 *                               state->mark and drop them from the stack
 *   MARK_POP_KEEP(lastmark)     copy the saved marks back but leave them
 *                               on the stack (discard flag is 0)
 *   MARK_POP_DISCARD(lastmark)  drop the saved marks without copying
 *
 * Expects `state` (and, for MARK_PUSH, an integer `i`) to be in scope.
 */
745#define MARK_PUSH(lastmark) \
746 do if (lastmark > 0) { \
747 i = lastmark; /* ctx->lastmark may change if reallocated */ \
748 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
749 } while (0)
750#define MARK_POP(lastmark) \
751 do if (lastmark > 0) { \
752 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
753 } while (0)
754#define MARK_POP_KEEP(lastmark) \
755 do if (lastmark > 0) { \
756 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
757 } while (0)
758#define MARK_POP_DISCARD(lastmark) \
759 do if (lastmark > 0) { \
760 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
761 } while (0)
762
/* Resume-point identifiers.  DO_JUMP() stores one of these in the `jump`
 * field of the context it creates; when that context finishes, the code
 * after the `exit:` label of SRE_MATCH() switches on the stored value and
 * jumps to the label of the matching name (e.g. JUMP_BRANCH resumes at
 * jump_branch).  JUMP_NONE is the value given to the outermost context,
 * which has nothing to resume. */
763#define JUMP_NONE 0
764#define JUMP_MAX_UNTIL_1 1
765#define JUMP_MAX_UNTIL_2 2
766#define JUMP_MAX_UNTIL_3 3
767#define JUMP_MIN_UNTIL_1 4
768#define JUMP_MIN_UNTIL_2 5
769#define JUMP_MIN_UNTIL_3 6
770#define JUMP_REPEAT 7
771#define JUMP_REPEAT_ONE_1 8
772#define JUMP_REPEAT_ONE_2 9
773#define JUMP_MIN_REPEAT_ONE 10
774#define JUMP_BRANCH 11
775#define JUMP_ASSERT 12
776#define JUMP_ASSERT_NOT 13
777
/* Start matching a sub-pattern without a recursive C call.
 *
 *   jumpvalue    JUMP_* code stored in the new context; selects where to
 *                resume once the sub-match has finished
 *   jumplabel    label defined by this expansion; execution continues
 *                here after the sub-match, with `ret` holding its result
 *   nextpattern  pattern code the sub-match starts at.  It is evaluated
 *                after DATA_ALLOC, i.e. after `ctx` may have been looked
 *                up again.
 *
 * A new SRE_MATCH_CONTEXT is allocated on the data stack, linked to the
 * current one through last_ctx_pos, made current (ctx / ctx_pos), and
 * control jumps to the `entrance` label.
 *
 * The expansion is a sequence of statements, not a single do/while
 * statement, so it must not be used as the unbraced body of an
 * if/else/loop.  The trailing `while (0)` gives the label a statement to
 * attach to; the semicolon written after DO_JUMP(...) completes it.
 */
778#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
779 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
780 nextctx->last_ctx_pos = ctx_pos; \
781 nextctx->jump = jumpvalue; \
782 nextctx->pattern = nextpattern; \
783 ctx_pos = alloc_pos; \
784 ctx = nextctx; \
785 goto entrance; \
786 jumplabel: \
787 while (0) /* gcc doesn't like labels at end of scopes */ \
788
/* Working set of one (sub-)match performed by SRE_MATCH().  Instances are
 * stored on the data stack: one for the top-level call and one more for
 * every DO_JUMP() that is in progress. */
789typedef struct {
790 int last_ctx_pos; /* data stack offset of the context that started this one; -1 for the outermost */
791 int jump; /* JUMP_* code telling where to resume in the parent context */
792 SRE_CHAR* ptr; /* current position in the string being matched */
793 SRE_CODE* pattern; /* current position in the pattern code */
794 int count; /* repetition counter used by the repeat operators */
795 int lastmark; /* copy of state->lastmark made by LASTMARK_SAVE() */
796 int lastindex; /* copy of state->lastindex made by LASTMARK_SAVE() */
797 union {
798 SRE_CODE chr; /* literal that starts the tail (REPEAT_ONE fast path) */
799 SRE_REPEAT* rep; /* repeat context in use (BRANCH, REPEAT, *_UNTIL) */
800 } u;
801} SRE_MATCH_CONTEXT;
802
803/* check if string matches the given pattern. returns <0 for
804 error, 0 for failure, and 1 for success */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000805LOCAL(int)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000806SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000807{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000808 SRE_CHAR* end = state->end;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000809 int alloc_pos, ctx_pos = -1;
810 int i, ret = 0;
811 int jump;
Guido van Rossumb700df92000-03-31 14:59:30 +0000812
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000813 SRE_MATCH_CONTEXT* ctx;
814 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000815
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000816 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000817
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000818 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
819 ctx->last_ctx_pos = -1;
820 ctx->jump = JUMP_NONE;
821 ctx->pattern = pattern;
822 ctx_pos = alloc_pos;
823
824entrance:
825
826 ctx->ptr = state->ptr;
827
828 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000829 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000830 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000832 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000833 (end - ctx->ptr), ctx->pattern[3]));
834 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000835 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000836 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000837 }
838
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000839 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000840
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000841 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000842
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000843 case SRE_OP_MARK:
844 /* set mark */
845 /* <MARK> <gid> */
846 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
847 ctx->ptr, ctx->pattern[0]));
848 i = ctx->pattern[0];
849 if (i & 1)
850 state->lastindex = i/2 + 1;
851 if (i > state->lastmark) {
852 /* state->lastmark is the highest valid index in the
853 state->mark array. If it is increased by more than 1,
854 the intervening marks must be set to NULL to signal
855 that these marks have not been encountered. */
856 int j = state->lastmark + 1;
857 while (j < i)
858 state->mark[j++] = NULL;
859 state->lastmark = i;
860 }
861 state->mark[i] = ctx->ptr;
862 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000863 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000864
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000865 case SRE_OP_LITERAL:
866 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000867 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000868 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
869 ctx->ptr, *ctx->pattern));
870 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
871 RETURN_FAILURE;
872 ctx->pattern++;
873 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000874 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000875
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000876 case SRE_OP_NOT_LITERAL:
877 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000878 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000879 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
880 ctx->ptr, *ctx->pattern));
881 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
882 RETURN_FAILURE;
883 ctx->pattern++;
884 ctx->ptr++;
885 break;
886
887 case SRE_OP_SUCCESS:
888 /* end of pattern */
889 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
890 state->ptr = ctx->ptr;
891 RETURN_SUCCESS;
892
893 case SRE_OP_AT:
894 /* match at given position */
895 /* <AT> <code> */
896 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
897 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
898 RETURN_FAILURE;
899 ctx->pattern++;
900 break;
901
902 case SRE_OP_CATEGORY:
903 /* match at given category */
904 /* <CATEGORY> <code> */
905 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
906 ctx->ptr, *ctx->pattern));
907 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
908 RETURN_FAILURE;
909 ctx->pattern++;
910 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000911 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000912
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000913 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000914 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000915 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000916 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
917 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
918 RETURN_FAILURE;
919 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000920 break;
921
922 case SRE_OP_ANY_ALL:
923 /* match anything */
924 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000925 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
926 if (ctx->ptr >= end)
927 RETURN_FAILURE;
928 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000929 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000930
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000931 case SRE_OP_IN:
932 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000933 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000934 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
935 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
936 RETURN_FAILURE;
937 ctx->pattern += ctx->pattern[0];
938 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000939 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000940
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000941 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000942 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
943 ctx->pattern, ctx->ptr, ctx->pattern[0]));
944 if (ctx->ptr >= end ||
945 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
946 RETURN_FAILURE;
947 ctx->pattern++;
948 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000949 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000950
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000951 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000952 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
953 ctx->pattern, ctx->ptr, *ctx->pattern));
954 if (ctx->ptr >= end ||
955 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
956 RETURN_FAILURE;
957 ctx->pattern++;
958 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000959 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000960
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000961 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000962 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
963 if (ctx->ptr >= end
964 || !SRE_CHARSET(ctx->pattern+1,
965 (SRE_CODE)state->lower(*ctx->ptr)))
966 RETURN_FAILURE;
967 ctx->pattern += ctx->pattern[0];
968 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000969 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000970
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000971 case SRE_OP_JUMP:
972 case SRE_OP_INFO:
973 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000974 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000975 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
976 ctx->ptr, ctx->pattern[0]));
977 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000978 break;
979
980 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000981 /* alternation */
982 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000983 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000984 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000985 ctx->u.rep = state->repeat;
986 if (ctx->u.rep)
987 MARK_PUSH(ctx->lastmark);
988 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
989 if (ctx->pattern[1] == SRE_OP_LITERAL &&
990 (ctx->ptr >= end ||
991 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000992 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000993 if (ctx->pattern[1] == SRE_OP_IN &&
994 (ctx->ptr >= end ||
995 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000996 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000997 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000998 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000999 if (ret) {
1000 if (ctx->u.rep)
1001 MARK_POP_DISCARD(ctx->lastmark);
1002 RETURN_ON_ERROR(ret);
1003 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001004 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001005 if (ctx->u.rep)
1006 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001007 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001008 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001009 if (ctx->u.rep)
1010 MARK_POP_DISCARD(ctx->lastmark);
1011 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001012
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001013 case SRE_OP_REPEAT_ONE:
1014 /* match repeated sequence (maximizing regexp) */
1015
1016 /* this operator only works if the repeated item is
1017 exactly one character wide, and we're not already
1018 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001019 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001020
1021 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1022
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001023 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1024 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001025
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001026 if (ctx->ptr + ctx->pattern[1] > end)
1027 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001028
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001029 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001030
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001031 ctx->count = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001032 RETURN_ON_ERROR(ctx->count);
Fredrik Lundhe1869832000-08-01 22:47:49 +00001033
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035
1036 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001037 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001038 string. check if the rest of the pattern matches,
1039 and backtrack if not. */
1040
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001041 if (ctx->count < (int) ctx->pattern[1])
1042 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001043
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001044 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001045 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001046 state->ptr = ctx->ptr;
1047 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001048 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001049
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001050 LASTMARK_SAVE();
1051
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001052 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001053 /* tail starts with a literal. skip positions where
1054 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001055 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001056 for (;;) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001057 while (ctx->count >= (int) ctx->pattern[1] &&
1058 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1059 ctx->ptr--;
1060 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001061 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001062 if (ctx->count < (int) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001063 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001065 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1066 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001067 if (ret) {
1068 RETURN_ON_ERROR(ret);
1069 RETURN_SUCCESS;
1070 }
1071
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001072 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073
1074 ctx->ptr--;
1075 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001076 }
1077
1078 } else {
1079 /* general case */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001080 while (ctx->count >= (int) ctx->pattern[1]) {
1081 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001082 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1083 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001084 if (ret) {
1085 RETURN_ON_ERROR(ret);
1086 RETURN_SUCCESS;
1087 }
1088 ctx->ptr--;
1089 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001090 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001091 }
1092 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001093 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001094
Guido van Rossum41c99e72003-04-14 17:59:34 +00001095 case SRE_OP_MIN_REPEAT_ONE:
1096 /* match repeated sequence (minimizing regexp) */
1097
1098 /* this operator only works if the repeated item is
1099 exactly one character wide, and we're not already
1100 collecting backtracking points. for other cases,
1101 use the MIN_REPEAT operator */
1102
1103 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1104
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001105 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1106 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001107
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001108 if (ctx->ptr + ctx->pattern[1] > end)
1109 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001110
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001111 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001112
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001113 if (ctx->pattern[1] == 0)
1114 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001115 else {
1116 /* count using pattern min as the maximum */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001117 ctx->count = SRE_COUNT(state, ctx->pattern+3,
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001118 ctx->pattern[1]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001119 RETURN_ON_ERROR(ctx->count);
1120 if (ctx->count < (int) ctx->pattern[1])
1121 /* didn't match minimum number of times */
1122 RETURN_FAILURE;
1123 /* advance past minimum matches of repeat */
1124 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001125 }
1126
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001129 state->ptr = ctx->ptr;
1130 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131
1132 } else {
1133 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001134 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001135 while ((int)ctx->pattern[2] == 65535
1136 || ctx->count <= (int)ctx->pattern[2]) {
1137 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001138 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1139 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 if (ret) {
1141 RETURN_ON_ERROR(ret);
1142 RETURN_SUCCESS;
1143 }
1144 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001145 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 RETURN_ON_ERROR(ret);
1147 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001148 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001149 assert(ret == 1);
1150 ctx->ptr++;
1151 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001152 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001153 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001154 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001155 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001156
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001157 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001158 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001159 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001160 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001161 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1162 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001163
1164 /* install new repeat context */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001165 ctx->u.rep = (SRE_REPEAT*) malloc(sizeof(*ctx->u.rep));
1166 ctx->u.rep->count = -1;
1167 ctx->u.rep->pattern = ctx->pattern;
1168 ctx->u.rep->prev = state->repeat;
1169 ctx->u.rep->last_ptr = NULL;
1170 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001171
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001172 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001173 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001174 state->repeat = ctx->u.rep->prev;
1175 free(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001176
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001177 if (ret) {
1178 RETURN_ON_ERROR(ret);
1179 RETURN_SUCCESS;
1180 }
1181 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001182
1183 case SRE_OP_MAX_UNTIL:
1184 /* maximizing repeat */
1185 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1186
1187 /* FIXME: we probably need to deal with zero-width
1188 matches in here... */
1189
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001190 ctx->u.rep = state->repeat;
1191 if (!ctx->u.rep)
1192 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001193
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001194 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001195
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001196 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001197
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001198 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1199 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001200
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001201 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001202 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001203 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1205 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 if (ret) {
1207 RETURN_ON_ERROR(ret);
1208 RETURN_SUCCESS;
1209 }
1210 ctx->u.rep->count = ctx->count-1;
1211 state->ptr = ctx->ptr;
1212 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001213 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001214
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001215 if ((ctx->count < ctx->u.rep->pattern[2] ||
1216 ctx->u.rep->pattern[2] == 65535) &&
1217 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001218 /* we may have enough matches, but if we can
1219 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001220 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001221 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001222 MARK_PUSH(ctx->lastmark);
1223 /* zero-width match protection */
1224 DATA_PUSH(&ctx->u.rep->last_ptr);
1225 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001226 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1227 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001228 DATA_POP(&ctx->u.rep->last_ptr);
1229 if (ret) {
1230 MARK_POP_DISCARD(ctx->lastmark);
1231 RETURN_ON_ERROR(ret);
1232 RETURN_SUCCESS;
1233 }
1234 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001235 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001236 ctx->u.rep->count = ctx->count-1;
1237 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001238 }
1239
1240 /* cannot match more repeated items here. make sure the
1241 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001242 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001243 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001244 RETURN_ON_SUCCESS(ret);
1245 state->repeat = ctx->u.rep;
1246 state->ptr = ctx->ptr;
1247 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001248
1249 case SRE_OP_MIN_UNTIL:
1250 /* minimizing repeat */
1251 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1252
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001253 ctx->u.rep = state->repeat;
1254 if (!ctx->u.rep)
1255 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001256
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001257 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001258
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001259 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001260
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001261 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1262 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001263
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001264 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001265 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001266 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001267 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1268 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 if (ret) {
1270 RETURN_ON_ERROR(ret);
1271 RETURN_SUCCESS;
1272 }
1273 ctx->u.rep->count = ctx->count-1;
1274 state->ptr = ctx->ptr;
1275 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001276 }
1277
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001278 LASTMARK_SAVE();
1279
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001280 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001281 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001282 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001283 if (ret) {
1284 RETURN_ON_ERROR(ret);
1285 RETURN_SUCCESS;
1286 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001287
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001288 state->repeat = ctx->u.rep;
1289 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001290
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001291 LASTMARK_RESTORE();
1292
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001293 if (ctx->count >= ctx->u.rep->pattern[2]
1294 && ctx->u.rep->pattern[2] != 65535)
1295 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001296
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001297 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001298 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1299 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001300 if (ret) {
1301 RETURN_ON_ERROR(ret);
1302 RETURN_SUCCESS;
1303 }
1304 ctx->u.rep->count = ctx->count-1;
1305 state->ptr = ctx->ptr;
1306 RETURN_FAILURE;
1307
1308 case SRE_OP_GROUPREF:
1309 /* match backreference */
1310 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1311 ctx->ptr, ctx->pattern[0]));
1312 i = ctx->pattern[0];
1313 {
1314 int groupref = i+i;
1315 if (groupref >= state->lastmark) {
1316 RETURN_FAILURE;
1317 } else {
1318 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1319 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1320 if (!p || !e || e < p)
1321 RETURN_FAILURE;
1322 while (p < e) {
1323 if (ctx->ptr >= end || *ctx->ptr != *p)
1324 RETURN_FAILURE;
1325 p++; ctx->ptr++;
1326 }
1327 }
1328 }
1329 ctx->pattern++;
1330 break;
1331
1332 case SRE_OP_GROUPREF_IGNORE:
1333 /* match backreference */
1334 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1335 ctx->ptr, ctx->pattern[0]));
1336 i = ctx->pattern[0];
1337 {
1338 int groupref = i+i;
1339 if (groupref >= state->lastmark) {
1340 RETURN_FAILURE;
1341 } else {
1342 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1343 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1344 if (!p || !e || e < p)
1345 RETURN_FAILURE;
1346 while (p < e) {
1347 if (ctx->ptr >= end ||
1348 state->lower(*ctx->ptr) != state->lower(*p))
1349 RETURN_FAILURE;
1350 p++; ctx->ptr++;
1351 }
1352 }
1353 }
1354 ctx->pattern++;
1355 break;
1356
1357 case SRE_OP_GROUPREF_EXISTS:
1358 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1359 ctx->ptr, ctx->pattern[0]));
1360 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1361 i = ctx->pattern[0];
1362 {
1363 int groupref = i+i;
1364 if (groupref >= state->lastmark) {
1365 ctx->pattern += ctx->pattern[1];
1366 break;
1367 } else {
1368 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1369 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1370 if (!p || !e || e < p) {
1371 ctx->pattern += ctx->pattern[1];
1372 break;
1373 }
1374 }
1375 }
1376 ctx->pattern += 2;
1377 break;
1378
1379 case SRE_OP_ASSERT:
1380 /* assert subpattern */
1381 /* <ASSERT> <skip> <back> <pattern> */
1382 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1383 ctx->ptr, ctx->pattern[1]));
1384 state->ptr = ctx->ptr - ctx->pattern[1];
1385 if (state->ptr < state->beginning)
1386 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001387 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001388 RETURN_ON_FAILURE(ret);
1389 ctx->pattern += ctx->pattern[0];
1390 break;
1391
1392 case SRE_OP_ASSERT_NOT:
1393 /* assert not subpattern */
1394 /* <ASSERT_NOT> <skip> <back> <pattern> */
1395 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1396 ctx->ptr, ctx->pattern[1]));
1397 state->ptr = ctx->ptr - ctx->pattern[1];
1398 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001399 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001400 if (ret) {
1401 RETURN_ON_ERROR(ret);
1402 RETURN_FAILURE;
1403 }
1404 }
1405 ctx->pattern += ctx->pattern[0];
1406 break;
1407
1408 case SRE_OP_FAILURE:
1409 /* immediate failure */
1410 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1411 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001412
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001413 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001414 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1415 ctx->pattern[-1]));
1416 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001417 }
1418 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001419
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001420exit:
1421 ctx_pos = ctx->last_ctx_pos;
1422 jump = ctx->jump;
1423 DATA_POP_DISCARD(ctx);
1424 if (ctx_pos == -1)
1425 return ret;
1426 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1427
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001428 switch (jump) {
1429 case JUMP_MAX_UNTIL_2:
1430 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1431 goto jump_max_until_2;
1432 case JUMP_MAX_UNTIL_3:
1433 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1434 goto jump_max_until_3;
1435 case JUMP_MIN_UNTIL_2:
1436 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1437 goto jump_min_until_2;
1438 case JUMP_MIN_UNTIL_3:
1439 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1440 goto jump_min_until_3;
1441 case JUMP_BRANCH:
1442 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1443 goto jump_branch;
1444 case JUMP_MAX_UNTIL_1:
1445 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1446 goto jump_max_until_1;
1447 case JUMP_MIN_UNTIL_1:
1448 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1449 goto jump_min_until_1;
1450 case JUMP_REPEAT:
1451 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1452 goto jump_repeat;
1453 case JUMP_REPEAT_ONE_1:
1454 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1455 goto jump_repeat_one_1;
1456 case JUMP_REPEAT_ONE_2:
1457 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1458 goto jump_repeat_one_2;
1459 case JUMP_MIN_REPEAT_ONE:
1460 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1461 goto jump_min_repeat_one;
1462 case JUMP_ASSERT:
1463 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1464 goto jump_assert;
1465 case JUMP_ASSERT_NOT:
1466 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1467 goto jump_assert_not;
1468 case JUMP_NONE:
1469 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1470 break;
1471 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001472
1473 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001474}
1475
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001476LOCAL(int)
Guido van Rossumb700df92000-03-31 14:59:30 +00001477SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1478{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001479 SRE_CHAR* ptr = state->start;
1480 SRE_CHAR* end = state->end;
1481 int status = 0;
Fredrik Lundh28552902000-07-05 21:14:16 +00001482 int prefix_len = 0;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001483 int prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001484 SRE_CODE* prefix = NULL;
1485 SRE_CODE* charset = NULL;
1486 SRE_CODE* overlap = NULL;
1487 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001488
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001489 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001490 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001491 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001492
1493 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001494
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001495 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001496 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001497 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001498 end -= pattern[3]-1;
1499 if (end <= ptr)
1500 end = ptr+1;
1501 }
1502
Fredrik Lundh3562f112000-07-02 12:00:07 +00001503 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001504 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001505 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001506 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001507 prefix_skip = pattern[6];
1508 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001509 overlap = prefix + prefix_len - 1;
1510 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001511 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001512 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001513 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001514
1515 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001516 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001517
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001518 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1519 TRACE(("charset = %p\n", charset));
1520
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001521#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001522 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001523 /* pattern starts with a known prefix. use the overlap
1524 table to skip forward as fast as we possibly can */
1525 int i = 0;
1526 end = state->end;
1527 while (ptr < end) {
1528 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001529 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001530 if (!i)
1531 break;
1532 else
1533 i = overlap[i];
1534 } else {
1535 if (++i == prefix_len) {
1536 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001537 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1538 state->start = ptr + 1 - prefix_len;
1539 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001540 if (flags & SRE_INFO_LITERAL)
1541 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001542 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001543 if (status != 0)
1544 return status;
1545 /* close but no cigar -- try again */
1546 i = overlap[i];
1547 }
1548 break;
1549 }
1550
1551 }
1552 ptr++;
1553 }
1554 return 0;
1555 }
1556#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001557
Fredrik Lundh3562f112000-07-02 12:00:07 +00001558 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001559 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001560 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001561 SRE_CODE chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001562 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 for (;;) {
1564 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1565 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001566 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001567 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001568 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001569 state->start = ptr;
1570 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001571 if (flags & SRE_INFO_LITERAL)
1572 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001573 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001574 if (status != 0)
1575 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001576 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001577 } else if (charset) {
1578 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001579 end = state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001581 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001582 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001583 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001585 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 state->start = ptr;
1587 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001588 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001589 if (status != 0)
1590 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001591 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001592 }
1593 } else
1594 /* general case */
1595 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001596 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001597 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001598 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 if (status != 0)
1600 break;
1601 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001602
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001603 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001604}
Fredrik Lundh3562f112000-07-02 12:00:07 +00001605
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001606LOCAL(int)
1607SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1608{
1609 /* check if given string is a literal template (i.e. no escapes) */
1610 while (len-- > 0)
1611 if (*ptr++ == '\\')
1612 return 0;
1613 return 1;
1614}
Guido van Rossumb700df92000-03-31 14:59:30 +00001615
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001616#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001617
1618/* -------------------------------------------------------------------- */
1619/* factories and destructors */
1620
1621/* see sre.h for object declarations */
1622
Jeremy Hylton938ace62002-07-17 16:30:39 +00001623static PyTypeObject Pattern_Type;
1624static PyTypeObject Match_Type;
1625static PyTypeObject Scanner_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00001626
1627static PyObject *
1628_compile(PyObject* self_, PyObject* args)
1629{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001630 /* "compile" pattern descriptor to pattern object */
Guido van Rossumb700df92000-03-31 14:59:30 +00001631
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001632 PatternObject* self;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001633 int i, n;
Guido van Rossumb700df92000-03-31 14:59:30 +00001634
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001635 PyObject* pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001636 int flags = 0;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001637 PyObject* code;
1638 int groups = 0;
1639 PyObject* groupindex = NULL;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001640 PyObject* indexgroup = NULL;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001641 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1642 &PyList_Type, &code, &groups,
1643 &groupindex, &indexgroup))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001644 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001645
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001646 n = PyList_GET_SIZE(code);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001647
Fredrik Lundhebc37b22000-10-28 19:30:41 +00001648 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001649 if (!self)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001650 return NULL;
Fredrik Lundh6f013982000-07-03 18:44:21 +00001651
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001652 self->codesize = n;
1653
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001654 for (i = 0; i < n; i++) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001655 PyObject *o = PyList_GET_ITEM(code, i);
Martin v. Löwis78e2f062003-04-19 12:56:08 +00001656 if (PyInt_Check(o))
1657 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1658 else
1659 self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001660 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001661
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001662 if (PyErr_Occurred()) {
1663 PyObject_DEL(self);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001664 return NULL;
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001665 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001666
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001667 Py_INCREF(pattern);
1668 self->pattern = pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00001669
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001670 self->flags = flags;
1671
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001672 self->groups = groups;
Guido van Rossumb700df92000-03-31 14:59:30 +00001673
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001674 Py_XINCREF(groupindex);
1675 self->groupindex = groupindex;
Guido van Rossumb700df92000-03-31 14:59:30 +00001676
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001677 Py_XINCREF(indexgroup);
1678 self->indexgroup = indexgroup;
Fredrik Lundhc2301732000-07-02 22:25:39 +00001679
Raymond Hettinger027bb632004-05-31 03:09:25 +00001680 self->weakreflist = NULL;
1681
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001682 return (PyObject*) self;
Guido van Rossumb700df92000-03-31 14:59:30 +00001683}
1684
1685static PyObject *
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001686sre_codesize(PyObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001687{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001688 return Py_BuildValue("i", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001689}
1690
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001691static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001692sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001693{
1694 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001695 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001696 return NULL;
1697 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001698 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001699 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001700#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001701 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001702#else
1703 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001704#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001705 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001706}
1707
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001708LOCAL(void)
1709state_reset(SRE_STATE* state)
1710{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001711 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001712 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001713
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001714 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001715 state->lastindex = -1;
1716
1717 state->repeat = NULL;
1718
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001719 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001720}
1721
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001722static void*
1723getstring(PyObject* string, int* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001724{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001725 /* given a python object, return a data pointer, a length (in
1726 characters), and a character size. return NULL if the object
1727 is not a string (or not compatible) */
1728
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001729 PyBufferProcs *buffer;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001730 int size, bytes, charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001731 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001732
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001733#if defined(HAVE_UNICODE)
1734 if (PyUnicode_Check(string)) {
1735 /* unicode strings doesn't always support the buffer interface */
1736 ptr = (void*) PyUnicode_AS_DATA(string);
1737 bytes = PyUnicode_GET_DATA_SIZE(string);
1738 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001739 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001740
1741 } else {
1742#endif
1743
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001744 /* get pointer to string buffer */
1745 buffer = string->ob_type->tp_as_buffer;
1746 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1747 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001748 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001749 return NULL;
1750 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001751
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001752 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001753 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1754 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001755 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1756 return NULL;
1757 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001758
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001759 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001760#if PY_VERSION_HEX >= 0x01060000
1761 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001762#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001763 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001764#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001765
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001766 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001767 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001768#if defined(HAVE_UNICODE)
1769 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001770 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001771#endif
1772 else {
1773 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1774 return NULL;
1775 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001776
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001777#if defined(HAVE_UNICODE)
1778 }
1779#endif
1780
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001781 *p_length = size;
1782 *p_charsize = charsize;
1783
1784 return ptr;
1785}
1786
1787LOCAL(PyObject*)
1788state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1789 int start, int end)
1790{
1791 /* prepare state object */
1792
1793 int length;
1794 int charsize;
1795 void* ptr;
1796
1797 memset(state, 0, sizeof(SRE_STATE));
1798
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001799 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001800 state->lastindex = -1;
1801
1802 ptr = getstring(string, &length, &charsize);
1803 if (!ptr)
1804 return NULL;
1805
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001806 /* adjust boundaries */
1807 if (start < 0)
1808 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001809 else if (start > length)
1810 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001811
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001812 if (end < 0)
1813 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001814 else if (end > length)
1815 end = length;
1816
1817 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001818
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001819 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001820
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001821 state->start = (void*) ((char*) ptr + start * state->charsize);
1822 state->end = (void*) ((char*) ptr + end * state->charsize);
1823
1824 Py_INCREF(string);
1825 state->string = string;
1826 state->pos = start;
1827 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001828
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001829 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001830 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001831 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001832#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001833 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001834#else
1835 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001836#endif
1837 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001838 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001839
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001840 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001841}
1842
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001843LOCAL(void)
1844state_fini(SRE_STATE* state)
1845{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001846 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001847 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001848}
1849
/* convert a pointer into the subject buffer to a character index:
   the byte distance from state->beginning divided by the character
   size */
#define STATE_OFFSET(state, member) \
    ((((char*) (member)) - ((char*) (state)->beginning)) / (state)->charsize)
1853
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001854LOCAL(PyObject*)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001855state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001856{
Fredrik Lundh58100642000-08-09 09:14:35 +00001857 int i, j;
1858
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001859 index = (index - 1) * 2;
1860
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001861 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001862 if (empty)
1863 /* want empty string */
1864 i = j = 0;
1865 else {
1866 Py_INCREF(Py_None);
1867 return Py_None;
1868 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001869 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001870 i = STATE_OFFSET(state, state->mark[index]);
1871 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001872 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001873
Fredrik Lundh58100642000-08-09 09:14:35 +00001874 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001875}
1876
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001877static void
1878pattern_error(int status)
1879{
1880 switch (status) {
1881 case SRE_ERROR_RECURSION_LIMIT:
1882 PyErr_SetString(
1883 PyExc_RuntimeError,
1884 "maximum recursion limit exceeded"
1885 );
1886 break;
1887 case SRE_ERROR_MEMORY:
1888 PyErr_NoMemory();
1889 break;
1890 default:
1891 /* other error codes indicate compiler/engine bugs */
1892 PyErr_SetString(
1893 PyExc_RuntimeError,
1894 "internal error in regular expression engine"
1895 );
1896 }
1897}
1898
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001899static PyObject*
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001900pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001901{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001902 /* create match object (from state object) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001903
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001904 MatchObject* match;
1905 int i, j;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001906 char* base;
1907 int n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001909 if (status > 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001911 /* create match object (with room for extra group marks) */
1912 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
Fredrik Lundh6f013982000-07-03 18:44:21 +00001913 2*(pattern->groups+1));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001914 if (!match)
1915 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001916
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001917 Py_INCREF(pattern);
1918 match->pattern = pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001919
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001920 Py_INCREF(state->string);
1921 match->string = state->string;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001922
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001923 match->regs = NULL;
1924 match->groups = pattern->groups+1;
1925
1926 /* fill in group slices */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001927
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001928 base = (char*) state->beginning;
1929 n = state->charsize;
1930
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001931 match->mark[0] = ((char*) state->start - base) / n;
1932 match->mark[1] = ((char*) state->ptr - base) / n;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001933
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001934 for (i = j = 0; i < pattern->groups; i++, j+=2)
1935 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1936 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1937 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1938 } else
1939 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1940
1941 match->pos = state->pos;
1942 match->endpos = state->endpos;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001943
Fredrik Lundh6f013982000-07-03 18:44:21 +00001944 match->lastindex = state->lastindex;
1945
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001946 return (PyObject*) match;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001947
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001948 } else if (status == 0) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001949
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001950 /* no match */
1951 Py_INCREF(Py_None);
1952 return Py_None;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001953
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001954 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001955
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001956 /* internal error */
1957 pattern_error(status);
1958 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001959}
1960
1961static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001962pattern_scanner(PatternObject* pattern, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001963{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001964 /* create search state object */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001965
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001966 ScannerObject* self;
1967
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001968 PyObject* string;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001969 int start = 0;
1970 int end = INT_MAX;
1971 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1972 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001973
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001974 /* create scanner object */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00001975 self = PyObject_NEW(ScannerObject, &Scanner_Type);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001976 if (!self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001977 return NULL;
1978
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001979 string = state_init(&self->state, pattern, string, start, end);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001980 if (!string) {
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001981 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001982 return NULL;
1983 }
1984
1985 Py_INCREF(pattern);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001986 self->pattern = (PyObject*) pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001987
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001988 return (PyObject*) self;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001989}
1990
Guido van Rossumb700df92000-03-31 14:59:30 +00001991static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001992pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001993{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001994 if (self->weakreflist != NULL)
1995 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001996 Py_XDECREF(self->pattern);
1997 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001998 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001999 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002000}
2001
2002static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002003pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002004{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002005 SRE_STATE state;
2006 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002007
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002008 PyObject* string;
2009 int start = 0;
2010 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002011 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2012 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
2013 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002014 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002015
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002016 string = state_init(&state, self, string, start, end);
2017 if (!string)
2018 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002019
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002020 state.ptr = state.start;
2021
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002022 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
2023
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002024 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002025 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002026 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002027#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00002028 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002029#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002030 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002031
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002032 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2033
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002034 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002035
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002036 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002037}
2038
2039static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002040pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002041{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002042 SRE_STATE state;
2043 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00002044
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002045 PyObject* string;
2046 int start = 0;
2047 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002048 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2049 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
2050 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002051 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002052
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002053 string = state_init(&state, self, string, start, end);
2054 if (!string)
2055 return NULL;
2056
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002057 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
2058
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002059 if (state.charsize == 1) {
2060 status = sre_search(&state, PatternObject_GetCode(self));
2061 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002062#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002064#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002065 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002066
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002067 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2068
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002069 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002070
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002071 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002072}
2073
2074static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002075call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002076{
2077 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002078 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002079 PyObject* func;
2080 PyObject* result;
2081
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002082 if (!args)
2083 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002084 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002085 if (!name)
2086 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002087 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002088 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002089 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002090 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002091 func = PyObject_GetAttrString(mod, function);
2092 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002093 if (!func)
2094 return NULL;
2095 result = PyObject_CallObject(func, args);
2096 Py_DECREF(func);
2097 Py_DECREF(args);
2098 return result;
2099}
2100
#ifdef USE_BUILTIN_COPY
/* Replace *object, in place, with the result of
   copy.deepcopy(*object, memo).

   On success the old reference held in *object is released, the new one
   is stored, and 1 is returned.  On failure *object is left untouched and
   0 is returned (the failing call is expected to have set an exception).
   A NULL *object is treated as "nothing to copy" and succeeds. */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* copy;

    /* the pattern's object slots are handled elsewhere in this file with
       Py_XINCREF / NULL tests, i.e. they may be unset; PyTuple_Pack must
       not be given a NULL item */
    if (!*object)
        return 1;

    copy = call(
        "copy", "deepcopy",
        PyTuple_Pack(2, *object, memo)
        );
    if (!copy)
        return 0;

    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
2120
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002121static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002122join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002123{
2124 /* join list elements */
2125
2126 PyObject* joiner;
2127#if PY_VERSION_HEX >= 0x01060000
2128 PyObject* function;
2129 PyObject* args;
2130#endif
2131 PyObject* result;
2132
2133 switch (PyList_GET_SIZE(list)) {
2134 case 0:
2135 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00002136 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002137 case 1:
2138 result = PyList_GET_ITEM(list, 0);
2139 Py_INCREF(result);
2140 Py_DECREF(list);
2141 return result;
2142 }
2143
2144 /* two or more elements: slice out a suitable separator from the
2145 first member, and use that to join the entire list */
2146
2147 joiner = PySequence_GetSlice(pattern, 0, 0);
2148 if (!joiner)
2149 return NULL;
2150
2151#if PY_VERSION_HEX >= 0x01060000
2152 function = PyObject_GetAttrString(joiner, "join");
2153 if (!function) {
2154 Py_DECREF(joiner);
2155 return NULL;
2156 }
2157 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002158 if (!args) {
2159 Py_DECREF(function);
2160 Py_DECREF(joiner);
2161 return NULL;
2162 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002163 PyTuple_SET_ITEM(args, 0, list);
2164 result = PyObject_CallObject(function, args);
2165 Py_DECREF(args); /* also removes list */
2166 Py_DECREF(function);
2167#else
2168 result = call(
2169 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002170 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002171 );
2172#endif
2173 Py_DECREF(joiner);
2174
2175 return result;
2176}
2177
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002178static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002179pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002180{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002181 SRE_STATE state;
2182 PyObject* list;
2183 int status;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002184 int i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002185
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002186 PyObject* string;
2187 int start = 0;
2188 int end = INT_MAX;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002189 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2190 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2191 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002192 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002193
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002194 string = state_init(&state, self, string, start, end);
2195 if (!string)
2196 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002197
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002198 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002199 if (!list) {
2200 state_fini(&state);
2201 return NULL;
2202 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002203
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002204 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002205
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002206 PyObject* item;
2207
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002208 state_reset(&state);
2209
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002210 state.ptr = state.start;
2211
2212 if (state.charsize == 1) {
2213 status = sre_search(&state, PatternObject_GetCode(self));
2214 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002215#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002216 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002217#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002218 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002219
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002220 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002221 if (status == 0)
2222 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002223 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002224 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002225 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002226
2227 /* don't bother to build a match object */
2228 switch (self->groups) {
2229 case 0:
2230 b = STATE_OFFSET(&state, state.start);
2231 e = STATE_OFFSET(&state, state.ptr);
2232 item = PySequence_GetSlice(string, b, e);
2233 if (!item)
2234 goto error;
2235 break;
2236 case 1:
2237 item = state_getslice(&state, 1, string, 1);
2238 if (!item)
2239 goto error;
2240 break;
2241 default:
2242 item = PyTuple_New(self->groups);
2243 if (!item)
2244 goto error;
2245 for (i = 0; i < self->groups; i++) {
2246 PyObject* o = state_getslice(&state, i+1, string, 1);
2247 if (!o) {
2248 Py_DECREF(item);
2249 goto error;
2250 }
2251 PyTuple_SET_ITEM(item, i, o);
2252 }
2253 break;
2254 }
2255
2256 status = PyList_Append(list, item);
2257 Py_DECREF(item);
2258 if (status < 0)
2259 goto error;
2260
2261 if (state.ptr == state.start)
2262 state.start = (void*) ((char*) state.ptr + state.charsize);
2263 else
2264 state.start = state.ptr;
2265
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002266 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002267
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002268 state_fini(&state);
2269 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002270
2271error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002272 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002273 state_fini(&state);
2274 return NULL;
2275
Guido van Rossumb700df92000-03-31 14:59:30 +00002276}
2277
#if PY_VERSION_HEX >= 0x02020000
/* finditer(...) method of pattern objects.

   Creates a scanner for the given arguments, fetches its "search"
   attribute and wraps that in a call-iterator whose sentinel is None.
   Returns the iterator, or NULL with an exception set. */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    PyObject* iterator = NULL;
    PyObject* bound_search;
    PyObject* scanner;

    scanner = pattern_scanner(pattern, args);
    if (scanner) {
        bound_search = PyObject_GetAttrString(scanner, "search");
        /* our own reference to the scanner is no longer needed */
        Py_DECREF(scanner);
        if (bound_search) {
            iterator = PyCallIter_New(bound_search, Py_None);
            Py_DECREF(bound_search);
        }
    }

    return iterator;
}
#endif
2301
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002302static PyObject*
2303pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2304{
2305 SRE_STATE state;
2306 PyObject* list;
2307 PyObject* item;
2308 int status;
2309 int n;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002310 int i;
2311 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002312
2313 PyObject* string;
2314 int maxsplit = 0;
2315 static char* kwlist[] = { "source", "maxsplit", NULL };
2316 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2317 &string, &maxsplit))
2318 return NULL;
2319
2320 string = state_init(&state, self, string, 0, INT_MAX);
2321 if (!string)
2322 return NULL;
2323
2324 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002325 if (!list) {
2326 state_fini(&state);
2327 return NULL;
2328 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002329
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002330 n = 0;
2331 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002332
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002333 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002334
2335 state_reset(&state);
2336
2337 state.ptr = state.start;
2338
2339 if (state.charsize == 1) {
2340 status = sre_search(&state, PatternObject_GetCode(self));
2341 } else {
2342#if defined(HAVE_UNICODE)
2343 status = sre_usearch(&state, PatternObject_GetCode(self));
2344#endif
2345 }
2346
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002347 if (status <= 0) {
2348 if (status == 0)
2349 break;
2350 pattern_error(status);
2351 goto error;
2352 }
2353
2354 if (state.start == state.ptr) {
2355 if (last == state.end)
2356 break;
2357 /* skip one character */
2358 state.start = (void*) ((char*) state.ptr + state.charsize);
2359 continue;
2360 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002361
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002362 /* get segment before this match */
2363 item = PySequence_GetSlice(
2364 string, STATE_OFFSET(&state, last),
2365 STATE_OFFSET(&state, state.start)
2366 );
2367 if (!item)
2368 goto error;
2369 status = PyList_Append(list, item);
2370 Py_DECREF(item);
2371 if (status < 0)
2372 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002373
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002374 /* add groups (if any) */
2375 for (i = 0; i < self->groups; i++) {
2376 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002377 if (!item)
2378 goto error;
2379 status = PyList_Append(list, item);
2380 Py_DECREF(item);
2381 if (status < 0)
2382 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002383 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002384
2385 n = n + 1;
2386
2387 last = state.start = state.ptr;
2388
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002389 }
2390
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002391 /* get segment following last match (even if empty) */
2392 item = PySequence_GetSlice(
2393 string, STATE_OFFSET(&state, last), state.endpos
2394 );
2395 if (!item)
2396 goto error;
2397 status = PyList_Append(list, item);
2398 Py_DECREF(item);
2399 if (status < 0)
2400 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002401
2402 state_fini(&state);
2403 return list;
2404
2405error:
2406 Py_DECREF(list);
2407 state_fini(&state);
2408 return NULL;
2409
2410}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002411
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002412static PyObject*
2413pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2414 int count, int subn)
2415{
2416 SRE_STATE state;
2417 PyObject* list;
2418 PyObject* item;
2419 PyObject* filter;
2420 PyObject* args;
2421 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002422 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002423 int status;
2424 int n;
2425 int i, b, e;
2426 int filter_is_callable;
2427
Fredrik Lundhdac58492001-10-21 21:48:30 +00002428 if (PyCallable_Check(template)) {
2429 /* sub/subn takes either a function or a template */
2430 filter = template;
2431 Py_INCREF(filter);
2432 filter_is_callable = 1;
2433 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002434 /* if not callable, check if it's a literal string */
2435 int literal;
2436 ptr = getstring(template, &n, &b);
2437 if (ptr) {
2438 if (b == 1) {
2439 literal = sre_literal_template(ptr, n);
2440 } else {
2441#if defined(HAVE_UNICODE)
2442 literal = sre_uliteral_template(ptr, n);
2443#endif
2444 }
2445 } else {
2446 PyErr_Clear();
2447 literal = 0;
2448 }
2449 if (literal) {
2450 filter = template;
2451 Py_INCREF(filter);
2452 filter_is_callable = 0;
2453 } else {
2454 /* not a literal; hand it over to the template compiler */
2455 filter = call(
2456 SRE_MODULE, "_subx",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002457 PyTuple_Pack(2, self, template)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002458 );
2459 if (!filter)
2460 return NULL;
2461 filter_is_callable = PyCallable_Check(filter);
2462 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002463 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002464
2465 string = state_init(&state, self, string, 0, INT_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002466 if (!string) {
2467 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002468 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002469 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002470
2471 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002472 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002473 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002474 state_fini(&state);
2475 return NULL;
2476 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002477
2478 n = i = 0;
2479
2480 while (!count || n < count) {
2481
2482 state_reset(&state);
2483
2484 state.ptr = state.start;
2485
2486 if (state.charsize == 1) {
2487 status = sre_search(&state, PatternObject_GetCode(self));
2488 } else {
2489#if defined(HAVE_UNICODE)
2490 status = sre_usearch(&state, PatternObject_GetCode(self));
2491#endif
2492 }
2493
2494 if (status <= 0) {
2495 if (status == 0)
2496 break;
2497 pattern_error(status);
2498 goto error;
2499 }
2500
2501 b = STATE_OFFSET(&state, state.start);
2502 e = STATE_OFFSET(&state, state.ptr);
2503
2504 if (i < b) {
2505 /* get segment before this match */
2506 item = PySequence_GetSlice(string, i, b);
2507 if (!item)
2508 goto error;
2509 status = PyList_Append(list, item);
2510 Py_DECREF(item);
2511 if (status < 0)
2512 goto error;
2513
2514 } else if (i == b && i == e && n > 0)
2515 /* ignore empty match on latest position */
2516 goto next;
2517
2518 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002519 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002520 match = pattern_new_match(self, &state, 1);
2521 if (!match)
2522 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002523 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002524 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002525 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002526 goto error;
2527 }
2528 item = PyObject_CallObject(filter, args);
2529 Py_DECREF(args);
2530 Py_DECREF(match);
2531 if (!item)
2532 goto error;
2533 } else {
2534 /* filter is literal string */
2535 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002536 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002537 }
2538
2539 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002540 if (item != Py_None) {
2541 status = PyList_Append(list, item);
2542 Py_DECREF(item);
2543 if (status < 0)
2544 goto error;
2545 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002546
2547 i = e;
2548 n = n + 1;
2549
2550next:
2551 /* move on */
2552 if (state.ptr == state.start)
2553 state.start = (void*) ((char*) state.ptr + state.charsize);
2554 else
2555 state.start = state.ptr;
2556
2557 }
2558
2559 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002560 if (i < state.endpos) {
2561 item = PySequence_GetSlice(string, i, state.endpos);
2562 if (!item)
2563 goto error;
2564 status = PyList_Append(list, item);
2565 Py_DECREF(item);
2566 if (status < 0)
2567 goto error;
2568 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002569
2570 state_fini(&state);
2571
Guido van Rossum4e173842001-12-07 04:25:10 +00002572 Py_DECREF(filter);
2573
Fredrik Lundhdac58492001-10-21 21:48:30 +00002574 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002575 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002576
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002577 if (!item)
2578 return NULL;
2579
2580 if (subn)
2581 return Py_BuildValue("Ni", item, n);
2582
2583 return item;
2584
2585error:
2586 Py_DECREF(list);
2587 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002588 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002589 return NULL;
2590
2591}
2592
2593static PyObject*
2594pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2595{
2596 PyObject* template;
2597 PyObject* string;
2598 int count = 0;
2599 static char* kwlist[] = { "repl", "string", "count", NULL };
2600 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2601 &template, &string, &count))
2602 return NULL;
2603
2604 return pattern_subx(self, template, string, count, 0);
2605}
2606
2607static PyObject*
2608pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2609{
2610 PyObject* template;
2611 PyObject* string;
2612 int count = 0;
2613 static char* kwlist[] = { "repl", "string", "count", NULL };
2614 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2615 &template, &string, &count))
2616 return NULL;
2617
2618 return pattern_subx(self, template, string, count, 1);
2619}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002620
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002621static PyObject*
2622pattern_copy(PatternObject* self, PyObject* args)
2623{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002624#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002625 PatternObject* copy;
2626 int offset;
2627
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002628 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2629 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002630
2631 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2632 if (!copy)
2633 return NULL;
2634
2635 offset = offsetof(PatternObject, groups);
2636
2637 Py_XINCREF(self->groupindex);
2638 Py_XINCREF(self->indexgroup);
2639 Py_XINCREF(self->pattern);
2640
2641 memcpy((char*) copy + offset, (char*) self + offset,
2642 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002643 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002644
2645 return (PyObject*) copy;
2646#else
2647 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2648 return NULL;
2649#endif
2650}
2651
2652static PyObject*
2653pattern_deepcopy(PatternObject* self, PyObject* args)
2654{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002655#ifdef USE_BUILTIN_COPY
2656 PatternObject* copy;
2657
2658 PyObject* memo;
2659 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2660 return NULL;
2661
2662 copy = (PatternObject*) pattern_copy(self, Py_None);
2663 if (!copy)
2664 return NULL;
2665
2666 if (!deepcopy(&copy->groupindex, memo) ||
2667 !deepcopy(&copy->indexgroup, memo) ||
2668 !deepcopy(&copy->pattern, memo)) {
2669 Py_DECREF(copy);
2670 return NULL;
2671 }
2672
2673#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002674 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2675 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002676#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002677}
2678
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002679static PyMethodDef pattern_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00002680 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
2681 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
2682 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
2683 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
2684 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
2685 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002686#if PY_VERSION_HEX >= 0x02020000
2687 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS},
2688#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002689 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002690 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2691 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002692 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002693};
2694
2695static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002696pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002697{
2698 PyObject* res;
2699
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002700 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002701
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002702 if (res)
2703 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002704
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002705 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002706
2707 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002708 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002709 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002710 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002711 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002712
2713 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002714 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002715
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002716 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002717 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002718
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002719 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002720 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002721 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002722 }
2723
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002724 PyErr_SetString(PyExc_AttributeError, name);
2725 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002726}
2727
2728statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002729 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002730 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002731 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002732 (destructor)pattern_dealloc, /*tp_dealloc*/
2733 0, /*tp_print*/
Raymond Hettinger027bb632004-05-31 03:09:25 +00002734 (getattrfunc)pattern_getattr, /*tp_getattr*/
2735 0, /* tp_setattr */
2736 0, /* tp_compare */
2737 0, /* tp_repr */
2738 0, /* tp_as_number */
2739 0, /* tp_as_sequence */
2740 0, /* tp_as_mapping */
2741 0, /* tp_hash */
2742 0, /* tp_call */
2743 0, /* tp_str */
2744 0, /* tp_getattro */
2745 0, /* tp_setattro */
2746 0, /* tp_as_buffer */
2747 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
2748 0, /* tp_doc */
2749 0, /* tp_traverse */
2750 0, /* tp_clear */
2751 0, /* tp_richcompare */
2752 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002753};
2754
2755/* -------------------------------------------------------------------- */
2756/* match methods */
2757
2758static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002759match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002760{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002761 Py_XDECREF(self->regs);
2762 Py_XDECREF(self->string);
2763 Py_DECREF(self->pattern);
2764 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002765}
2766
2767static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002768match_getslice_by_index(MatchObject* self, int index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002769{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002770 if (index < 0 || index >= self->groups) {
2771 /* raise IndexError if we were given a bad group number */
2772 PyErr_SetString(
2773 PyExc_IndexError,
2774 "no such group"
2775 );
2776 return NULL;
2777 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002778
Fredrik Lundh6f013982000-07-03 18:44:21 +00002779 index *= 2;
2780
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002781 if (self->string == Py_None || self->mark[index] < 0) {
2782 /* return default value if the string or group is undefined */
2783 Py_INCREF(def);
2784 return def;
2785 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002786
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002787 return PySequence_GetSlice(
2788 self->string, self->mark[index], self->mark[index+1]
2789 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002790}
2791
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002792static int
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002793match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002794{
Fredrik Lundh6f013982000-07-03 18:44:21 +00002795 int i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002796
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002797 if (PyInt_Check(index))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002798 return (int) PyInt_AS_LONG(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002799
Fredrik Lundh6f013982000-07-03 18:44:21 +00002800 i = -1;
2801
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002802 if (self->pattern->groupindex) {
2803 index = PyObject_GetItem(self->pattern->groupindex, index);
2804 if (index) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00002805 if (PyInt_Check(index))
2806 i = (int) PyInt_AS_LONG(index);
2807 Py_DECREF(index);
2808 } else
2809 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002810 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002811
2812 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002813}
2814
2815static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002816match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002817{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002818 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002819}
2820
2821static PyObject*
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002822match_expand(MatchObject* self, PyObject* args)
2823{
2824 PyObject* template;
2825 if (!PyArg_ParseTuple(args, "O:expand", &template))
2826 return NULL;
2827
2828 /* delegate to Python code */
2829 return call(
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002830 SRE_MODULE, "_expand",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002831 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002832 );
2833}
2834
2835static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002836match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002837{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002838 PyObject* result;
2839 int i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002840
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002841 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002842
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002843 switch (size) {
2844 case 0:
2845 result = match_getslice(self, Py_False, Py_None);
2846 break;
2847 case 1:
2848 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2849 break;
2850 default:
2851 /* fetch multiple items */
2852 result = PyTuple_New(size);
2853 if (!result)
2854 return NULL;
2855 for (i = 0; i < size; i++) {
2856 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002857 self, PyTuple_GET_ITEM(args, i), Py_None
2858 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002859 if (!item) {
2860 Py_DECREF(result);
2861 return NULL;
2862 }
2863 PyTuple_SET_ITEM(result, i, item);
2864 }
2865 break;
2866 }
2867 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002868}
2869
2870static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002871match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002872{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002873 PyObject* result;
2874 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002875
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002876 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002877 static char* kwlist[] = { "default", NULL };
2878 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002879 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002880
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002881 result = PyTuple_New(self->groups-1);
2882 if (!result)
2883 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002884
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002885 for (index = 1; index < self->groups; index++) {
2886 PyObject* item;
2887 item = match_getslice_by_index(self, index, def);
2888 if (!item) {
2889 Py_DECREF(result);
2890 return NULL;
2891 }
2892 PyTuple_SET_ITEM(result, index-1, item);
2893 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002894
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002895 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002896}
2897
2898static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002899match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002900{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002901 PyObject* result;
2902 PyObject* keys;
2903 int index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002904
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002905 PyObject* def = Py_None;
Fredrik Lundh562586e2000-10-03 20:43:34 +00002906 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002907 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002908 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002909
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002910 result = PyDict_New();
2911 if (!result || !self->pattern->groupindex)
2912 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002913
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002914 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002915 if (!keys)
2916 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002918 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002919 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002920 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002921 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002922 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002923 if (!key)
2924 goto failed;
2925 value = match_getslice(self, key, def);
2926 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002927 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002928 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002929 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002930 status = PyDict_SetItem(result, key, value);
2931 Py_DECREF(value);
2932 if (status < 0)
2933 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002934 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002935
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002936 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002937
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002938 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002939
2940failed:
2941 Py_DECREF(keys);
2942 Py_DECREF(result);
2943 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002944}
2945
2946static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002947match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002948{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002949 int index;
2950
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002951 PyObject* index_ = Py_False; /* zero */
2952 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2953 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002954
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002955 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002956
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002957 if (index < 0 || index >= self->groups) {
2958 PyErr_SetString(
2959 PyExc_IndexError,
2960 "no such group"
2961 );
2962 return NULL;
2963 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002964
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002965 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002966 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002967}
2968
2969static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002970match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002971{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002972 int index;
2973
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002974 PyObject* index_ = Py_False; /* zero */
2975 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2976 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002977
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002978 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002979
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002980 if (index < 0 || index >= self->groups) {
2981 PyErr_SetString(
2982 PyExc_IndexError,
2983 "no such group"
2984 );
2985 return NULL;
2986 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002987
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002988 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002989 return Py_BuildValue("i", self->mark[index*2+1]);
2990}
2991
2992LOCAL(PyObject*)
2993_pair(int i1, int i2)
2994{
2995 PyObject* pair;
2996 PyObject* item;
2997
2998 pair = PyTuple_New(2);
2999 if (!pair)
3000 return NULL;
3001
3002 item = PyInt_FromLong(i1);
3003 if (!item)
3004 goto error;
3005 PyTuple_SET_ITEM(pair, 0, item);
3006
3007 item = PyInt_FromLong(i2);
3008 if (!item)
3009 goto error;
3010 PyTuple_SET_ITEM(pair, 1, item);
3011
3012 return pair;
3013
3014 error:
3015 Py_DECREF(pair);
3016 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003017}
3018
3019static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003020match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003021{
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003022 int index;
3023
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003024 PyObject* index_ = Py_False; /* zero */
3025 if (!PyArg_ParseTuple(args, "|O:span", &index_))
3026 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003027
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003028 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003029
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003030 if (index < 0 || index >= self->groups) {
3031 PyErr_SetString(
3032 PyExc_IndexError,
3033 "no such group"
3034 );
3035 return NULL;
3036 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003037
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003038 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003039 return _pair(self->mark[index*2], self->mark[index*2+1]);
3040}
3041
3042static PyObject*
3043match_regs(MatchObject* self)
3044{
3045 PyObject* regs;
3046 PyObject* item;
3047 int index;
3048
3049 regs = PyTuple_New(self->groups);
3050 if (!regs)
3051 return NULL;
3052
3053 for (index = 0; index < self->groups; index++) {
3054 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3055 if (!item) {
3056 Py_DECREF(regs);
3057 return NULL;
3058 }
3059 PyTuple_SET_ITEM(regs, index, item);
3060 }
3061
3062 Py_INCREF(regs);
3063 self->regs = regs;
3064
3065 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003066}
3067
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003068static PyObject*
3069match_copy(MatchObject* self, PyObject* args)
3070{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003071#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003072 MatchObject* copy;
3073 int slots, offset;
3074
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003075 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
3076 return NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003077
3078 slots = 2 * (self->pattern->groups+1);
3079
3080 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3081 if (!copy)
3082 return NULL;
3083
3084 /* this value a constant, but any compiler should be able to
3085 figure that out all by itself */
3086 offset = offsetof(MatchObject, string);
3087
3088 Py_XINCREF(self->pattern);
3089 Py_XINCREF(self->string);
3090 Py_XINCREF(self->regs);
3091
3092 memcpy((char*) copy + offset, (char*) self + offset,
3093 sizeof(MatchObject) + slots * sizeof(int) - offset);
3094
3095 return (PyObject*) copy;
3096#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003097 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003098 return NULL;
3099#endif
3100}
3101
3102static PyObject*
3103match_deepcopy(MatchObject* self, PyObject* args)
3104{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003105#ifdef USE_BUILTIN_COPY
3106 MatchObject* copy;
3107
3108 PyObject* memo;
3109 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
3110 return NULL;
3111
3112 copy = (MatchObject*) match_copy(self, Py_None);
3113 if (!copy)
3114 return NULL;
3115
3116 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3117 !deepcopy(&copy->string, memo) ||
3118 !deepcopy(&copy->regs, memo)) {
3119 Py_DECREF(copy);
3120 return NULL;
3121 }
3122
3123#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003124 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3125 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003126#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003127}
3128
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003129static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003130 {"group", (PyCFunction) match_group, METH_VARARGS},
3131 {"start", (PyCFunction) match_start, METH_VARARGS},
3132 {"end", (PyCFunction) match_end, METH_VARARGS},
3133 {"span", (PyCFunction) match_span, METH_VARARGS},
3134 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3135 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3136 {"expand", (PyCFunction) match_expand, METH_VARARGS},
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003137 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
3138 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003139 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003140};
3141
3142static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003143match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003144{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003145 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003146
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003147 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3148 if (res)
3149 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003150
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003151 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003152
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003153 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003154 if (self->lastindex >= 0)
3155 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003156 Py_INCREF(Py_None);
3157 return Py_None;
3158 }
3159
3160 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003161 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003162 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003163 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003164 );
3165 if (result)
3166 return result;
3167 PyErr_Clear();
3168 }
3169 Py_INCREF(Py_None);
3170 return Py_None;
3171 }
3172
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003173 if (!strcmp(name, "string")) {
3174 if (self->string) {
3175 Py_INCREF(self->string);
3176 return self->string;
3177 } else {
3178 Py_INCREF(Py_None);
3179 return Py_None;
3180 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003181 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003182
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003183 if (!strcmp(name, "regs")) {
3184 if (self->regs) {
3185 Py_INCREF(self->regs);
3186 return self->regs;
3187 } else
3188 return match_regs(self);
3189 }
3190
3191 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003192 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003193 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003194 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003195
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003196 if (!strcmp(name, "pos"))
3197 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003198
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003199 if (!strcmp(name, "endpos"))
3200 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003201
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003202 PyErr_SetString(PyExc_AttributeError, name);
3203 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003204}
3205
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3208
3209statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003210 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003211 0, "_" SRE_MODULE ".SRE_Match",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003212 sizeof(MatchObject), sizeof(int),
3213 (destructor)match_dealloc, /*tp_dealloc*/
3214 0, /*tp_print*/
3215 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003216};
3217
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003218/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003219/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003220
3221static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003222scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003223{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003224 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003225 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003226 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003227}
3228
3229static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003230scanner_match(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003231{
3232 SRE_STATE* state = &self->state;
3233 PyObject* match;
3234 int status;
3235
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003236 state_reset(state);
3237
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003238 state->ptr = state->start;
3239
3240 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003241 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003242 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003243#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003244 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003245#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003246 }
3247
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003248 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003249 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003250
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003251 if ((status == 0 || state->ptr == state->start) &&
3252 state->ptr < state->end)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003253 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003254 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003255 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003256
3257 return match;
3258}
3259
3260
3261static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003262scanner_search(ScannerObject* self, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003263{
3264 SRE_STATE* state = &self->state;
3265 PyObject* match;
3266 int status;
3267
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003268 state_reset(state);
3269
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003270 state->ptr = state->start;
3271
3272 if (state->charsize == 1) {
3273 status = sre_search(state, PatternObject_GetCode(self->pattern));
3274 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003275#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003276 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003277#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003278 }
3279
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003280 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003281 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003282
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00003283 if ((status == 0 || state->ptr == state->start) &&
3284 state->ptr < state->end)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003285 state->start = (void*) ((char*) state->ptr + state->charsize);
3286 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003287 state->start = state->ptr;
3288
3289 return match;
3290}
3291
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003292static PyMethodDef scanner_methods[] = {
Neal Norwitzbb2769f2002-03-31 15:46:00 +00003293 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3294 /* METH_OLDARGS is not in Python 1.5.2 */
3295 {"match", (PyCFunction) scanner_match, 0},
3296 {"search", (PyCFunction) scanner_search, 0},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003297 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003298};
3299
3300static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003301scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003302{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003303 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003304
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003305 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3306 if (res)
3307 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003308
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003309 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003310
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003311 /* attributes */
3312 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003313 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003314 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003315 }
3316
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003317 PyErr_SetString(PyExc_AttributeError, name);
3318 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003319}
3320
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003321statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003322 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003323 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003324 sizeof(ScannerObject), 0,
3325 (destructor)scanner_dealloc, /*tp_dealloc*/
3326 0, /*tp_print*/
3327 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003328};
3329
Guido van Rossumb700df92000-03-31 14:59:30 +00003330static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003331 {"compile", _compile, METH_VARARGS},
3332 {"getcodesize", sre_codesize, METH_VARARGS},
3333 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003334 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003335};
3336
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003337#if PY_VERSION_HEX < 0x02030000
3338DL_EXPORT(void) init_sre(void)
3339#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003340PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003341#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003342{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003343 PyObject* m;
3344 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003345 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003346
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003347 /* Patch object types */
3348 Pattern_Type.ob_type = Match_Type.ob_type =
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003349 Scanner_Type.ob_type = &PyType_Type;
Guido van Rossumb700df92000-03-31 14:59:30 +00003350
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003351 m = Py_InitModule("_" SRE_MODULE, _functions);
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003352 d = PyModule_GetDict(m);
3353
Fredrik Lundh21009b92001-09-18 18:47:09 +00003354 x = PyInt_FromLong(SRE_MAGIC);
3355 if (x) {
3356 PyDict_SetItemString(d, "MAGIC", x);
3357 Py_DECREF(x);
3358 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003359
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003360 x = PyInt_FromLong(sizeof(SRE_CODE));
3361 if (x) {
3362 PyDict_SetItemString(d, "CODESIZE", x);
3363 Py_DECREF(x);
3364 }
3365
Fredrik Lundh21009b92001-09-18 18:47:09 +00003366 x = PyString_FromString(copyright);
3367 if (x) {
3368 PyDict_SetItemString(d, "copyright", x);
3369 Py_DECREF(x);
3370 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003371}
3372
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003373#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003374
3375/* vim:ts=4:sw=4:et
3376*/