blob: 012f1275093e5b360bc0b34f32bed92f88991ad3 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000071#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000072
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000073/* enables copy/deepcopy handling (work in progress) */
74#undef USE_BUILTIN_COPY
75
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000076#if PY_VERSION_HEX < 0x01060000
77#define PyObject_DEL(op) PyMem_DEL((op))
78#endif
79
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000080/* -------------------------------------------------------------------- */
81
Fredrik Lundh80946112000-06-29 18:03:25 +000082#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000083#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000084#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000085/* fastest possible local call under MSVC */
86#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000087#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000088#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089#else
90#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000091#endif
92
93/* error codes */
94#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000095#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000096#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000097#define SRE_ERROR_MEMORY -9 /* out of memory */
98
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000099#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000100#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#else
102#define TRACE(v)
103#endif
104
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000105/* -------------------------------------------------------------------- */
106/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000107
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000108/* default character predicates (run sre_chars.py to regenerate tables) */
109
/* Bit flags stored per character in sre_char_info. */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* Character class bits for code points 0..127, sixteen entries per row.
   The tables are never written to, so they are declared const. */
static const char sre_char_info[128] = {
    /*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  32 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  48 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /*  64 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /*  96 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};

/* Lower-case mapping for code points 0..127: identity everywhere except
   65..90, which map to 97..122. */
static const char sre_char_lower[128] = {
    /*   0 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /*  16 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /*  32 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /*  48 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /*  64 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
              109, 110, 111,
    /*  80 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
              93, 94, 95,
    /*  96 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
              109, 110, 111,
    /* 112 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
              124, 125, 126, 127
};

/* Each predicate yields the matching mask bit (nonzero) when ch is below
   128 and has that class, and 0 otherwise.  The argument is evaluated
   more than once, so it must be free of side effects. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

Guido van Rossumb700df92000-03-31 14:59:30 +0000146
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000147static unsigned int sre_lower(unsigned int ch)
148{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000149 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000150}
151
/* locale-specific character predicates */
/* (c & ~N) is nonzero exactly when c >= N+1 for any unsigned c; testing
 * it this way avoids warnings when c's type cannot hold values >= N+1.
 * Values outside 0..255 are never handed to the <ctype.h> functions. */
#define SRE_LOC_IS_DIGIT(ch) (((ch) & ~255) ? 0 : isdigit((ch)))
#define SRE_LOC_IS_SPACE(ch) (((ch) & ~255) ? 0 : isspace((ch)))
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (((ch) & ~255) ? 0 : isalnum((ch)))
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
160
/* Lower-case a character through the C library's tolower(); code points
   256 and above are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int) tolower((int) ch);
}
165
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000166/* unicode-specific character predicates */
167
#if defined(HAVE_UNICODE)

/* Each predicate narrows its argument to Py_UNICODE and defers to the
   matching Py_UNICODE_* classification macro. */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

/* Lower-case a character via Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    Py_UNICODE narrowed = (Py_UNICODE)(ch);

    return (unsigned int) Py_UNICODE_TOLOWER(narrowed);
}

#endif
182
Guido van Rossumb700df92000-03-31 14:59:30 +0000183LOCAL(int)
184sre_category(SRE_CODE category, unsigned int ch)
185{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000186 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000187
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000188 case SRE_CATEGORY_DIGIT:
189 return SRE_IS_DIGIT(ch);
190 case SRE_CATEGORY_NOT_DIGIT:
191 return !SRE_IS_DIGIT(ch);
192 case SRE_CATEGORY_SPACE:
193 return SRE_IS_SPACE(ch);
194 case SRE_CATEGORY_NOT_SPACE:
195 return !SRE_IS_SPACE(ch);
196 case SRE_CATEGORY_WORD:
197 return SRE_IS_WORD(ch);
198 case SRE_CATEGORY_NOT_WORD:
199 return !SRE_IS_WORD(ch);
200 case SRE_CATEGORY_LINEBREAK:
201 return SRE_IS_LINEBREAK(ch);
202 case SRE_CATEGORY_NOT_LINEBREAK:
203 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000204
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000205 case SRE_CATEGORY_LOC_WORD:
206 return SRE_LOC_IS_WORD(ch);
207 case SRE_CATEGORY_LOC_NOT_WORD:
208 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000209
210#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000211 case SRE_CATEGORY_UNI_DIGIT:
212 return SRE_UNI_IS_DIGIT(ch);
213 case SRE_CATEGORY_UNI_NOT_DIGIT:
214 return !SRE_UNI_IS_DIGIT(ch);
215 case SRE_CATEGORY_UNI_SPACE:
216 return SRE_UNI_IS_SPACE(ch);
217 case SRE_CATEGORY_UNI_NOT_SPACE:
218 return !SRE_UNI_IS_SPACE(ch);
219 case SRE_CATEGORY_UNI_WORD:
220 return SRE_UNI_IS_WORD(ch);
221 case SRE_CATEGORY_UNI_NOT_WORD:
222 return !SRE_UNI_IS_WORD(ch);
223 case SRE_CATEGORY_UNI_LINEBREAK:
224 return SRE_UNI_IS_LINEBREAK(ch);
225 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
226 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000227#else
228 case SRE_CATEGORY_UNI_DIGIT:
229 return SRE_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_NOT_DIGIT:
231 return !SRE_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_SPACE:
233 return SRE_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_NOT_SPACE:
235 return !SRE_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_WORD:
237 return SRE_LOC_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_NOT_WORD:
239 return !SRE_LOC_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_LINEBREAK:
241 return SRE_IS_LINEBREAK(ch);
242 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
243 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000244#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000245 }
246 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000247}
248
249/* helpers */
250
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000251static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000253{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000254 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000255 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000256 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000257 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000258 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000259}
260
261static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000262data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000263{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000264 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000265 minsize = state->data_stack_base+size;
266 cursize = state->data_stack_size;
267 if (cursize < minsize) {
268 void* stack;
269 cursize = minsize+minsize/4+1024;
270 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000271 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000273 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000274 return SRE_ERROR_MEMORY;
275 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000277 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000278 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000279 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000280}
281
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000282/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000283
284#define SRE_CHAR unsigned char
285#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000286#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000287#define SRE_CHARSET sre_charset
288#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000289#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000290#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000291#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000292#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000293
294#if defined(HAVE_UNICODE)
295
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000297#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000298#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000299
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000300#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000301#undef SRE_SEARCH
302#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000303#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000304#undef SRE_INFO
305#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000306#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000307#undef SRE_AT
308#undef SRE_CHAR
309
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000310/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000311
312#define SRE_CHAR Py_UNICODE
313#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000314#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000315#define SRE_CHARSET sre_ucharset
316#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000317#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000318#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000319#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000320#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000321#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000322
323#endif /* SRE_RECURSIVE */
324
325/* -------------------------------------------------------------------- */
326/* String matching engine */
327
328/* the following section is compiled twice, with different character
329 settings */
330
331LOCAL(int)
332SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
333{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000334 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000335
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000336 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000338 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000341 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 case SRE_AT_BEGINNING_LINE:
345 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000346 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000348 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000349 return (((void*) (ptr+1) == state->end &&
350 SRE_IS_LINEBREAK((int) ptr[0])) ||
351 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000353 case SRE_AT_END_LINE:
354 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000355 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000356
Fredrik Lundh770617b2001-01-14 15:06:11 +0000357 case SRE_AT_END_STRING:
358 return ((void*) ptr == state->end);
359
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000360 case SRE_AT_BOUNDARY:
361 if (state->beginning == state->end)
362 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000363 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000364 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000366 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000368
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000369 case SRE_AT_NON_BOUNDARY:
370 if (state->beginning == state->end)
371 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000373 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000375 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000377
378 case SRE_AT_LOC_BOUNDARY:
379 if (state->beginning == state->end)
380 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000382 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000384 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000386
387 case SRE_AT_LOC_NON_BOUNDARY:
388 if (state->beginning == state->end)
389 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000390 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000391 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000392 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000393 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000394 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000395
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000396#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000397 case SRE_AT_UNI_BOUNDARY:
398 if (state->beginning == state->end)
399 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000400 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000401 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000402 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000403 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000404 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000405
406 case SRE_AT_UNI_NON_BOUNDARY:
407 if (state->beginning == state->end)
408 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000409 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000410 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000411 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000412 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000413 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000414#endif
415
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000416 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000417
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000418 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000419}
420
421LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000422SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000423{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000424 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000425
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000427
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000428 for (;;) {
429 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000430
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000431 case SRE_OP_FAILURE:
432 return !ok;
433
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000434 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000435 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000436 if (ch == set[0])
437 return ok;
438 set++;
439 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000440
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000441 case SRE_OP_CATEGORY:
442 /* <CATEGORY> <code> */
443 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000444 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000445 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000446 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000447
Fredrik Lundh3562f112000-07-02 12:00:07 +0000448 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000449 if (sizeof(SRE_CODE) == 2) {
450 /* <CHARSET> <bitmap> (16 bits per code word) */
451 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
452 return ok;
453 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000454 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000455 else {
456 /* <CHARSET> <bitmap> (32 bits per code word) */
457 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
458 return ok;
459 set += 8;
460 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000461 break;
462
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000463 case SRE_OP_RANGE:
464 /* <RANGE> <lower> <upper> */
465 if (set[0] <= ch && ch <= set[1])
466 return ok;
467 set += 2;
468 break;
469
470 case SRE_OP_NEGATE:
471 ok = !ok;
472 break;
473
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000474 case SRE_OP_BIGCHARSET:
475 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
476 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000477 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000478 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000479
480 if (sizeof(SRE_CODE) == 2) {
481 block = ((unsigned char*)set)[ch >> 8];
482 set += 128;
483 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
484 return ok;
485 set += count*16;
486 }
487 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000488 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
489 * warnings when c's type supports only numbers < N+1 */
490 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000491 block = ((unsigned char*)set)[ch >> 8];
492 else
493 block = -1;
494 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000495 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000496 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
497 return ok;
498 set += count*8;
499 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000500 break;
501 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000502
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000503 default:
504 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000505 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000506 return 0;
507 }
508 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000509}
510
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000511LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000512
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000513LOCAL(Py_ssize_t)
514SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000515{
516 SRE_CODE chr;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000517 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
518 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000519 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000520
521 /* adjust end */
522 if (maxcount < end - ptr && maxcount != 65535)
523 end = ptr + maxcount;
524
525 switch (pattern[0]) {
526
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000527 case SRE_OP_IN:
528 /* repeated set */
529 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
530 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
531 ptr++;
532 break;
533
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000534 case SRE_OP_ANY:
535 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000536 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000537 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
538 ptr++;
539 break;
540
541 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000542 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000543 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000544 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000545 ptr = end;
546 break;
547
548 case SRE_OP_LITERAL:
549 /* repeated literal */
550 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000551 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000552 while (ptr < end && (SRE_CODE) *ptr == chr)
553 ptr++;
554 break;
555
556 case SRE_OP_LITERAL_IGNORE:
557 /* repeated literal */
558 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000559 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000560 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
561 ptr++;
562 break;
563
564 case SRE_OP_NOT_LITERAL:
565 /* repeated non-literal */
566 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000567 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000568 while (ptr < end && (SRE_CODE) *ptr != chr)
569 ptr++;
570 break;
Tim Peters3d563502006-01-21 02:47:53 +0000571
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000572 case SRE_OP_NOT_LITERAL_IGNORE:
573 /* repeated non-literal */
574 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000575 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000576 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
577 ptr++;
578 break;
579
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000580 default:
581 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000582 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000583 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000584 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 if (i < 0)
586 return i;
587 if (!i)
588 break;
589 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000590 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
591 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000592 return (SRE_CHAR*) state->ptr - ptr;
593 }
594
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000595 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000596 return ptr - (SRE_CHAR*) state->ptr;
597}
598
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    SRE_CHAR* end = state->end;
    SRE_CHAR* ptr = state->ptr;
    Py_ssize_t i;

    /* give up early if fewer characters remain than the pattern's
       minimal length */
    if (pattern[3] && (end - ptr) < pattern[3])
        return 0;

    /* without a known prefix longer than one character there is
       nothing more to verify */
    if (!((pattern[2] & SRE_INFO_PREFIX) && pattern[5] > 1))
        return pattern[0];

    /* <length> <skip> <prefix data> <overlap data> */
    i = 0;
    while (i < pattern[5]) {
        if ((SRE_CODE) ptr[i] != pattern[7 + i])
            return 0;
        i++;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000626
/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (in other words,
 * those that will backtrack). In detail:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (apart from atypical cases like ASSERT_NOT), therefore there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: if the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark is pointing to.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 */
/* Snapshot state->lastmark and state->lastindex into the current match
   context.  Expects locals named `state` and `ctx` to be in scope at
   the point of use. */
#define LASTMARK_SAVE()     \
    do { \
        ctx->lastmark = state->lastmark; \
        ctx->lastindex = state->lastindex; \
    } while (0)
/* Put back the values captured by the most recent LASTMARK_SAVE() on
   this context (used after a failed sub-match, before backtracking). */
#define LASTMARK_RESTORE()  \
    do { \
        state->lastmark = ctx->lastmark; \
        state->lastindex = ctx->lastindex; \
    } while (0)
665
/* Exit helpers for the matcher body.  They rely on a local `ret` and a
   label named `exit` in the enclosing function.

   RETURN_ERROR(i)    return i straight away, without going through
                      the `exit` label
   RETURN_FAILURE     ret = 0, then jump to `exit`
   RETURN_SUCCESS     ret = 1, then jump to `exit`

   The RETURN_ON_* forms test a status value: negative always means
   error; RETURN_ON_SUCCESS additionally leaves on i > 0 and
   RETURN_ON_FAILURE on i == 0.  The argument is evaluated more than
   once, so pass a plain variable; it is parenthesized so that any
   expression compares as a whole. */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
676
/* Stringify a macro argument (used to print type names in TRACE). */
#define SFY(x) #x

/* Reserve sizeof(type) bytes on top of state's data stack and make
   `ptr` point at them.

   Besides its parameters the macro uses these locals of the enclosing
   function: `alloc_pos` (receives the offset of the new object),
   `ctx_pos` and `ctx`.  If the stack has to grow, data_stack_grow() is
   called and a negative result is returned from the enclosing function.
   After growing, `ctx` is re-derived from `ctx_pos` (unless ctx_pos is
   -1), since the old pointer may no longer refer to the stack storage.

   TRACE arguments are cast to int so that they agree with the %d
   conversions when tracing is compiled in. */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), (int) alloc_pos, (int) sizeof(type))); \
    if ((state)->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow(state, sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    (ptr) = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)

/* Point `ptr` at the object of the given type stored at byte offset
   `pos` of state's data stack.  Nothing is allocated or copied. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), (int) (pos))); \
    (ptr) = (type*)((state)->data_stack+(pos)); \
} while (0)
699
/* Copy `size` bytes from `data` onto the top of state's data stack.

   Uses the enclosing function's `ctx_pos` and `ctx` locals: if the
   stack must grow, data_stack_grow() is called (a negative result is
   returned from the enclosing function) and `ctx` is re-derived from
   `ctx_pos` unless ctx_pos is -1.  `data` and `size` are deliberately
   re-evaluated after that point, so an argument written in terms of
   `ctx` sees the refreshed pointer. */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           (void *) (data), (int) (state)->data_stack_base, (int) (size))); \
    if ((state)->data_stack_size < (state)->data_stack_base+(size)) { \
        int j = data_stack_grow(state, size); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)

/* Copy the topmost `size` bytes of the data stack into `data`.  The
   bytes are removed from the stack only when `discard` is non-zero;
   otherwise they stay available for another pop. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           (void *) (data), (int) ((state)->data_stack_base-(size)), \
           (int) (size))); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), \
           (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* Drop the topmost `size` bytes of the data stack without copying. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           (int) ((state)->data_stack_base-(size)), (int) (size))); \
    (state)->data_stack_base -= (size); \
} while(0)

/* Shorthands that operate on the local `state`; for the push/pop forms
   `x` is a pointer and the size is that of the object it points to. */
#define DATA_PUSH(x) \
    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
    DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
    DATA_STACK_LOOKUP_AT(state,t,p,pos)
740
/* Save / restore the group mark array (state->mark[0..lastmark]) on
   the data stack.  All four macros do nothing unless lastmark > 0, and
   all of them use the local `state`.

   MARK_PUSH          push marks 0..lastmark; clobbers the local `i`
   MARK_POP           copy the saved marks back and remove them
   MARK_POP_KEEP      copy the saved marks back but leave them pushed
   MARK_POP_DISCARD   remove the saved marks without copying

   The argument is parenthesized so that any expression is taken as a
   whole in both the test and the size computation. */
#define MARK_PUSH(lastmark) \
    do { if ((lastmark) > 0) { \
        i = (lastmark); /* ctx->lastmark may change if reallocated */ \
        DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
    } } while (0)
#define MARK_POP(lastmark) \
    do { if ((lastmark) > 0) { \
        DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 1); \
    } } while (0)
#define MARK_POP_KEEP(lastmark) \
    do { if ((lastmark) > 0) { \
        DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 0); \
    } } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do { if ((lastmark) > 0) { \
        DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
    } } while (0)
758
/* Identifiers stored in SRE_MATCH_CONTEXT.jump by DO_JUMP().  Each
   value names the DO_JUMP() call site that created the context;
   JUMP_NONE marks the outermost context, which was not created by a
   jump.  NOTE(review): the code that dispatches on these values when a
   context finishes is outside this excerpt -- confirm there before
   renumbering. */
#define JUMP_NONE            0
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
773
/* Start matching `nextpattern` in a fresh context, in place of a
   recursive function call.

   A new SRE_MATCH_CONTEXT is allocated on the data stack, linked back
   to the current one through last_ctx_pos, tagged with `jumpvalue`,
   made current (ctx / ctx_pos), and control goes to the `entrance`
   label.  `jumplabel` is defined right after the goto so that execution
   can be resumed at the statement following the DO_JUMP().

   Notes for users:
   - `nextpattern` is evaluated after the allocation, i.e. after `ctx`
     may have been refreshed by DATA_ALLOC, so it may refer to `ctx`.
   - The expansion is several statements, not a do/while(0) wrapper
     (the label has to be visible to a goto from elsewhere in the
     function), so it must not be used as the unbraced body of an
     if/for/while.
   - The macro ends without a line continuation; the following source
     line is not part of it. */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = (jumpvalue); \
    nextctx->pattern = (nextpattern); \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */

785typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000786 Py_ssize_t last_ctx_pos;
787 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000788 SRE_CHAR* ptr;
789 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000790 Py_ssize_t count;
791 Py_ssize_t lastmark;
792 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000793 union {
794 SRE_CODE chr;
795 SRE_REPEAT* rep;
796 } u;
797} SRE_MATCH_CONTEXT;
798
799/* check if string matches the given pattern. returns <0 for
800 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000801LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000802SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000803{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000804 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000805 Py_ssize_t alloc_pos, ctx_pos = -1;
806 Py_ssize_t i, ret = 0;
807 Py_ssize_t jump;
Guido van Rossumb700df92000-03-31 14:59:30 +0000808
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000809 SRE_MATCH_CONTEXT* ctx;
810 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000811
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000812 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000813
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000814 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
815 ctx->last_ctx_pos = -1;
816 ctx->jump = JUMP_NONE;
817 ctx->pattern = pattern;
818 ctx_pos = alloc_pos;
819
820entrance:
821
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000822 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000823
824 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000825 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000826 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000827 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000828 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 (end - ctx->ptr), ctx->pattern[3]));
830 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000831 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000832 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000833 }
834
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000835 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000836
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000837 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000839 case SRE_OP_MARK:
840 /* set mark */
841 /* <MARK> <gid> */
842 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
843 ctx->ptr, ctx->pattern[0]));
844 i = ctx->pattern[0];
845 if (i & 1)
846 state->lastindex = i/2 + 1;
847 if (i > state->lastmark) {
848 /* state->lastmark is the highest valid index in the
849 state->mark array. If it is increased by more than 1,
850 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000851 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000852 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000853 while (j < i)
854 state->mark[j++] = NULL;
855 state->lastmark = i;
856 }
857 state->mark[i] = ctx->ptr;
858 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000859 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000860
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 case SRE_OP_LITERAL:
862 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000863 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000864 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
865 ctx->ptr, *ctx->pattern));
866 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
867 RETURN_FAILURE;
868 ctx->pattern++;
869 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000871
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 case SRE_OP_NOT_LITERAL:
873 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000874 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000875 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
876 ctx->ptr, *ctx->pattern));
877 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
878 RETURN_FAILURE;
879 ctx->pattern++;
880 ctx->ptr++;
881 break;
882
883 case SRE_OP_SUCCESS:
884 /* end of pattern */
885 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
886 state->ptr = ctx->ptr;
887 RETURN_SUCCESS;
888
889 case SRE_OP_AT:
890 /* match at given position */
891 /* <AT> <code> */
892 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
893 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
894 RETURN_FAILURE;
895 ctx->pattern++;
896 break;
897
898 case SRE_OP_CATEGORY:
899 /* match at given category */
900 /* <CATEGORY> <code> */
901 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
902 ctx->ptr, *ctx->pattern));
903 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
904 RETURN_FAILURE;
905 ctx->pattern++;
906 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000907 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000910 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000911 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000912 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
913 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
914 RETURN_FAILURE;
915 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000916 break;
917
918 case SRE_OP_ANY_ALL:
919 /* match anything */
920 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000921 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
922 if (ctx->ptr >= end)
923 RETURN_FAILURE;
924 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000925 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000927 case SRE_OP_IN:
928 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000929 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000930 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
931 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
932 RETURN_FAILURE;
933 ctx->pattern += ctx->pattern[0];
934 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000935 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000936
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000938 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
939 ctx->pattern, ctx->ptr, ctx->pattern[0]));
940 if (ctx->ptr >= end ||
941 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
942 RETURN_FAILURE;
943 ctx->pattern++;
944 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000945 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000946
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000948 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
949 ctx->pattern, ctx->ptr, *ctx->pattern));
950 if (ctx->ptr >= end ||
951 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
952 RETURN_FAILURE;
953 ctx->pattern++;
954 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000955 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000956
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000957 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000958 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
959 if (ctx->ptr >= end
960 || !SRE_CHARSET(ctx->pattern+1,
961 (SRE_CODE)state->lower(*ctx->ptr)))
962 RETURN_FAILURE;
963 ctx->pattern += ctx->pattern[0];
964 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000965 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000966
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000967 case SRE_OP_JUMP:
968 case SRE_OP_INFO:
969 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000970 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000971 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
972 ctx->ptr, ctx->pattern[0]));
973 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000974 break;
975
976 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000977 /* alternation */
978 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000979 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000980 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000981 ctx->u.rep = state->repeat;
982 if (ctx->u.rep)
983 MARK_PUSH(ctx->lastmark);
984 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
985 if (ctx->pattern[1] == SRE_OP_LITERAL &&
986 (ctx->ptr >= end ||
987 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000988 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000989 if (ctx->pattern[1] == SRE_OP_IN &&
990 (ctx->ptr >= end ||
991 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000992 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000993 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000994 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000995 if (ret) {
996 if (ctx->u.rep)
997 MARK_POP_DISCARD(ctx->lastmark);
998 RETURN_ON_ERROR(ret);
999 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001000 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001001 if (ctx->u.rep)
1002 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001003 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001004 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001005 if (ctx->u.rep)
1006 MARK_POP_DISCARD(ctx->lastmark);
1007 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001008
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001009 case SRE_OP_REPEAT_ONE:
1010 /* match repeated sequence (maximizing regexp) */
1011
1012 /* this operator only works if the repeated item is
1013 exactly one character wide, and we're not already
1014 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001015 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001016
1017 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1018
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1020 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001021
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001022 if (ctx->ptr + ctx->pattern[1] > end)
1023 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001024
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001025 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001026
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001027 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1028 RETURN_ON_ERROR(ret);
1029 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1030 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001031 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001032
1033 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035 string. check if the rest of the pattern matches,
1036 and backtrack if not. */
1037
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001038 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001039 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001041 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 state->ptr = ctx->ptr;
1044 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001045 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001046
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001047 LASTMARK_SAVE();
1048
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001049 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001050 /* tail starts with a literal. skip positions where
1051 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001052 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001053 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001054 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001055 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1056 ctx->ptr--;
1057 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001058 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001059 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001062 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1063 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 if (ret) {
1065 RETURN_ON_ERROR(ret);
1066 RETURN_SUCCESS;
1067 }
Tim Peters3d563502006-01-21 02:47:53 +00001068
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001069 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001070
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001071 ctx->ptr--;
1072 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001073 }
1074
1075 } else {
1076 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001077 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001078 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001079 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1080 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 if (ret) {
1082 RETURN_ON_ERROR(ret);
1083 RETURN_SUCCESS;
1084 }
1085 ctx->ptr--;
1086 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001087 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001088 }
1089 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001090 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001091
Guido van Rossum41c99e72003-04-14 17:59:34 +00001092 case SRE_OP_MIN_REPEAT_ONE:
1093 /* match repeated sequence (minimizing regexp) */
1094
1095 /* this operator only works if the repeated item is
1096 exactly one character wide, and we're not already
1097 collecting backtracking points. for other cases,
1098 use the MIN_REPEAT operator */
1099
1100 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1101
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001102 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1103 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001104
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001105 if (ctx->ptr + ctx->pattern[1] > end)
1106 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001107
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001108 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 if (ctx->pattern[1] == 0)
1111 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001112 else {
1113 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001114 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1115 RETURN_ON_ERROR(ret);
1116 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001117 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001118 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001119 RETURN_FAILURE;
1120 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001121 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001122 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001123 }
1124
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001125 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001126 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 state->ptr = ctx->ptr;
1128 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001129
1130 } else {
1131 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001132 LASTMARK_SAVE();
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001133 while ((Py_ssize_t)ctx->pattern[2] == 65535
1134 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001135 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1137 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001138 if (ret) {
1139 RETURN_ON_ERROR(ret);
1140 RETURN_SUCCESS;
1141 }
1142 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001143 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001144 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001145 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001147 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 assert(ret == 1);
1149 ctx->ptr++;
1150 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001151 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001152 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001153 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001154 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001155
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001156 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001157 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001158 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001159 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001160 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1161 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001162
1163 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001164 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001165 if (!ctx->u.rep) {
1166 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001167 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001168 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001169 ctx->u.rep->count = -1;
1170 ctx->u.rep->pattern = ctx->pattern;
1171 ctx->u.rep->prev = state->repeat;
1172 ctx->u.rep->last_ptr = NULL;
1173 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001174
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001175 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001176 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001177 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001178 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001179
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001180 if (ret) {
1181 RETURN_ON_ERROR(ret);
1182 RETURN_SUCCESS;
1183 }
1184 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001185
1186 case SRE_OP_MAX_UNTIL:
1187 /* maximizing repeat */
1188 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1189
1190 /* FIXME: we probably need to deal with zero-width
1191 matches in here... */
1192
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001193 ctx->u.rep = state->repeat;
1194 if (!ctx->u.rep)
1195 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001196
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001198
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001199 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001200
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001201 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1202 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001207 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1208 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 if (ret) {
1210 RETURN_ON_ERROR(ret);
1211 RETURN_SUCCESS;
1212 }
1213 ctx->u.rep->count = ctx->count-1;
1214 state->ptr = ctx->ptr;
1215 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001216 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001217
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001218 if ((ctx->count < ctx->u.rep->pattern[2] ||
1219 ctx->u.rep->pattern[2] == 65535) &&
1220 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001221 /* we may have enough matches, but if we can
1222 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001223 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001224 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 MARK_PUSH(ctx->lastmark);
1226 /* zero-width match protection */
1227 DATA_PUSH(&ctx->u.rep->last_ptr);
1228 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001229 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1230 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001231 DATA_POP(&ctx->u.rep->last_ptr);
1232 if (ret) {
1233 MARK_POP_DISCARD(ctx->lastmark);
1234 RETURN_ON_ERROR(ret);
1235 RETURN_SUCCESS;
1236 }
1237 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001238 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001239 ctx->u.rep->count = ctx->count-1;
1240 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001241 }
1242
1243 /* cannot match more repeated items here. make sure the
1244 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001245 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001246 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001247 RETURN_ON_SUCCESS(ret);
1248 state->repeat = ctx->u.rep;
1249 state->ptr = ctx->ptr;
1250 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001251
1252 case SRE_OP_MIN_UNTIL:
1253 /* minimizing repeat */
1254 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1255
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001256 ctx->u.rep = state->repeat;
1257 if (!ctx->u.rep)
1258 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001259
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001260 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001261
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001262 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001263
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001264 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1265 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001266
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001267 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001270 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1271 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001272 if (ret) {
1273 RETURN_ON_ERROR(ret);
1274 RETURN_SUCCESS;
1275 }
1276 ctx->u.rep->count = ctx->count-1;
1277 state->ptr = ctx->ptr;
1278 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001279 }
1280
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001281 LASTMARK_SAVE();
1282
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001283 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001284 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001285 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001286 if (ret) {
1287 RETURN_ON_ERROR(ret);
1288 RETURN_SUCCESS;
1289 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001290
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001291 state->repeat = ctx->u.rep;
1292 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001293
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001294 LASTMARK_RESTORE();
1295
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001296 if (ctx->count >= ctx->u.rep->pattern[2]
1297 && ctx->u.rep->pattern[2] != 65535)
1298 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001299
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001300 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001301 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1302 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001303 if (ret) {
1304 RETURN_ON_ERROR(ret);
1305 RETURN_SUCCESS;
1306 }
1307 ctx->u.rep->count = ctx->count-1;
1308 state->ptr = ctx->ptr;
1309 RETURN_FAILURE;
1310
1311 case SRE_OP_GROUPREF:
1312 /* match backreference */
1313 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1314 ctx->ptr, ctx->pattern[0]));
1315 i = ctx->pattern[0];
1316 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001317 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001318 if (groupref >= state->lastmark) {
1319 RETURN_FAILURE;
1320 } else {
1321 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1322 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1323 if (!p || !e || e < p)
1324 RETURN_FAILURE;
1325 while (p < e) {
1326 if (ctx->ptr >= end || *ctx->ptr != *p)
1327 RETURN_FAILURE;
1328 p++; ctx->ptr++;
1329 }
1330 }
1331 }
1332 ctx->pattern++;
1333 break;
1334
1335 case SRE_OP_GROUPREF_IGNORE:
1336 /* match backreference */
1337 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1338 ctx->ptr, ctx->pattern[0]));
1339 i = ctx->pattern[0];
1340 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001341 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001342 if (groupref >= state->lastmark) {
1343 RETURN_FAILURE;
1344 } else {
1345 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1346 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1347 if (!p || !e || e < p)
1348 RETURN_FAILURE;
1349 while (p < e) {
1350 if (ctx->ptr >= end ||
1351 state->lower(*ctx->ptr) != state->lower(*p))
1352 RETURN_FAILURE;
1353 p++; ctx->ptr++;
1354 }
1355 }
1356 }
1357 ctx->pattern++;
1358 break;
1359
1360 case SRE_OP_GROUPREF_EXISTS:
1361 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1362 ctx->ptr, ctx->pattern[0]));
1363 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1364 i = ctx->pattern[0];
1365 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001366 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001367 if (groupref >= state->lastmark) {
1368 ctx->pattern += ctx->pattern[1];
1369 break;
1370 } else {
1371 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1372 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1373 if (!p || !e || e < p) {
1374 ctx->pattern += ctx->pattern[1];
1375 break;
1376 }
1377 }
1378 }
1379 ctx->pattern += 2;
1380 break;
1381
1382 case SRE_OP_ASSERT:
1383 /* assert subpattern */
1384 /* <ASSERT> <skip> <back> <pattern> */
1385 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1386 ctx->ptr, ctx->pattern[1]));
1387 state->ptr = ctx->ptr - ctx->pattern[1];
1388 if (state->ptr < state->beginning)
1389 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001390 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001391 RETURN_ON_FAILURE(ret);
1392 ctx->pattern += ctx->pattern[0];
1393 break;
1394
1395 case SRE_OP_ASSERT_NOT:
1396 /* assert not subpattern */
1397 /* <ASSERT_NOT> <skip> <back> <pattern> */
1398 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1399 ctx->ptr, ctx->pattern[1]));
1400 state->ptr = ctx->ptr - ctx->pattern[1];
1401 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001402 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001403 if (ret) {
1404 RETURN_ON_ERROR(ret);
1405 RETURN_FAILURE;
1406 }
1407 }
1408 ctx->pattern += ctx->pattern[0];
1409 break;
1410
1411 case SRE_OP_FAILURE:
1412 /* immediate failure */
1413 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1414 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001415
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001416 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001417 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1418 ctx->pattern[-1]));
1419 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001420 }
1421 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001422
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001423exit:
1424 ctx_pos = ctx->last_ctx_pos;
1425 jump = ctx->jump;
1426 DATA_POP_DISCARD(ctx);
1427 if (ctx_pos == -1)
1428 return ret;
1429 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1430
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001431 switch (jump) {
1432 case JUMP_MAX_UNTIL_2:
1433 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1434 goto jump_max_until_2;
1435 case JUMP_MAX_UNTIL_3:
1436 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1437 goto jump_max_until_3;
1438 case JUMP_MIN_UNTIL_2:
1439 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1440 goto jump_min_until_2;
1441 case JUMP_MIN_UNTIL_3:
1442 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1443 goto jump_min_until_3;
1444 case JUMP_BRANCH:
1445 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1446 goto jump_branch;
1447 case JUMP_MAX_UNTIL_1:
1448 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1449 goto jump_max_until_1;
1450 case JUMP_MIN_UNTIL_1:
1451 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1452 goto jump_min_until_1;
1453 case JUMP_REPEAT:
1454 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1455 goto jump_repeat;
1456 case JUMP_REPEAT_ONE_1:
1457 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1458 goto jump_repeat_one_1;
1459 case JUMP_REPEAT_ONE_2:
1460 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1461 goto jump_repeat_one_2;
1462 case JUMP_MIN_REPEAT_ONE:
1463 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1464 goto jump_min_repeat_one;
1465 case JUMP_ASSERT:
1466 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1467 goto jump_assert;
1468 case JUMP_ASSERT_NOT:
1469 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1470 goto jump_assert_not;
1471 case JUMP_NONE:
1472 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1473 break;
1474 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001475
1476 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001477}
1478
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001479LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001480SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1481{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001482 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1483 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001484 Py_ssize_t status = 0;
1485 Py_ssize_t prefix_len = 0;
1486 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001487 SRE_CODE* prefix = NULL;
1488 SRE_CODE* charset = NULL;
1489 SRE_CODE* overlap = NULL;
1490 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001491
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001492 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001493 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001494 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001495
1496 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001497
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001498 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001499 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001500 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001501 end -= pattern[3]-1;
1502 if (end <= ptr)
1503 end = ptr+1;
1504 }
1505
Fredrik Lundh3562f112000-07-02 12:00:07 +00001506 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001507 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001508 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001509 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001510 prefix_skip = pattern[6];
1511 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001512 overlap = prefix + prefix_len - 1;
1513 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001514 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001515 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001516 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001517
1518 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001519 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001520
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001521 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1522 TRACE(("charset = %p\n", charset));
1523
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001524#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001525 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001526 /* pattern starts with a known prefix. use the overlap
1527 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001528 Py_ssize_t i = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001529 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001530 while (ptr < end) {
1531 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001532 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001533 if (!i)
1534 break;
1535 else
1536 i = overlap[i];
1537 } else {
1538 if (++i == prefix_len) {
1539 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001540 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1541 state->start = ptr + 1 - prefix_len;
1542 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001543 if (flags & SRE_INFO_LITERAL)
1544 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001545 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001546 if (status != 0)
1547 return status;
1548 /* close but no cigar -- try again */
1549 i = overlap[i];
1550 }
1551 break;
1552 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001553 }
1554 ptr++;
1555 }
1556 return 0;
1557 }
1558#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001559
Fredrik Lundh3562f112000-07-02 12:00:07 +00001560 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001561 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001562 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 SRE_CODE chr = pattern[1];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001564 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001565 for (;;) {
1566 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1567 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001568 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001569 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001570 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001571 state->start = ptr;
1572 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001573 if (flags & SRE_INFO_LITERAL)
1574 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001575 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 if (status != 0)
1577 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001578 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001579 } else if (charset) {
1580 /* pattern starts with a character from a known set */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001581 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001582 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001583 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001585 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001587 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 state->start = ptr;
1589 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001590 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 if (status != 0)
1592 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001593 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001594 }
1595 } else
1596 /* general case */
1597 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001598 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001600 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001601 if (status != 0)
1602 break;
1603 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001604
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001605 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001606}
Tim Peters3d563502006-01-21 02:47:53 +00001607
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001608LOCAL(int)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001609SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001610{
1611 /* check if given string is a literal template (i.e. no escapes) */
1612 while (len-- > 0)
1613 if (*ptr++ == '\\')
1614 return 0;
1615 return 1;
1616}
Guido van Rossumb700df92000-03-31 14:59:30 +00001617
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001618#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001619
1620/* -------------------------------------------------------------------- */
1621/* factories and destructors */
1622
1623/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001624static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1625static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001626
1627static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001628sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001629{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001630 return Py_BuildValue("l", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001631}
1632
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001633static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001634sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001635{
1636 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001637 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001638 return NULL;
1639 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001640 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001641 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001642#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001643 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001644#else
1645 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001646#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001647 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001648}
1649
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001650LOCAL(void)
1651state_reset(SRE_STATE* state)
1652{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001653 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001654 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001656 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001657 state->lastindex = -1;
1658
1659 state->repeat = NULL;
1660
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001661 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001662}
1663
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001664static void*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001665getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001666{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001667 /* given a python object, return a data pointer, a length (in
1668 characters), and a character size. return NULL if the object
1669 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001670
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001671 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001672 Py_ssize_t size, bytes;
1673 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001674 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001675
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001676#if defined(HAVE_UNICODE)
1677 if (PyUnicode_Check(string)) {
1678 /* unicode strings doesn't always support the buffer interface */
1679 ptr = (void*) PyUnicode_AS_DATA(string);
1680 bytes = PyUnicode_GET_DATA_SIZE(string);
1681 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001682 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001683
1684 } else {
1685#endif
1686
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001687 /* get pointer to string buffer */
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001688 buffer = Py_Type(string)->tp_as_buffer;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001689 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1690 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001691 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001692 return NULL;
1693 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001694
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001695 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001696 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1697 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001698 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1699 return NULL;
1700 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001701
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001702 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001703#if PY_VERSION_HEX >= 0x01060000
1704 size = PyObject_Size(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001705#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001706 size = PyObject_Length(string);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001707#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001708
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001709 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001710 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001711#if defined(HAVE_UNICODE)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001712 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001713 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001714#endif
1715 else {
1716 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1717 return NULL;
1718 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001719
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001720#if defined(HAVE_UNICODE)
1721 }
1722#endif
1723
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001724 *p_length = size;
1725 *p_charsize = charsize;
1726
1727 return ptr;
1728}
1729
1730LOCAL(PyObject*)
1731state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001732 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001733{
1734 /* prepare state object */
1735
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001736 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001737 int charsize;
1738 void* ptr;
1739
1740 memset(state, 0, sizeof(SRE_STATE));
1741
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001742 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001743 state->lastindex = -1;
1744
1745 ptr = getstring(string, &length, &charsize);
1746 if (!ptr)
1747 return NULL;
1748
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001749 /* adjust boundaries */
1750 if (start < 0)
1751 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001752 else if (start > length)
1753 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001754
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001755 if (end < 0)
1756 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001757 else if (end > length)
1758 end = length;
1759
1760 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001761
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001762 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001763
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001764 state->start = (void*) ((char*) ptr + start * state->charsize);
1765 state->end = (void*) ((char*) ptr + end * state->charsize);
1766
1767 Py_INCREF(string);
1768 state->string = string;
1769 state->pos = start;
1770 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001771
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001772 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001773 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001774 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001775#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001776 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001777#else
1778 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001779#endif
1780 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001781 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001782
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001783 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001784}
1785
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001786LOCAL(void)
1787state_fini(SRE_STATE* state)
1788{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001789 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001790 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001791}
1792
/* calculate offset from start of string: the byte distance between
   <member> and state->beginning, divided by the character size */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1796
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001797LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001798state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001799{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001800 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001801
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001802 index = (index - 1) * 2;
1803
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001804 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001805 if (empty)
1806 /* want empty string */
1807 i = j = 0;
1808 else {
1809 Py_INCREF(Py_None);
1810 return Py_None;
1811 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001812 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001813 i = STATE_OFFSET(state, state->mark[index]);
1814 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001815 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001816
Fredrik Lundh58100642000-08-09 09:14:35 +00001817 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001818}
1819
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001820static void
1821pattern_error(int status)
1822{
1823 switch (status) {
1824 case SRE_ERROR_RECURSION_LIMIT:
1825 PyErr_SetString(
1826 PyExc_RuntimeError,
1827 "maximum recursion limit exceeded"
1828 );
1829 break;
1830 case SRE_ERROR_MEMORY:
1831 PyErr_NoMemory();
1832 break;
1833 default:
1834 /* other error codes indicate compiler/engine bugs */
1835 PyErr_SetString(
1836 PyExc_RuntimeError,
1837 "internal error in regular expression engine"
1838 );
1839 }
1840}
1841
Guido van Rossumb700df92000-03-31 14:59:30 +00001842static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001843pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001844{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001845 if (self->weakreflist != NULL)
1846 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001847 Py_XDECREF(self->pattern);
1848 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001849 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001850 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001851}
1852
1853static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001854pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001855{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001856 SRE_STATE state;
1857 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001858
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001859 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001860 Py_ssize_t start = 0;
1861 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001862 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001863 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001864 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001865 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001866
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001867 string = state_init(&state, self, string, start, end);
1868 if (!string)
1869 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001870
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001871 state.ptr = state.start;
1872
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001873 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1874
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001875 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001876 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001877 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001878#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001879 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001880#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001881 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001882
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001883 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001884 if (PyErr_Occurred())
1885 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001886
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001887 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001888
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001889 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001890}
1891
1892static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001893pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001894{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001895 SRE_STATE state;
1896 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001897
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001898 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001899 Py_ssize_t start = 0;
1900 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001901 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001902 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001903 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001904 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001905
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001906 string = state_init(&state, self, string, start, end);
1907 if (!string)
1908 return NULL;
1909
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001910 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1911
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001912 if (state.charsize == 1) {
1913 status = sre_search(&state, PatternObject_GetCode(self));
1914 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001915#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001917#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001918 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001919
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001920 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1921
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001922 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001923
Thomas Wouters89f507f2006-12-13 04:49:30 +00001924 if (PyErr_Occurred())
1925 return NULL;
1926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001927 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001928}
1929
1930static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001931call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001932{
1933 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001934 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001935 PyObject* func;
1936 PyObject* result;
1937
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001938 if (!args)
1939 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001940 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001941 if (!name)
1942 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001943 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001944 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001945 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001946 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001947 func = PyObject_GetAttrString(mod, function);
1948 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001949 if (!func)
1950 return NULL;
1951 result = PyObject_CallObject(func, args);
1952 Py_DECREF(func);
1953 Py_DECREF(args);
1954 return result;
1955}
1956
#ifdef USE_BUILTIN_COPY
/* Replace *object with the result of copy.deepcopy(*object, memo).
 *
 * On success the old reference held in *object is released and the slot
 * holds the new copy.  A NULL slot is left untouched and counts as
 * success: optional members may legitimately be NULL, and PyTuple_Pack()
 * must not be handed a NULL item.
 *
 * Returns 1 on success, 0 on failure (the slot is unchanged on failure).
 */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* copy;

    if (!*object)
        return 1; /* nothing to copy */

    copy = call(
        "copy", "deepcopy",
        PyTuple_Pack(2, *object, memo)
        );
    if (!copy)
        return 0;

    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
1976
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001977static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00001978join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001979{
1980 /* join list elements */
1981
1982 PyObject* joiner;
1983#if PY_VERSION_HEX >= 0x01060000
1984 PyObject* function;
1985 PyObject* args;
1986#endif
1987 PyObject* result;
1988
1989 switch (PyList_GET_SIZE(list)) {
1990 case 0:
1991 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00001992 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001993 case 1:
1994 result = PyList_GET_ITEM(list, 0);
1995 Py_INCREF(result);
1996 Py_DECREF(list);
1997 return result;
1998 }
1999
2000 /* two or more elements: slice out a suitable separator from the
2001 first member, and use that to join the entire list */
2002
2003 joiner = PySequence_GetSlice(pattern, 0, 0);
2004 if (!joiner)
2005 return NULL;
2006
2007#if PY_VERSION_HEX >= 0x01060000
2008 function = PyObject_GetAttrString(joiner, "join");
2009 if (!function) {
2010 Py_DECREF(joiner);
2011 return NULL;
2012 }
2013 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002014 if (!args) {
2015 Py_DECREF(function);
2016 Py_DECREF(joiner);
2017 return NULL;
2018 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002019 PyTuple_SET_ITEM(args, 0, list);
2020 result = PyObject_CallObject(function, args);
2021 Py_DECREF(args); /* also removes list */
2022 Py_DECREF(function);
2023#else
2024 result = call(
2025 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002026 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002027 );
2028#endif
2029 Py_DECREF(joiner);
2030
2031 return result;
2032}
2033
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002034static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002035pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002036{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002037 SRE_STATE state;
2038 PyObject* list;
2039 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002040 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002041
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002042 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002043 Py_ssize_t start = 0;
2044 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002045 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002046 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002047 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002048 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002049
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 string = state_init(&state, self, string, start, end);
2051 if (!string)
2052 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002053
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002054 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002055 if (!list) {
2056 state_fini(&state);
2057 return NULL;
2058 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002059
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002061
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002062 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002063
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002064 state_reset(&state);
2065
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002066 state.ptr = state.start;
2067
2068 if (state.charsize == 1) {
2069 status = sre_search(&state, PatternObject_GetCode(self));
2070 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002071#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002072 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002073#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002074 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002075
Thomas Wouters89f507f2006-12-13 04:49:30 +00002076 if (PyErr_Occurred())
2077 goto error;
2078
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002079 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002080 if (status == 0)
2081 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002082 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002083 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002084 }
Tim Peters3d563502006-01-21 02:47:53 +00002085
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002086 /* don't bother to build a match object */
2087 switch (self->groups) {
2088 case 0:
2089 b = STATE_OFFSET(&state, state.start);
2090 e = STATE_OFFSET(&state, state.ptr);
2091 item = PySequence_GetSlice(string, b, e);
2092 if (!item)
2093 goto error;
2094 break;
2095 case 1:
2096 item = state_getslice(&state, 1, string, 1);
2097 if (!item)
2098 goto error;
2099 break;
2100 default:
2101 item = PyTuple_New(self->groups);
2102 if (!item)
2103 goto error;
2104 for (i = 0; i < self->groups; i++) {
2105 PyObject* o = state_getslice(&state, i+1, string, 1);
2106 if (!o) {
2107 Py_DECREF(item);
2108 goto error;
2109 }
2110 PyTuple_SET_ITEM(item, i, o);
2111 }
2112 break;
2113 }
2114
2115 status = PyList_Append(list, item);
2116 Py_DECREF(item);
2117 if (status < 0)
2118 goto error;
2119
2120 if (state.ptr == state.start)
2121 state.start = (void*) ((char*) state.ptr + state.charsize);
2122 else
2123 state.start = state.ptr;
2124
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002125 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002126
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002127 state_fini(&state);
2128 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002129
2130error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002131 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002132 state_fini(&state);
2133 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002134
Guido van Rossumb700df92000-03-31 14:59:30 +00002135}
2136
#if PY_VERSION_HEX >= 0x02020000
/* finditer(...): build a scanner for the given arguments and wrap its
 * bound "search" method in a callable-iterator that stops when the
 * method returns None.
 *
 * Returns a new iterator object, or NULL on failure.
 */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    PyObject* scanner;
    PyObject* search;
    PyObject* iterator = NULL;

    scanner = pattern_scanner(pattern, args);
    if (scanner) {
        search = PyObject_GetAttrString(scanner, "search");
        /* the bound method (if obtained) is all we hold on to */
        Py_DECREF(scanner);
        if (search) {
            iterator = PyCallIter_New(search, Py_None);
            Py_DECREF(search);
        }
    }

    return iterator;
}
#endif
2160
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002161static PyObject*
2162pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2163{
2164 SRE_STATE state;
2165 PyObject* list;
2166 PyObject* item;
2167 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002168 Py_ssize_t n;
2169 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002170 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002171
2172 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002173 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002174 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002175 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002176 &string, &maxsplit))
2177 return NULL;
2178
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002179 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002180 if (!string)
2181 return NULL;
2182
2183 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002184 if (!list) {
2185 state_fini(&state);
2186 return NULL;
2187 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002188
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002189 n = 0;
2190 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002191
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002192 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002193
2194 state_reset(&state);
2195
2196 state.ptr = state.start;
2197
2198 if (state.charsize == 1) {
2199 status = sre_search(&state, PatternObject_GetCode(self));
2200 } else {
2201#if defined(HAVE_UNICODE)
2202 status = sre_usearch(&state, PatternObject_GetCode(self));
2203#endif
2204 }
2205
Thomas Wouters89f507f2006-12-13 04:49:30 +00002206 if (PyErr_Occurred())
2207 goto error;
2208
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002209 if (status <= 0) {
2210 if (status == 0)
2211 break;
2212 pattern_error(status);
2213 goto error;
2214 }
Tim Peters3d563502006-01-21 02:47:53 +00002215
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002216 if (state.start == state.ptr) {
2217 if (last == state.end)
2218 break;
2219 /* skip one character */
2220 state.start = (void*) ((char*) state.ptr + state.charsize);
2221 continue;
2222 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002223
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002224 /* get segment before this match */
2225 item = PySequence_GetSlice(
2226 string, STATE_OFFSET(&state, last),
2227 STATE_OFFSET(&state, state.start)
2228 );
2229 if (!item)
2230 goto error;
2231 status = PyList_Append(list, item);
2232 Py_DECREF(item);
2233 if (status < 0)
2234 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002235
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002236 /* add groups (if any) */
2237 for (i = 0; i < self->groups; i++) {
2238 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002239 if (!item)
2240 goto error;
2241 status = PyList_Append(list, item);
2242 Py_DECREF(item);
2243 if (status < 0)
2244 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002245 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002246
2247 n = n + 1;
2248
2249 last = state.start = state.ptr;
2250
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002251 }
2252
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002253 /* get segment following last match (even if empty) */
2254 item = PySequence_GetSlice(
2255 string, STATE_OFFSET(&state, last), state.endpos
2256 );
2257 if (!item)
2258 goto error;
2259 status = PyList_Append(list, item);
2260 Py_DECREF(item);
2261 if (status < 0)
2262 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002263
2264 state_fini(&state);
2265 return list;
2266
2267error:
2268 Py_DECREF(list);
2269 state_fini(&state);
2270 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002271
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002272}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002273
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002274static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002275pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002276 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002277{
2278 SRE_STATE state;
2279 PyObject* list;
2280 PyObject* item;
2281 PyObject* filter;
2282 PyObject* args;
2283 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002284 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002285 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002286 Py_ssize_t n;
2287 Py_ssize_t i, b, e;
2288 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002289 int filter_is_callable;
2290
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002291 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002292 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002293 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002294 Py_INCREF(filter);
2295 filter_is_callable = 1;
2296 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002297 /* if not callable, check if it's a literal string */
2298 int literal;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002299 ptr = getstring(ptemplate, &n, &bint);
2300 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002301 if (ptr) {
2302 if (b == 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002303 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002304 } else {
2305#if defined(HAVE_UNICODE)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002306 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002307#endif
2308 }
2309 } else {
2310 PyErr_Clear();
2311 literal = 0;
2312 }
2313 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002314 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002315 Py_INCREF(filter);
2316 filter_is_callable = 0;
2317 } else {
2318 /* not a literal; hand it over to the template compiler */
2319 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002320 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002321 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002322 );
2323 if (!filter)
2324 return NULL;
2325 filter_is_callable = PyCallable_Check(filter);
2326 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002327 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002328
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002329 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002330 if (!string) {
2331 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002332 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002333 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002334
2335 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002336 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002337 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002338 state_fini(&state);
2339 return NULL;
2340 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002341
2342 n = i = 0;
2343
2344 while (!count || n < count) {
2345
2346 state_reset(&state);
2347
2348 state.ptr = state.start;
2349
2350 if (state.charsize == 1) {
2351 status = sre_search(&state, PatternObject_GetCode(self));
2352 } else {
2353#if defined(HAVE_UNICODE)
2354 status = sre_usearch(&state, PatternObject_GetCode(self));
2355#endif
2356 }
2357
Thomas Wouters89f507f2006-12-13 04:49:30 +00002358 if (PyErr_Occurred())
2359 goto error;
2360
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002361 if (status <= 0) {
2362 if (status == 0)
2363 break;
2364 pattern_error(status);
2365 goto error;
2366 }
Tim Peters3d563502006-01-21 02:47:53 +00002367
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002368 b = STATE_OFFSET(&state, state.start);
2369 e = STATE_OFFSET(&state, state.ptr);
2370
2371 if (i < b) {
2372 /* get segment before this match */
2373 item = PySequence_GetSlice(string, i, b);
2374 if (!item)
2375 goto error;
2376 status = PyList_Append(list, item);
2377 Py_DECREF(item);
2378 if (status < 0)
2379 goto error;
2380
2381 } else if (i == b && i == e && n > 0)
2382 /* ignore empty match on latest position */
2383 goto next;
2384
2385 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002386 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002387 match = pattern_new_match(self, &state, 1);
2388 if (!match)
2389 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002390 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002391 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002392 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002393 goto error;
2394 }
2395 item = PyObject_CallObject(filter, args);
2396 Py_DECREF(args);
2397 Py_DECREF(match);
2398 if (!item)
2399 goto error;
2400 } else {
2401 /* filter is literal string */
2402 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002403 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002404 }
2405
2406 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002407 if (item != Py_None) {
2408 status = PyList_Append(list, item);
2409 Py_DECREF(item);
2410 if (status < 0)
2411 goto error;
2412 }
Tim Peters3d563502006-01-21 02:47:53 +00002413
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002414 i = e;
2415 n = n + 1;
2416
2417next:
2418 /* move on */
2419 if (state.ptr == state.start)
2420 state.start = (void*) ((char*) state.ptr + state.charsize);
2421 else
2422 state.start = state.ptr;
2423
2424 }
2425
2426 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002427 if (i < state.endpos) {
2428 item = PySequence_GetSlice(string, i, state.endpos);
2429 if (!item)
2430 goto error;
2431 status = PyList_Append(list, item);
2432 Py_DECREF(item);
2433 if (status < 0)
2434 goto error;
2435 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002436
2437 state_fini(&state);
2438
Guido van Rossum4e173842001-12-07 04:25:10 +00002439 Py_DECREF(filter);
2440
Fredrik Lundhdac58492001-10-21 21:48:30 +00002441 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002442 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002443
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002444 if (!item)
2445 return NULL;
2446
2447 if (subn)
2448 return Py_BuildValue("Ni", item, n);
2449
2450 return item;
2451
2452error:
2453 Py_DECREF(list);
2454 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002455 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002456 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002457
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002458}
2459
2460static PyObject*
2461pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2462{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002463 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002464 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002465 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002466 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002467 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002468 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002469 return NULL;
2470
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002471 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002472}
2473
2474static PyObject*
2475pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2476{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002477 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002478 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002479 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002480 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002481 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002482 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002483 return NULL;
2484
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002485 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002486}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002487
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002488static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002489pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002490{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002491#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002492 PatternObject* copy;
2493 int offset;
2494
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002495 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2496 if (!copy)
2497 return NULL;
2498
2499 offset = offsetof(PatternObject, groups);
2500
2501 Py_XINCREF(self->groupindex);
2502 Py_XINCREF(self->indexgroup);
2503 Py_XINCREF(self->pattern);
2504
2505 memcpy((char*) copy + offset, (char*) self + offset,
2506 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002507 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002508
2509 return (PyObject*) copy;
2510#else
2511 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2512 return NULL;
2513#endif
2514}
2515
2516static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002517pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002518{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002519#ifdef USE_BUILTIN_COPY
2520 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002521
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002522 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002523 if (!copy)
2524 return NULL;
2525
2526 if (!deepcopy(&copy->groupindex, memo) ||
2527 !deepcopy(&copy->indexgroup, memo) ||
2528 !deepcopy(&copy->pattern, memo)) {
2529 Py_DECREF(copy);
2530 return NULL;
2531 }
2532
2533#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002534 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2535 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002536#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002537}
2538
/* Docstrings for the pattern object and its methods; each *_doc symbol
   below is referenced from the method table or the type object. */
PyDoc_STRVAR(pattern_match_doc,
"match(string[, pos[, endpos]]) --> match object or None.\n\
    Matches zero or more characters at the beginning of the string");

PyDoc_STRVAR(pattern_search_doc,
"search(string[, pos[, endpos]]) --> match object or None.\n\
    Scan through string looking for a match, and return a corresponding\n\
    MatchObject instance. Return None if no position in the string matches.");

PyDoc_STRVAR(pattern_split_doc,
"split(string[, maxsplit = 0])  --> list.\n\
    Split string by the occurrences of pattern.");

PyDoc_STRVAR(pattern_findall_doc,
"findall(string[, pos[, endpos]]) --> list.\n\
   Return a list of all non-overlapping matches of pattern in string.");

PyDoc_STRVAR(pattern_finditer_doc,
"finditer(string[, pos[, endpos]]) --> iterator.\n\
    Return an iterator over all non-overlapping matches for the \n\
    RE pattern in string. For each match, the iterator returns a\n\
    match object.");

PyDoc_STRVAR(pattern_sub_doc,
"sub(repl, string[, count = 0]) --> newstring\n\
    Return the string obtained by replacing the leftmost non-overlapping\n\
    occurrences of pattern in string by the replacement repl.");

PyDoc_STRVAR(pattern_subn_doc,
"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
    Return the tuple (new_string, number_of_subs_made) found by replacing\n\
    the leftmost non-overlapping occurrences of pattern with the\n\
    replacement repl.");

PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2574
/* Method table for pattern objects; looked up by name from
   pattern_getattr() via Py_FindMethod(). */
static PyMethodDef pattern_methods[] = {
    {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
        pattern_match_doc},
    {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
        pattern_search_doc},
    {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
        pattern_sub_doc},
    {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
        pattern_subn_doc},
    {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
        pattern_split_doc},
    {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
        pattern_findall_doc},
#if PY_VERSION_HEX >= 0x02020000
    /* finditer takes positional arguments only */
    {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
        pattern_finditer_doc},
#endif
    /* the remaining entries carry no docstring */
    {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
    {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
    {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
    {NULL, NULL} /* sentinel */
};
2597
Tim Peters3d563502006-01-21 02:47:53 +00002598static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002599pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002600{
2601 PyObject* res;
2602
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002603 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002604
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002605 if (res)
2606 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002607
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002608 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002609
2610 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002611 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002612 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002613 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002614 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002615
2616 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002617 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002618
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002619 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002620 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002621
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002622 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002623 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002624 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002625 }
2626
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002627 PyErr_SetString(PyExc_AttributeError, name);
2628 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002629}
2630
/* Type object for compiled patterns.  Instances are variable-sized: the
   item size is sizeof(SRE_CODE), one item per code word. */
static PyTypeObject Pattern_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "_" SRE_MODULE ".SRE_Pattern",
    sizeof(PatternObject), sizeof(SRE_CODE),
    (destructor)pattern_dealloc, /*tp_dealloc*/
    0,                                  /*tp_print*/
    (getattrfunc)pattern_getattr,       /*tp_getattr*/
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    0,                                  /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    pattern_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    /* where the weak reference list lives inside each instance */
    offsetof(PatternObject, weakreflist),       /* tp_weaklistoffset */
};
2657
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002658static PyObject *
2659_compile(PyObject* self_, PyObject* args)
2660{
2661 /* "compile" pattern descriptor to pattern object */
2662
2663 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002664 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002665
2666 PyObject* pattern;
2667 int flags = 0;
2668 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002669 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002670 PyObject* groupindex = NULL;
2671 PyObject* indexgroup = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002672 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002673 &PyList_Type, &code, &groups,
2674 &groupindex, &indexgroup))
2675 return NULL;
2676
2677 n = PyList_GET_SIZE(code);
2678
2679 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2680 if (!self)
2681 return NULL;
2682
2683 self->codesize = n;
2684
2685 for (i = 0; i < n; i++) {
2686 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002687 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002688 self->code[i] = (SRE_CODE) value;
2689 if ((unsigned long) self->code[i] != value) {
2690 PyErr_SetString(PyExc_OverflowError,
2691 "regular expression code size limit exceeded");
2692 break;
2693 }
2694 }
2695
2696 if (PyErr_Occurred()) {
2697 PyObject_DEL(self);
2698 return NULL;
2699 }
2700
2701 Py_INCREF(pattern);
2702 self->pattern = pattern;
2703
2704 self->flags = flags;
2705
2706 self->groups = groups;
2707
2708 Py_XINCREF(groupindex);
2709 self->groupindex = groupindex;
2710
2711 Py_XINCREF(indexgroup);
2712 self->indexgroup = indexgroup;
2713
2714 self->weakreflist = NULL;
2715
2716 return (PyObject*) self;
2717}
2718
Guido van Rossumb700df92000-03-31 14:59:30 +00002719/* -------------------------------------------------------------------- */
2720/* match methods */
2721
2722static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002723match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002724{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002725 Py_XDECREF(self->regs);
2726 Py_XDECREF(self->string);
2727 Py_DECREF(self->pattern);
2728 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002729}
2730
2731static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002732match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002733{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002734 if (index < 0 || index >= self->groups) {
2735 /* raise IndexError if we were given a bad group number */
2736 PyErr_SetString(
2737 PyExc_IndexError,
2738 "no such group"
2739 );
2740 return NULL;
2741 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002742
Fredrik Lundh6f013982000-07-03 18:44:21 +00002743 index *= 2;
2744
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002745 if (self->string == Py_None || self->mark[index] < 0) {
2746 /* return default value if the string or group is undefined */
2747 Py_INCREF(def);
2748 return def;
2749 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002750
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002751 return PySequence_GetSlice(
2752 self->string, self->mark[index], self->mark[index+1]
2753 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002754}
2755
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002756static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002757match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002758{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002759 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002760
Guido van Rossumddefaf32007-01-14 03:31:43 +00002761 if (index == NULL)
2762 /* Default value */
2763 return 0;
2764
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002765 if (PyInt_Check(index))
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002766 return PyInt_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002767
Fredrik Lundh6f013982000-07-03 18:44:21 +00002768 i = -1;
2769
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002770 if (self->pattern->groupindex) {
2771 index = PyObject_GetItem(self->pattern->groupindex, index);
2772 if (index) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002773 if (PyInt_Check(index) || PyLong_Check(index))
2774 i = PyInt_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00002775 Py_DECREF(index);
2776 } else
2777 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002778 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002779
2780 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002781}
2782
2783static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002784match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002785{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002786 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002787}
2788
2789static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002790match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002791{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002792 /* delegate to Python code */
2793 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002794 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002795 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002796 );
2797}
2798
2799static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002800match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002801{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002802 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002803 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002804
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002805 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002806
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002807 switch (size) {
2808 case 0:
2809 result = match_getslice(self, Py_False, Py_None);
2810 break;
2811 case 1:
2812 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2813 break;
2814 default:
2815 /* fetch multiple items */
2816 result = PyTuple_New(size);
2817 if (!result)
2818 return NULL;
2819 for (i = 0; i < size; i++) {
2820 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002821 self, PyTuple_GET_ITEM(args, i), Py_None
2822 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002823 if (!item) {
2824 Py_DECREF(result);
2825 return NULL;
2826 }
2827 PyTuple_SET_ITEM(result, i, item);
2828 }
2829 break;
2830 }
2831 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002832}
2833
2834static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002835match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002836{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002837 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002838 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002839
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002840 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002841 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002842 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002843 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002844
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002845 result = PyTuple_New(self->groups-1);
2846 if (!result)
2847 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002848
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002849 for (index = 1; index < self->groups; index++) {
2850 PyObject* item;
2851 item = match_getslice_by_index(self, index, def);
2852 if (!item) {
2853 Py_DECREF(result);
2854 return NULL;
2855 }
2856 PyTuple_SET_ITEM(result, index-1, item);
2857 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002858
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002859 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002860}
2861
2862static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002863match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002864{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002865 PyObject* result;
2866 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002867 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002868
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002869 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002870 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002871 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002872 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002873
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002874 result = PyDict_New();
2875 if (!result || !self->pattern->groupindex)
2876 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002877
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002878 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002879 if (!keys)
2880 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002881
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002882 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002883 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002884 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002885 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002886 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002887 if (!key)
2888 goto failed;
2889 value = match_getslice(self, key, def);
2890 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002891 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002892 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002893 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002894 status = PyDict_SetItem(result, key, value);
2895 Py_DECREF(value);
2896 if (status < 0)
2897 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002898 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002899
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002900 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002901
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002902 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002903
2904failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00002905 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002906 Py_DECREF(result);
2907 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002908}
2909
2910static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002911match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002912{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002913 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002914
Guido van Rossumddefaf32007-01-14 03:31:43 +00002915 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002916 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002917 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002918
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002919 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002920
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002921 if (index < 0 || index >= self->groups) {
2922 PyErr_SetString(
2923 PyExc_IndexError,
2924 "no such group"
2925 );
2926 return NULL;
2927 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002928
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002929 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002930 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002931}
2932
2933static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002934match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002935{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002936 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002937
Guido van Rossumddefaf32007-01-14 03:31:43 +00002938 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002939 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002940 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002941
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002942 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002943
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002944 if (index < 0 || index >= self->groups) {
2945 PyErr_SetString(
2946 PyExc_IndexError,
2947 "no such group"
2948 );
2949 return NULL;
2950 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002951
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002952 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002953 return Py_BuildValue("i", self->mark[index*2+1]);
2954}
2955
2956LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002957_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002958{
2959 PyObject* pair;
2960 PyObject* item;
2961
2962 pair = PyTuple_New(2);
2963 if (!pair)
2964 return NULL;
2965
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002966 item = PyInt_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002967 if (!item)
2968 goto error;
2969 PyTuple_SET_ITEM(pair, 0, item);
2970
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002971 item = PyInt_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002972 if (!item)
2973 goto error;
2974 PyTuple_SET_ITEM(pair, 1, item);
2975
2976 return pair;
2977
2978 error:
2979 Py_DECREF(pair);
2980 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002981}
2982
2983static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002984match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002985{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002986 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002987
Guido van Rossumddefaf32007-01-14 03:31:43 +00002988 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002989 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002990 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002991
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002992 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002993
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002994 if (index < 0 || index >= self->groups) {
2995 PyErr_SetString(
2996 PyExc_IndexError,
2997 "no such group"
2998 );
2999 return NULL;
3000 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003001
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003002 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003003 return _pair(self->mark[index*2], self->mark[index*2+1]);
3004}
3005
3006static PyObject*
3007match_regs(MatchObject* self)
3008{
3009 PyObject* regs;
3010 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003011 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003012
3013 regs = PyTuple_New(self->groups);
3014 if (!regs)
3015 return NULL;
3016
3017 for (index = 0; index < self->groups; index++) {
3018 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3019 if (!item) {
3020 Py_DECREF(regs);
3021 return NULL;
3022 }
3023 PyTuple_SET_ITEM(regs, index, item);
3024 }
3025
3026 Py_INCREF(regs);
3027 self->regs = regs;
3028
3029 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003030}
3031
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003032static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003033match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003034{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003035#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003036 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003037 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003038
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003039 slots = 2 * (self->pattern->groups+1);
3040
3041 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3042 if (!copy)
3043 return NULL;
3044
3045 /* this value a constant, but any compiler should be able to
3046 figure that out all by itself */
3047 offset = offsetof(MatchObject, string);
3048
3049 Py_XINCREF(self->pattern);
3050 Py_XINCREF(self->string);
3051 Py_XINCREF(self->regs);
3052
3053 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003054 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003055
3056 return (PyObject*) copy;
3057#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003058 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003059 return NULL;
3060#endif
3061}
3062
3063static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003064match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003065{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003066#ifdef USE_BUILTIN_COPY
3067 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003068
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003069 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003070 if (!copy)
3071 return NULL;
3072
3073 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3074 !deepcopy(&copy->string, memo) ||
3075 !deepcopy(&copy->regs, memo)) {
3076 Py_DECREF(copy);
3077 return NULL;
3078 }
3079
3080#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003081 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3082 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003083#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003084}
3085
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003086static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003087 {"group", (PyCFunction) match_group, METH_VARARGS},
3088 {"start", (PyCFunction) match_start, METH_VARARGS},
3089 {"end", (PyCFunction) match_end, METH_VARARGS},
3090 {"span", (PyCFunction) match_span, METH_VARARGS},
3091 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3092 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003093 {"expand", (PyCFunction) match_expand, METH_O},
3094 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3095 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003096 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003097};
3098
Tim Peters3d563502006-01-21 02:47:53 +00003099static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003100match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003101{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003102 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003103
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003104 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3105 if (res)
3106 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003107
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003108 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003109
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003110 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003111 if (self->lastindex >= 0)
3112 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003113 Py_INCREF(Py_None);
3114 return Py_None;
3115 }
3116
3117 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003118 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003119 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003120 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003121 );
3122 if (result)
3123 return result;
3124 PyErr_Clear();
3125 }
3126 Py_INCREF(Py_None);
3127 return Py_None;
3128 }
3129
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003130 if (!strcmp(name, "string")) {
3131 if (self->string) {
3132 Py_INCREF(self->string);
3133 return self->string;
3134 } else {
3135 Py_INCREF(Py_None);
3136 return Py_None;
3137 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003138 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003139
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003140 if (!strcmp(name, "regs")) {
3141 if (self->regs) {
3142 Py_INCREF(self->regs);
3143 return self->regs;
3144 } else
3145 return match_regs(self);
3146 }
3147
3148 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003149 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003150 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003151 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003152
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003153 if (!strcmp(name, "pos"))
3154 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003155
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003156 if (!strcmp(name, "endpos"))
3157 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003158
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003159 PyErr_SetString(PyExc_AttributeError, name);
3160 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003161}
3162
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3165
Neal Norwitz57c179c2006-03-22 07:18:02 +00003166static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003167 PyVarObject_HEAD_INIT(NULL,0)
3168 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003169 sizeof(MatchObject), sizeof(Py_ssize_t),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003170 (destructor)match_dealloc, /*tp_dealloc*/
3171 0, /*tp_print*/
3172 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003173};
3174
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003175static PyObject*
3176pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3177{
3178 /* create match object (from state object) */
3179
3180 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003181 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003182 char* base;
3183 int n;
3184
3185 if (status > 0) {
3186
3187 /* create match object (with room for extra group marks) */
3188 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3189 2*(pattern->groups+1));
3190 if (!match)
3191 return NULL;
3192
3193 Py_INCREF(pattern);
3194 match->pattern = pattern;
3195
3196 Py_INCREF(state->string);
3197 match->string = state->string;
3198
3199 match->regs = NULL;
3200 match->groups = pattern->groups+1;
3201
3202 /* fill in group slices */
3203
3204 base = (char*) state->beginning;
3205 n = state->charsize;
3206
3207 match->mark[0] = ((char*) state->start - base) / n;
3208 match->mark[1] = ((char*) state->ptr - base) / n;
3209
3210 for (i = j = 0; i < pattern->groups; i++, j+=2)
3211 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3212 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3213 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3214 } else
3215 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3216
3217 match->pos = state->pos;
3218 match->endpos = state->endpos;
3219
3220 match->lastindex = state->lastindex;
3221
3222 return (PyObject*) match;
3223
3224 } else if (status == 0) {
3225
3226 /* no match */
3227 Py_INCREF(Py_None);
3228 return Py_None;
3229
3230 }
3231
3232 /* internal error */
3233 pattern_error(status);
3234 return NULL;
3235}
3236
3237
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003238/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003239/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003240
3241static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003242scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003243{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003244 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003245 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003246 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003247}
3248
3249static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003250scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003251{
3252 SRE_STATE* state = &self->state;
3253 PyObject* match;
3254 int status;
3255
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003256 state_reset(state);
3257
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003258 state->ptr = state->start;
3259
3260 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003261 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003262 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003263#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003264 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003265#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003266 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003267 if (PyErr_Occurred())
3268 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003269
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003270 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003271 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003272
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003273 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003274 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003275 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003276 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003277
3278 return match;
3279}
3280
3281
3282static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003283scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003284{
3285 SRE_STATE* state = &self->state;
3286 PyObject* match;
3287 int status;
3288
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003289 state_reset(state);
3290
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003291 state->ptr = state->start;
3292
3293 if (state->charsize == 1) {
3294 status = sre_search(state, PatternObject_GetCode(self->pattern));
3295 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003296#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003297 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003298#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003299 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003300 if (PyErr_Occurred())
3301 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003302
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003303 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003304 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003305
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003306 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003307 state->start = (void*) ((char*) state->ptr + state->charsize);
3308 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003309 state->start = state->ptr;
3310
3311 return match;
3312}
3313
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003314static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003315 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3316 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003317 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003318};
3319
Tim Peters3d563502006-01-21 02:47:53 +00003320static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003321scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003322{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003323 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003324
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003325 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3326 if (res)
3327 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003328
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003329 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003330
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003331 /* attributes */
3332 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003333 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003334 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003335 }
3336
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003337 PyErr_SetString(PyExc_AttributeError, name);
3338 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003339}
3340
Neal Norwitz57c179c2006-03-22 07:18:02 +00003341static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003342 PyVarObject_HEAD_INIT(NULL, 0)
3343 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003344 sizeof(ScannerObject), 0,
3345 (destructor)scanner_dealloc, /*tp_dealloc*/
3346 0, /*tp_print*/
3347 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003348};
3349
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003350static PyObject*
3351pattern_scanner(PatternObject* pattern, PyObject* args)
3352{
3353 /* create search state object */
3354
3355 ScannerObject* self;
3356
3357 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003358 Py_ssize_t start = 0;
3359 Py_ssize_t end = PY_SSIZE_T_MAX;
3360 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003361 return NULL;
3362
3363 /* create scanner object */
3364 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3365 if (!self)
3366 return NULL;
3367
3368 string = state_init(&self->state, pattern, string, start, end);
3369 if (!string) {
3370 PyObject_DEL(self);
3371 return NULL;
3372 }
3373
3374 Py_INCREF(pattern);
3375 self->pattern = (PyObject*) pattern;
3376
3377 return (PyObject*) self;
3378}
3379
Guido van Rossumb700df92000-03-31 14:59:30 +00003380static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003381 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003382 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003383 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003384 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003385};
3386
Mark Hammond8235ea12002-07-19 06:55:41 +00003387PyMODINIT_FUNC init_sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003388{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003389 PyObject* m;
3390 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003391 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003392
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003393 /* Initialize object types */
3394 if (PyType_Ready(&Pattern_Type) < 0)
3395 return;
3396 if (PyType_Ready(&Match_Type) < 0)
3397 return;
3398 if (PyType_Ready(&Scanner_Type) < 0)
3399 return;
Guido van Rossumb700df92000-03-31 14:59:30 +00003400
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003401 m = Py_InitModule("_" SRE_MODULE, _functions);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003402 if (m == NULL)
3403 return;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003404 d = PyModule_GetDict(m);
3405
Fredrik Lundh21009b92001-09-18 18:47:09 +00003406 x = PyInt_FromLong(SRE_MAGIC);
3407 if (x) {
3408 PyDict_SetItemString(d, "MAGIC", x);
3409 Py_DECREF(x);
3410 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003411
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003412 x = PyInt_FromLong(sizeof(SRE_CODE));
3413 if (x) {
3414 PyDict_SetItemString(d, "CODESIZE", x);
3415 Py_DECREF(x);
3416 }
3417
Fredrik Lundh21009b92001-09-18 18:47:09 +00003418 x = PyString_FromString(copyright);
3419 if (x) {
3420 PyDict_SetItemString(d, "copyright", x);
3421 Py_DECREF(x);
3422 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003423}
3424
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003425#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003426
3427/* vim:ts=4:sw=4:et
3428*/