blob: 20f98ca4591b20f6b8296134354e32a071924e06 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000071#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000072
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000073/* enables copy/deepcopy handling (work in progress) */
74#undef USE_BUILTIN_COPY
75
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000076#if PY_VERSION_HEX < 0x01060000
77#define PyObject_DEL(op) PyMem_DEL((op))
78#endif
79
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000080/* -------------------------------------------------------------------- */
81
Fredrik Lundh80946112000-06-29 18:03:25 +000082#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000083#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000084#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000085/* fastest possible local call under MSVC */
86#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000087#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000088#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089#else
90#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000091#endif
92
93/* error codes */
94#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000095#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000096#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000097#define SRE_ERROR_MEMORY -9 /* out of memory */
98
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000099#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000100#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#else
102#define TRACE(v)
103#endif
104
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000105/* -------------------------------------------------------------------- */
106/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000107
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000108/* default character predicates (run sre_chars.py to regenerate tables) */
109
110#define SRE_DIGIT_MASK 1
111#define SRE_SPACE_MASK 2
112#define SRE_LINEBREAK_MASK 4
113#define SRE_ALNUM_MASK 8
114#define SRE_WORD_MASK 16
115
Fredrik Lundh21009b92001-09-18 18:47:09 +0000116/* FIXME: this assumes ASCII. create tables in init_sre() instead */
117
/* Character-class flags for the 128 ASCII code points, indexed by code
   point.  Each entry is an OR of the SRE_*_MASK bits defined in this file
   (1 = digit, 2 = space, 4 = linebreak, 8 = alnum, 16 = word).
   Sixteen entries per row; the row comment gives the first index. */
static char sre_char_info[128] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x20 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /* 0x60 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};
125
/* ASCII lower-casing table, indexed by code point.  Entries 65..90
   ('A'..'Z') hold 97..122 ('a'..'z'); every other entry holds its own
   index.  Sixteen entries per row; the row comment gives the first index. */
static char sre_char_lower[128] = {
    /* 0x00 */   0,   1,   2,   3,   4,   5,   6,   7,
                 8,   9,  10,  11,  12,  13,  14,  15,
    /* 0x10 */  16,  17,  18,  19,  20,  21,  22,  23,
                24,  25,  26,  27,  28,  29,  30,  31,
    /* 0x20 */  32,  33,  34,  35,  36,  37,  38,  39,
                40,  41,  42,  43,  44,  45,  46,  47,
    /* 0x30 */  48,  49,  50,  51,  52,  53,  54,  55,
                56,  57,  58,  59,  60,  61,  62,  63,
    /* 0x40 */  64,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122,  91,  92,  93,  94,  95,
    /* 0x60 */  96,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122, 123, 124, 125, 126, 127
};
135
/* ASCII-only character predicates.  For code points below 128 each one
   yields the selected mask bit from sre_char_info (so the result is the
   mask value or 0, not necessarily 1); for anything else it yields 0.
   Note that the argument is evaluated more than once. */
#define SRE_ASCII_FLAG(ch, mask) \
    ((ch) < 128 ? (sre_char_info[(ch)] & (mask)) : 0)

#define SRE_IS_DIGIT(ch)     SRE_ASCII_FLAG((ch), SRE_DIGIT_MASK)
#define SRE_IS_SPACE(ch)     SRE_ASCII_FLAG((ch), SRE_SPACE_MASK)
#define SRE_IS_LINEBREAK(ch) SRE_ASCII_FLAG((ch), SRE_LINEBREAK_MASK)
#define SRE_IS_ALNUM(ch)     SRE_ASCII_FLAG((ch), SRE_ALNUM_MASK)
#define SRE_IS_WORD(ch)      SRE_ASCII_FLAG((ch), SRE_WORD_MASK)
Guido van Rossumb700df92000-03-31 14:59:30 +0000146
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000147static unsigned int sre_lower(unsigned int ch)
148{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000149 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000150}
151
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000152/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000153/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
154 * warnings when c's type supports only numbers < N+1 */
/* Locale-aware predicates built on <ctype.h>.  SRE_LOC_BYTE_TEST calls
   the given classifier only when no bit above the low eight is set in ch
   (i.e. ch is in 0..255 for an unsigned ch); otherwise it yields 0. */
#define SRE_LOC_BYTE_TEST(classify, ch) \
    (!((ch) & ~255) ? classify((ch)) : 0)

#define SRE_LOC_IS_DIGIT(ch)     SRE_LOC_BYTE_TEST(isdigit, (ch))
#define SRE_LOC_IS_SPACE(ch)     SRE_LOC_BYTE_TEST(isspace, (ch))
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch)     SRE_LOC_BYTE_TEST(isalnum, (ch))
/* a word character is any alphanumeric or the underscore */
#define SRE_LOC_IS_WORD(ch)      (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
160
/* Lower-case a code point with the C library's tolower().  Only values
   below 256 are passed to tolower(); larger values are returned as-is. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256)
        return (unsigned int) tolower((ch));
    return ch;
}
165
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000166/* unicode-specific character predicates */
167
168#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000169
/* Unicode predicates: thin wrappers over the Py_UNICODE_IS* macros.  The
   argument is cast to Py_UNICODE before being classified. */
#define SRE_UNI_CHAR(ch)         ((Py_UNICODE)(ch))

#define SRE_UNI_IS_DIGIT(ch)     Py_UNICODE_ISDIGIT(SRE_UNI_CHAR(ch))
#define SRE_UNI_IS_SPACE(ch)     Py_UNICODE_ISSPACE(SRE_UNI_CHAR(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(SRE_UNI_CHAR(ch))
#define SRE_UNI_IS_ALNUM(ch)     Py_UNICODE_ISALNUM(SRE_UNI_CHAR(ch))
/* a word character is any alphanumeric or the underscore */
#define SRE_UNI_IS_WORD(ch)      (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000175
176static unsigned int sre_lower_unicode(unsigned int ch)
177{
178 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
179}
180
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000181#endif
182
Guido van Rossumb700df92000-03-31 14:59:30 +0000183LOCAL(int)
184sre_category(SRE_CODE category, unsigned int ch)
185{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000186 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000187
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000188 case SRE_CATEGORY_DIGIT:
189 return SRE_IS_DIGIT(ch);
190 case SRE_CATEGORY_NOT_DIGIT:
191 return !SRE_IS_DIGIT(ch);
192 case SRE_CATEGORY_SPACE:
193 return SRE_IS_SPACE(ch);
194 case SRE_CATEGORY_NOT_SPACE:
195 return !SRE_IS_SPACE(ch);
196 case SRE_CATEGORY_WORD:
197 return SRE_IS_WORD(ch);
198 case SRE_CATEGORY_NOT_WORD:
199 return !SRE_IS_WORD(ch);
200 case SRE_CATEGORY_LINEBREAK:
201 return SRE_IS_LINEBREAK(ch);
202 case SRE_CATEGORY_NOT_LINEBREAK:
203 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000204
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000205 case SRE_CATEGORY_LOC_WORD:
206 return SRE_LOC_IS_WORD(ch);
207 case SRE_CATEGORY_LOC_NOT_WORD:
208 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000209
210#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000211 case SRE_CATEGORY_UNI_DIGIT:
212 return SRE_UNI_IS_DIGIT(ch);
213 case SRE_CATEGORY_UNI_NOT_DIGIT:
214 return !SRE_UNI_IS_DIGIT(ch);
215 case SRE_CATEGORY_UNI_SPACE:
216 return SRE_UNI_IS_SPACE(ch);
217 case SRE_CATEGORY_UNI_NOT_SPACE:
218 return !SRE_UNI_IS_SPACE(ch);
219 case SRE_CATEGORY_UNI_WORD:
220 return SRE_UNI_IS_WORD(ch);
221 case SRE_CATEGORY_UNI_NOT_WORD:
222 return !SRE_UNI_IS_WORD(ch);
223 case SRE_CATEGORY_UNI_LINEBREAK:
224 return SRE_UNI_IS_LINEBREAK(ch);
225 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
226 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000227#else
228 case SRE_CATEGORY_UNI_DIGIT:
229 return SRE_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_NOT_DIGIT:
231 return !SRE_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_SPACE:
233 return SRE_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_NOT_SPACE:
235 return !SRE_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_WORD:
237 return SRE_LOC_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_NOT_WORD:
239 return !SRE_LOC_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_LINEBREAK:
241 return SRE_IS_LINEBREAK(ch);
242 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
243 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000244#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000245 }
246 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000247}
248
249/* helpers */
250
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000251static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000253{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000254 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000255 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000256 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000257 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000258 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000259}
260
261static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000262data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000263{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000264 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000265 minsize = state->data_stack_base+size;
266 cursize = state->data_stack_size;
267 if (cursize < minsize) {
268 void* stack;
269 cursize = minsize+minsize/4+1024;
270 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000271 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000273 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000274 return SRE_ERROR_MEMORY;
275 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000277 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000278 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000279 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000280}
281
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000282/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000283
284#define SRE_CHAR unsigned char
285#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000286#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000287#define SRE_CHARSET sre_charset
288#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000289#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000290#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000291#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000292#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000293
294#if defined(HAVE_UNICODE)
295
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000297#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000298#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000299
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000300#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000301#undef SRE_SEARCH
302#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000303#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000304#undef SRE_INFO
305#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000306#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000307#undef SRE_AT
308#undef SRE_CHAR
309
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000310/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000311
312#define SRE_CHAR Py_UNICODE
313#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000314#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000315#define SRE_CHARSET sre_ucharset
316#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000317#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000318#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000319#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000320#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000321#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000322
323#endif /* SRE_RECURSIVE */
324
325/* -------------------------------------------------------------------- */
326/* String matching engine */
327
328/* the following section is compiled twice, with different character
329 settings */
330
331LOCAL(int)
332SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
333{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000334 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000335
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000336 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000338 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000341 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 case SRE_AT_BEGINNING_LINE:
345 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000346 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000348 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000349 return (((void*) (ptr+1) == state->end &&
350 SRE_IS_LINEBREAK((int) ptr[0])) ||
351 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000353 case SRE_AT_END_LINE:
354 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000355 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000356
Fredrik Lundh770617b2001-01-14 15:06:11 +0000357 case SRE_AT_END_STRING:
358 return ((void*) ptr == state->end);
359
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000360 case SRE_AT_BOUNDARY:
361 if (state->beginning == state->end)
362 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000363 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000364 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000366 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000368
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000369 case SRE_AT_NON_BOUNDARY:
370 if (state->beginning == state->end)
371 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000373 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000375 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000377
378 case SRE_AT_LOC_BOUNDARY:
379 if (state->beginning == state->end)
380 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000382 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000384 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000386
387 case SRE_AT_LOC_NON_BOUNDARY:
388 if (state->beginning == state->end)
389 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000390 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000391 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000392 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000393 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000394 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000395
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000396#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000397 case SRE_AT_UNI_BOUNDARY:
398 if (state->beginning == state->end)
399 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000400 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000401 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000402 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000403 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000404 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000405
406 case SRE_AT_UNI_NON_BOUNDARY:
407 if (state->beginning == state->end)
408 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000409 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000410 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000411 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000412 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000413 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000414#endif
415
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000416 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000417
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000418 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000419}
420
421LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000422SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000423{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000424 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000425
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000427
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000428 for (;;) {
429 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000430
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000431 case SRE_OP_FAILURE:
432 return !ok;
433
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000434 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000435 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000436 if (ch == set[0])
437 return ok;
438 set++;
439 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000440
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000441 case SRE_OP_CATEGORY:
442 /* <CATEGORY> <code> */
443 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000444 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000445 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000446 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000447
Fredrik Lundh3562f112000-07-02 12:00:07 +0000448 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000449 if (sizeof(SRE_CODE) == 2) {
450 /* <CHARSET> <bitmap> (16 bits per code word) */
451 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
452 return ok;
453 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000454 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000455 else {
456 /* <CHARSET> <bitmap> (32 bits per code word) */
457 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
458 return ok;
459 set += 8;
460 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000461 break;
462
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000463 case SRE_OP_RANGE:
464 /* <RANGE> <lower> <upper> */
465 if (set[0] <= ch && ch <= set[1])
466 return ok;
467 set += 2;
468 break;
469
470 case SRE_OP_NEGATE:
471 ok = !ok;
472 break;
473
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000474 case SRE_OP_BIGCHARSET:
475 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
476 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000477 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000478 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000479
480 if (sizeof(SRE_CODE) == 2) {
481 block = ((unsigned char*)set)[ch >> 8];
482 set += 128;
483 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
484 return ok;
485 set += count*16;
486 }
487 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000488 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
489 * warnings when c's type supports only numbers < N+1 */
490 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000491 block = ((unsigned char*)set)[ch >> 8];
492 else
493 block = -1;
494 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000495 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000496 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
497 return ok;
498 set += count*8;
499 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000500 break;
501 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000502
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000503 default:
504 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000505 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000506 return 0;
507 }
508 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000509}
510
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000511LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000512
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000513LOCAL(Py_ssize_t)
514SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000515{
516 SRE_CODE chr;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000517 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
518 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000519 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000520
521 /* adjust end */
522 if (maxcount < end - ptr && maxcount != 65535)
523 end = ptr + maxcount;
524
525 switch (pattern[0]) {
526
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000527 case SRE_OP_IN:
528 /* repeated set */
529 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
530 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
531 ptr++;
532 break;
533
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000534 case SRE_OP_ANY:
535 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000536 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000537 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
538 ptr++;
539 break;
540
541 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000542 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000543 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000544 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000545 ptr = end;
546 break;
547
548 case SRE_OP_LITERAL:
549 /* repeated literal */
550 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000551 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000552 while (ptr < end && (SRE_CODE) *ptr == chr)
553 ptr++;
554 break;
555
556 case SRE_OP_LITERAL_IGNORE:
557 /* repeated literal */
558 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000559 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000560 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
561 ptr++;
562 break;
563
564 case SRE_OP_NOT_LITERAL:
565 /* repeated non-literal */
566 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000567 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000568 while (ptr < end && (SRE_CODE) *ptr != chr)
569 ptr++;
570 break;
Tim Peters3d563502006-01-21 02:47:53 +0000571
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000572 case SRE_OP_NOT_LITERAL_IGNORE:
573 /* repeated non-literal */
574 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000575 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000576 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
577 ptr++;
578 break;
579
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000580 default:
581 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000582 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000583 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000584 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 if (i < 0)
586 return i;
587 if (!i)
588 break;
589 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000590 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
591 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000592 return (SRE_CHAR*) state->ptr - ptr;
593 }
594
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000595 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000596 return ptr - (SRE_CHAR*) state->ptr;
597}
598
#if 0 /* not used in this release */
/* Check whether an SRE_OP_INFO block matches at the current position
   (state->ptr).

   Returns the number of SRE_CODE objects to skip if successful, and 0 if
   there is no match: either fewer characters remain than the minimal
   length in pattern[3], or the known prefix does not match. */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    SRE_CHAR* limit = state->end;
    SRE_CHAR* cur = state->ptr;
    Py_ssize_t k;

    /* check minimal length */
    if (pattern[3] && (limit - cur) < pattern[3])
        return 0;

    /* no usable prefix (flag clear, or prefix length of 0 or 1):
       skip just the block */
    if (!(pattern[2] & SRE_INFO_PREFIX) || pattern[5] <= 1)
        return pattern[0];

    /* check known prefix */
    /* <length> <skip> <prefix data> <overlap data> */
    for (k = 0; k < pattern[5]; k++) {
        if ((SRE_CODE) cur[k] != pattern[7 + k])
            return 0;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000626
/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (in other words,
 * those that will backtrack). The cases are:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (apart from atypical cases like ASSERT_NOT), so there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: if the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark is pointing to.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 */
/* Copy the match state's group bookkeeping (lastmark, lastindex) into the
   current context frame.  Expects `state` and `ctx` to be in scope. */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastindex = state->lastindex; \
        ctx->lastmark = state->lastmark; \
    } while (0)

/* Copy the lastmark/lastindex values held in the current context frame
   back into the match state.  Expects `state` and `ctx` to be in scope. */
#define LASTMARK_RESTORE() \
    do { \
        state->lastindex = ctx->lastindex; \
        state->lastmark = ctx->lastmark; \
    } while (0)
665
/* Exit helpers for the matcher.  RETURN_FAILURE and RETURN_SUCCESS expect
 * a variable `ret` and a label `exit` in the enclosing function.
 *
 *   RETURN_ERROR(i)      - return i from the enclosing function at once.
 *   RETURN_FAILURE       - set ret = 0 and jump to `exit`.
 *   RETURN_SUCCESS       - set ret = 1 and jump to `exit`.
 *   RETURN_ON_ERROR(i)   - RETURN_ERROR(i) when i is negative.
 *   RETURN_ON_SUCCESS(i) - as RETURN_ON_ERROR, then RETURN_SUCCESS when i > 0.
 *   RETURN_ON_FAILURE(i) - as RETURN_ON_ERROR, then RETURN_FAILURE when i == 0.
 *
 * The argument is parenthesized at every use so that an expression such as
 * `a & b` is tested as a whole.  It is still evaluated more than once, so
 * it must not have side effects.
 */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
676
/* Turn a macro argument into a string literal (used for trace output). */
#define SFY(x) #x

/* Reserve sizeof(type) bytes on top of the data stack and point `ptr` at
 * them.
 *
 * Expects `alloc_pos`, `ctx_pos` and `ctx` in the enclosing scope:
 *   - alloc_pos receives the byte offset of the new item;
 *   - when the stack has to grow, data_stack_grow() is called and a
 *     negative result is returned from the enclosing function;
 *   - after growing, `ctx` is looked up again from ctx_pos (unless ctx_pos
 *     is -1) before it is used further.
 *
 * Macro parameters are parenthesized, and trace arguments are cast to
 * match their conversion specifiers.
 */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %ld (%ld)\n", \
           SFY(type), (long)alloc_pos, (long)sizeof(type))); \
    if ((state)->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    (ptr) = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)

/* Point `ptr` at the item of the given type stored at byte offset `pos`
 * of the data stack.  Nothing is allocated or copied. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %ld\n", SFY(type), (long)(pos))); \
    (ptr) = (type*)((state)->data_stack+(pos)); \
} while (0)
699
/* Append `size` bytes starting at `data` to the top of the data stack.
 *
 * Expects `ctx_pos` and `ctx` in the enclosing scope.  When the stack has
 * to grow, data_stack_grow() is called and a negative result is returned
 * from the enclosing function; afterwards `ctx` is looked up again from
 * ctx_pos (unless ctx_pos is -1).  `data` and `size` are evaluated again
 * after that lookup, so an argument written in terms of `ctx` uses the
 * refreshed pointer for the copy.
 */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %ld (%ld)\n", \
           (void*)(data), (long)((state)->data_stack_base), (long)(size))); \
    if ((state)->data_stack_size < (state)->data_stack_base+(size)) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)

/* Copy the top `size` bytes of the data stack to `data`.  The bytes are
 * removed from the stack only when `discard` is non-zero. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %ld (%ld)\n", \
           (void*)(data), (long)((state)->data_stack_base-(size)), \
           (long)(size))); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), \
           (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* Remove the top `size` bytes of the data stack without copying them. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %ld (%ld)\n", \
           (long)((state)->data_stack_base-(size)), (long)(size))); \
    (state)->data_stack_base -= (size); \
} while(0)
729
/* Shorthand forms of the DATA_STACK_* macros that always operate on the
 * variable `state` of the enclosing scope.  For the push/pop forms the
 * argument is a pointer and the size is taken from the pointed-to object.
 */
#define DATA_PUSH(obj)         DATA_STACK_PUSH(state, (obj), sizeof(*(obj)))
#define DATA_POP(obj)          DATA_STACK_POP(state, (obj), sizeof(*(obj)), 1)
#define DATA_POP_DISCARD(obj)  DATA_STACK_POP_DISCARD(state, sizeof(*(obj)))
#define DATA_ALLOC(type, dest) DATA_STACK_ALLOC(state, type, dest)
#define DATA_LOOKUP_AT(type, dest, offset) \
    DATA_STACK_LOOKUP_AT(state, type, dest, offset)
740
/* Save / restore the first (lastmark + 1) entries of state->mark on the
 * data stack.  All four macros do nothing unless lastmark is positive.
 * The argument is parenthesized at every use so that any expression can
 * be passed.
 *
 *   MARK_PUSH(lastmark)        - push the marks (uses the variable `i` of
 *                                the enclosing scope as scratch).
 *   MARK_POP(lastmark)         - copy the marks back and drop them.
 *   MARK_POP_KEEP(lastmark)    - copy the marks back, keep them stacked.
 *   MARK_POP_DISCARD(lastmark) - drop the stacked marks without copying.
 */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, ((i+1)*sizeof(void*))); \
        } \
    } while (0)
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           (((lastmark)+1)*sizeof(void*)), 1); \
        } \
    } while (0)
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           (((lastmark)+1)*sizeof(void*)), 0); \
        } \
    } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, (((lastmark)+1)*sizeof(void*))); \
        } \
    } while (0)
758
/* Codes stored in the `jump` field of a match context.  DO_JUMP records
   one of these in every frame it creates; the first frame of a match is
   given JUMP_NONE. */
#define JUMP_NONE            0
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
773
/* DO_JUMP(jumpvalue, jumplabel, nextpattern)
 *
 * Start a nested match without a C-level recursive call:
 *   1. allocate a new SRE_MATCH_CONTEXT frame (nextctx) on the data stack;
 *   2. record in it the position of the current frame (ctx_pos), the
 *      JUMP_* code `jumpvalue`, and the pattern to run (`nextpattern`);
 *   3. make the new frame current (ctx_pos = alloc_pos, ctx = nextctx);
 *   4. jump to the `entrance` label.
 *
 * `jumplabel` is defined right after the goto, so the statements that
 * follow a DO_JUMP invocation run only once control is transferred to
 * that label.  NOTE(review): presumably the function's exit path does
 * this based on the stored jump code -- that code is outside this section.
 *
 * This expands to several statements and is NOT wrapped in
 * do { } while (0); never use it as the unbraced body of an if/else or
 * loop.  The expansion ends in `while (0)` so that the invocation's own
 * semicolon completes a statement after the label.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = jumpvalue; \
    nextctx->pattern = nextpattern; \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */

785typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000786 Py_ssize_t last_ctx_pos;
787 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000788 SRE_CHAR* ptr;
789 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000790 Py_ssize_t count;
791 Py_ssize_t lastmark;
792 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000793 union {
794 SRE_CODE chr;
795 SRE_REPEAT* rep;
796 } u;
797} SRE_MATCH_CONTEXT;
798
799/* check if string matches the given pattern. returns <0 for
800 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000801LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000802SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000803{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000804 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000805 Py_ssize_t alloc_pos, ctx_pos = -1;
806 Py_ssize_t i, ret = 0;
807 Py_ssize_t jump;
Guido van Rossumb700df92000-03-31 14:59:30 +0000808
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000809 SRE_MATCH_CONTEXT* ctx;
810 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000811
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000812 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000813
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000814 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
815 ctx->last_ctx_pos = -1;
816 ctx->jump = JUMP_NONE;
817 ctx->pattern = pattern;
818 ctx_pos = alloc_pos;
819
820entrance:
821
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000822 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000823
824 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000825 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000826 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000827 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000828 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 (end - ctx->ptr), ctx->pattern[3]));
830 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000831 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000832 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000833 }
834
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000835 for (;;) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000836
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000837 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000839 case SRE_OP_MARK:
840 /* set mark */
841 /* <MARK> <gid> */
842 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
843 ctx->ptr, ctx->pattern[0]));
844 i = ctx->pattern[0];
845 if (i & 1)
846 state->lastindex = i/2 + 1;
847 if (i > state->lastmark) {
848 /* state->lastmark is the highest valid index in the
849 state->mark array. If it is increased by more than 1,
850 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000851 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000852 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000853 while (j < i)
854 state->mark[j++] = NULL;
855 state->lastmark = i;
856 }
857 state->mark[i] = ctx->ptr;
858 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000859 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000860
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 case SRE_OP_LITERAL:
862 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000863 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000864 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
865 ctx->ptr, *ctx->pattern));
866 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
867 RETURN_FAILURE;
868 ctx->pattern++;
869 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000871
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 case SRE_OP_NOT_LITERAL:
873 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000874 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000875 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
876 ctx->ptr, *ctx->pattern));
877 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
878 RETURN_FAILURE;
879 ctx->pattern++;
880 ctx->ptr++;
881 break;
882
883 case SRE_OP_SUCCESS:
884 /* end of pattern */
885 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
886 state->ptr = ctx->ptr;
887 RETURN_SUCCESS;
888
889 case SRE_OP_AT:
890 /* match at given position */
891 /* <AT> <code> */
892 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
893 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
894 RETURN_FAILURE;
895 ctx->pattern++;
896 break;
897
898 case SRE_OP_CATEGORY:
899 /* match at given category */
900 /* <CATEGORY> <code> */
901 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
902 ctx->ptr, *ctx->pattern));
903 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
904 RETURN_FAILURE;
905 ctx->pattern++;
906 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000907 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000910 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000911 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000912 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
913 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
914 RETURN_FAILURE;
915 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000916 break;
917
918 case SRE_OP_ANY_ALL:
919 /* match anything */
920 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000921 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
922 if (ctx->ptr >= end)
923 RETURN_FAILURE;
924 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000925 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000927 case SRE_OP_IN:
928 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000929 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000930 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
931 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
932 RETURN_FAILURE;
933 ctx->pattern += ctx->pattern[0];
934 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000935 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000936
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000938 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
939 ctx->pattern, ctx->ptr, ctx->pattern[0]));
940 if (ctx->ptr >= end ||
941 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
942 RETURN_FAILURE;
943 ctx->pattern++;
944 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000945 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000946
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000948 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
949 ctx->pattern, ctx->ptr, *ctx->pattern));
950 if (ctx->ptr >= end ||
951 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
952 RETURN_FAILURE;
953 ctx->pattern++;
954 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000955 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000956
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000957 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000958 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
959 if (ctx->ptr >= end
960 || !SRE_CHARSET(ctx->pattern+1,
961 (SRE_CODE)state->lower(*ctx->ptr)))
962 RETURN_FAILURE;
963 ctx->pattern += ctx->pattern[0];
964 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000965 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000966
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000967 case SRE_OP_JUMP:
968 case SRE_OP_INFO:
969 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000970 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000971 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
972 ctx->ptr, ctx->pattern[0]));
973 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000974 break;
975
976 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000977 /* alternation */
978 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000979 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000980 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000981 ctx->u.rep = state->repeat;
982 if (ctx->u.rep)
983 MARK_PUSH(ctx->lastmark);
984 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
985 if (ctx->pattern[1] == SRE_OP_LITERAL &&
986 (ctx->ptr >= end ||
987 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000988 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000989 if (ctx->pattern[1] == SRE_OP_IN &&
990 (ctx->ptr >= end ||
991 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000992 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000993 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000994 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000995 if (ret) {
996 if (ctx->u.rep)
997 MARK_POP_DISCARD(ctx->lastmark);
998 RETURN_ON_ERROR(ret);
999 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001000 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001001 if (ctx->u.rep)
1002 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001003 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001004 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001005 if (ctx->u.rep)
1006 MARK_POP_DISCARD(ctx->lastmark);
1007 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001008
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001009 case SRE_OP_REPEAT_ONE:
1010 /* match repeated sequence (maximizing regexp) */
1011
1012 /* this operator only works if the repeated item is
1013 exactly one character wide, and we're not already
1014 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001015 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001016
1017 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1018
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1020 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001021
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001022 if (ctx->ptr + ctx->pattern[1] > end)
1023 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001024
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001025 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001026
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001027 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1028 RETURN_ON_ERROR(ret);
1029 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1030 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001031 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001032
1033 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035 string. check if the rest of the pattern matches,
1036 and backtrack if not. */
1037
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001038 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001039 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001041 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 state->ptr = ctx->ptr;
1044 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001045 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001046
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001047 LASTMARK_SAVE();
1048
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001049 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001050 /* tail starts with a literal. skip positions where
1051 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001052 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001053 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001054 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001055 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1056 ctx->ptr--;
1057 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001058 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001059 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001062 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1063 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 if (ret) {
1065 RETURN_ON_ERROR(ret);
1066 RETURN_SUCCESS;
1067 }
Tim Peters3d563502006-01-21 02:47:53 +00001068
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001069 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001070
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001071 ctx->ptr--;
1072 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001073 }
1074
1075 } else {
1076 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001077 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001078 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001079 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1080 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 if (ret) {
1082 RETURN_ON_ERROR(ret);
1083 RETURN_SUCCESS;
1084 }
1085 ctx->ptr--;
1086 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001087 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001088 }
1089 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001090 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001091
Guido van Rossum41c99e72003-04-14 17:59:34 +00001092 case SRE_OP_MIN_REPEAT_ONE:
1093 /* match repeated sequence (minimizing regexp) */
1094
1095 /* this operator only works if the repeated item is
1096 exactly one character wide, and we're not already
1097 collecting backtracking points. for other cases,
1098 use the MIN_REPEAT operator */
1099
1100 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1101
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001102 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1103 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001104
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001105 if (ctx->ptr + ctx->pattern[1] > end)
1106 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001107
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001108 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 if (ctx->pattern[1] == 0)
1111 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001112 else {
1113 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001114 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1115 RETURN_ON_ERROR(ret);
1116 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001117 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001118 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001119 RETURN_FAILURE;
1120 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001121 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001122 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001123 }
1124
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001125 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001126 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 state->ptr = ctx->ptr;
1128 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001129
1130 } else {
1131 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001132 LASTMARK_SAVE();
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001133 while ((Py_ssize_t)ctx->pattern[2] == 65535
1134 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001135 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1137 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001138 if (ret) {
1139 RETURN_ON_ERROR(ret);
1140 RETURN_SUCCESS;
1141 }
1142 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001143 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001144 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001145 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001147 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 assert(ret == 1);
1149 ctx->ptr++;
1150 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001151 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001152 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001153 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001154 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001155
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001156 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001157 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001158 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001159 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001160 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1161 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001162
1163 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001164 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001165 if (!ctx->u.rep) {
1166 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001167 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001168 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001169 ctx->u.rep->count = -1;
1170 ctx->u.rep->pattern = ctx->pattern;
1171 ctx->u.rep->prev = state->repeat;
1172 ctx->u.rep->last_ptr = NULL;
1173 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001174
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001175 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001176 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001177 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001178 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001179
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001180 if (ret) {
1181 RETURN_ON_ERROR(ret);
1182 RETURN_SUCCESS;
1183 }
1184 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001185
1186 case SRE_OP_MAX_UNTIL:
1187 /* maximizing repeat */
1188 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1189
1190 /* FIXME: we probably need to deal with zero-width
1191 matches in here... */
1192
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001193 ctx->u.rep = state->repeat;
1194 if (!ctx->u.rep)
1195 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001196
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001198
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001199 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001200
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001201 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1202 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001207 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1208 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 if (ret) {
1210 RETURN_ON_ERROR(ret);
1211 RETURN_SUCCESS;
1212 }
1213 ctx->u.rep->count = ctx->count-1;
1214 state->ptr = ctx->ptr;
1215 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001216 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001217
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001218 if ((ctx->count < ctx->u.rep->pattern[2] ||
1219 ctx->u.rep->pattern[2] == 65535) &&
1220 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001221 /* we may have enough matches, but if we can
1222 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001223 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001224 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 MARK_PUSH(ctx->lastmark);
1226 /* zero-width match protection */
1227 DATA_PUSH(&ctx->u.rep->last_ptr);
1228 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001229 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1230 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001231 DATA_POP(&ctx->u.rep->last_ptr);
1232 if (ret) {
1233 MARK_POP_DISCARD(ctx->lastmark);
1234 RETURN_ON_ERROR(ret);
1235 RETURN_SUCCESS;
1236 }
1237 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001238 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001239 ctx->u.rep->count = ctx->count-1;
1240 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001241 }
1242
1243 /* cannot match more repeated items here. make sure the
1244 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001245 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001246 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001247 RETURN_ON_SUCCESS(ret);
1248 state->repeat = ctx->u.rep;
1249 state->ptr = ctx->ptr;
1250 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001251
1252 case SRE_OP_MIN_UNTIL:
1253 /* minimizing repeat */
1254 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1255
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001256 ctx->u.rep = state->repeat;
1257 if (!ctx->u.rep)
1258 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001259
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001260 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001261
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001262 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001263
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001264 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1265 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001266
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001267 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001270 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1271 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001272 if (ret) {
1273 RETURN_ON_ERROR(ret);
1274 RETURN_SUCCESS;
1275 }
1276 ctx->u.rep->count = ctx->count-1;
1277 state->ptr = ctx->ptr;
1278 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001279 }
1280
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001281 LASTMARK_SAVE();
1282
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001283 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001284 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001285 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001286 if (ret) {
1287 RETURN_ON_ERROR(ret);
1288 RETURN_SUCCESS;
1289 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001290
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001291 state->repeat = ctx->u.rep;
1292 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001293
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001294 LASTMARK_RESTORE();
1295
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001296 if (ctx->count >= ctx->u.rep->pattern[2]
1297 && ctx->u.rep->pattern[2] != 65535)
1298 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001299
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001300 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001301 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1302 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001303 if (ret) {
1304 RETURN_ON_ERROR(ret);
1305 RETURN_SUCCESS;
1306 }
1307 ctx->u.rep->count = ctx->count-1;
1308 state->ptr = ctx->ptr;
1309 RETURN_FAILURE;
1310
1311 case SRE_OP_GROUPREF:
1312 /* match backreference */
1313 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1314 ctx->ptr, ctx->pattern[0]));
1315 i = ctx->pattern[0];
1316 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001317 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001318 if (groupref >= state->lastmark) {
1319 RETURN_FAILURE;
1320 } else {
1321 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1322 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1323 if (!p || !e || e < p)
1324 RETURN_FAILURE;
1325 while (p < e) {
1326 if (ctx->ptr >= end || *ctx->ptr != *p)
1327 RETURN_FAILURE;
1328 p++; ctx->ptr++;
1329 }
1330 }
1331 }
1332 ctx->pattern++;
1333 break;
1334
1335 case SRE_OP_GROUPREF_IGNORE:
1336 /* match backreference */
1337 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1338 ctx->ptr, ctx->pattern[0]));
1339 i = ctx->pattern[0];
1340 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001341 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001342 if (groupref >= state->lastmark) {
1343 RETURN_FAILURE;
1344 } else {
1345 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1346 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1347 if (!p || !e || e < p)
1348 RETURN_FAILURE;
1349 while (p < e) {
1350 if (ctx->ptr >= end ||
1351 state->lower(*ctx->ptr) != state->lower(*p))
1352 RETURN_FAILURE;
1353 p++; ctx->ptr++;
1354 }
1355 }
1356 }
1357 ctx->pattern++;
1358 break;
1359
1360 case SRE_OP_GROUPREF_EXISTS:
1361 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1362 ctx->ptr, ctx->pattern[0]));
1363 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1364 i = ctx->pattern[0];
1365 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001366 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001367 if (groupref >= state->lastmark) {
1368 ctx->pattern += ctx->pattern[1];
1369 break;
1370 } else {
1371 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1372 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1373 if (!p || !e || e < p) {
1374 ctx->pattern += ctx->pattern[1];
1375 break;
1376 }
1377 }
1378 }
1379 ctx->pattern += 2;
1380 break;
1381
1382 case SRE_OP_ASSERT:
1383 /* assert subpattern */
1384 /* <ASSERT> <skip> <back> <pattern> */
1385 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1386 ctx->ptr, ctx->pattern[1]));
1387 state->ptr = ctx->ptr - ctx->pattern[1];
1388 if (state->ptr < state->beginning)
1389 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001390 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001391 RETURN_ON_FAILURE(ret);
1392 ctx->pattern += ctx->pattern[0];
1393 break;
1394
1395 case SRE_OP_ASSERT_NOT:
1396 /* assert not subpattern */
1397 /* <ASSERT_NOT> <skip> <back> <pattern> */
1398 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1399 ctx->ptr, ctx->pattern[1]));
1400 state->ptr = ctx->ptr - ctx->pattern[1];
1401 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001402 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001403 if (ret) {
1404 RETURN_ON_ERROR(ret);
1405 RETURN_FAILURE;
1406 }
1407 }
1408 ctx->pattern += ctx->pattern[0];
1409 break;
1410
1411 case SRE_OP_FAILURE:
1412 /* immediate failure */
1413 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1414 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001415
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001416 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001417 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1418 ctx->pattern[-1]));
1419 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001420 }
1421 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001422
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001423exit:
1424 ctx_pos = ctx->last_ctx_pos;
1425 jump = ctx->jump;
1426 DATA_POP_DISCARD(ctx);
1427 if (ctx_pos == -1)
1428 return ret;
1429 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1430
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001431 switch (jump) {
1432 case JUMP_MAX_UNTIL_2:
1433 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1434 goto jump_max_until_2;
1435 case JUMP_MAX_UNTIL_3:
1436 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1437 goto jump_max_until_3;
1438 case JUMP_MIN_UNTIL_2:
1439 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1440 goto jump_min_until_2;
1441 case JUMP_MIN_UNTIL_3:
1442 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1443 goto jump_min_until_3;
1444 case JUMP_BRANCH:
1445 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1446 goto jump_branch;
1447 case JUMP_MAX_UNTIL_1:
1448 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1449 goto jump_max_until_1;
1450 case JUMP_MIN_UNTIL_1:
1451 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1452 goto jump_min_until_1;
1453 case JUMP_REPEAT:
1454 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1455 goto jump_repeat;
1456 case JUMP_REPEAT_ONE_1:
1457 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1458 goto jump_repeat_one_1;
1459 case JUMP_REPEAT_ONE_2:
1460 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1461 goto jump_repeat_one_2;
1462 case JUMP_MIN_REPEAT_ONE:
1463 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1464 goto jump_min_repeat_one;
1465 case JUMP_ASSERT:
1466 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1467 goto jump_assert;
1468 case JUMP_ASSERT_NOT:
1469 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1470 goto jump_assert_not;
1471 case JUMP_NONE:
1472 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1473 break;
1474 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001475
1476 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001477}
1478
/* Scan forward from state->start for the first position at which `pattern`
   matches.  For every candidate position state->start and state->ptr are
   set and SRE_MATCH is invoked; patterns flagged SRE_INFO_LITERAL return 1
   as soon as the literal text is located, without calling SRE_MATCH.
   Returns 0 when no candidate position produced a non-zero status,
   otherwise that first non-zero status. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001479LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001480SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1481{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001482 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1483 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001484 Py_ssize_t status = 0;
1485 Py_ssize_t prefix_len = 0;
1486 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001487 SRE_CODE* prefix = NULL;
1488 SRE_CODE* charset = NULL;
1489 SRE_CODE* overlap = NULL;
1490 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001491
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001492 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001493 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001494 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001495
1496 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001497
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001498 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001499 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001500 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001501 end -= pattern[3]-1;
1502 if (end <= ptr)
1503 end = ptr+1;
1504 }
1505
Fredrik Lundh3562f112000-07-02 12:00:07 +00001506 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001507 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001508 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001509 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001510 prefix_skip = pattern[6];
1511 prefix = pattern + 7;
/* overlap is based one slot early so that it can be indexed with
   the 1-based count of prefix code units matched so far */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001512 overlap = prefix + prefix_len - 1;
1513 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001514 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001515 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001516 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001517
/* step over the whole INFO block; pattern[1] is its skip count */
1518 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001519 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001520
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001521 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1522 TRACE(("charset = %p\n", charset));
1523
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001524#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001525 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001526 /* pattern starts with a known prefix. use the overlap
1527 table to skip forward as fast as we possibly can */
/* i is the number of prefix code units matched so far */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001528 Py_ssize_t i = 0;
/* NOTE(review): this resets `end` to state->end, discarding the
   minimum-length adjustment made in the INFO block above */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001529 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001530 while (ptr < end) {
1531 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001532 if ((SRE_CODE) ptr[0] != prefix[i]) {
/* mismatch: give up on this character if nothing was matched,
   otherwise fall back to a shorter partial match and retry */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001533 if (!i)
1534 break;
1535 else
1536 i = overlap[i];
1537 } else {
1538 if (++i == prefix_len) {
1539 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001540 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
/* ptr is on the last prefix character: the match starts
   prefix_len-1 characters back, and matching resumes
   prefix_skip characters after that start */
1541 state->start = ptr + 1 - prefix_len;
1542 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001543 if (flags & SRE_INFO_LITERAL)
1544 return 1; /* we got all of it */
/* presumably each skipped prefix item occupies two code words
   (opcode + character), hence 2*prefix_skip -- TODO confirm
   against the pattern compiler */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001545 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001546 if (status != 0)
1547 return status;
1548 /* close but no cigar -- try again */
1549 i = overlap[i];
1550 }
1551 break;
1552 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001553 }
1554 ptr++;
1555 }
1556 return 0;
1557 }
1558#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001559
Fredrik Lundh3562f112000-07-02 12:00:07 +00001560 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001561 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001562 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 SRE_CODE chr = pattern[1];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001564 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001565 for (;;) {
/* advance to the next occurrence of the literal */
1566 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1567 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001568 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001569 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001570 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
/* the literal itself is already verified, so matching resumes one
   character later and two code words into the pattern */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001571 state->start = ptr;
1572 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001573 if (flags & SRE_INFO_LITERAL)
1574 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001575 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 if (status != 0)
1577 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001578 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001579 } else if (charset) {
1580 /* pattern starts with a character from a known set */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001581 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001582 for (;;) {
/* advance to the next character that belongs to the set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001583 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001585 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001587 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
/* unlike the literal case, the full pattern is matched from the
   candidate character itself */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 state->start = ptr;
1589 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001590 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 if (status != 0)
1592 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001593 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001594 }
1595 } else
1596 /* general case */
/* note the <=: the position equal to `end` is tried as well; here
   `end` still carries any adjustment made in the INFO block */
1597 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001598 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001600 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001601 if (status != 0)
1602 break;
1603 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001604
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001605 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001606}
Tim Peters3d563502006-01-21 02:47:53 +00001607
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001608LOCAL(int)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001609SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001610{
1611 /* check if given string is a literal template (i.e. no escapes) */
1612 while (len-- > 0)
1613 if (*ptr++ == '\\')
1614 return 0;
1615 return 1;
1616}
Guido van Rossumb700df92000-03-31 14:59:30 +00001617
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001618#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001619
1620/* -------------------------------------------------------------------- */
1621/* factories and destructors */
1622
1623/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001624static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1625static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001626
1627static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001628sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001629{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001630 return Py_BuildValue("l", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001631}
1632
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001633static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001634sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001635{
1636 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001637 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001638 return NULL;
1639 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001640 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001641 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001642#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001643 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001644#else
1645 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001646#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001647 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001648}
1649
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001650LOCAL(void)
1651state_reset(SRE_STATE* state)
1652{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001653 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001654 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001656 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001657 state->lastindex = -1;
1658
1659 state->repeat = NULL;
1660
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001661 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001662}
1663
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001664static void*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001665getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001666{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001667 /* given a python object, return a data pointer, a length (in
1668 characters), and a character size. return NULL if the object
1669 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001670
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001671 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001672 Py_ssize_t size, bytes;
1673 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001674 void* ptr;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001675 PyBuffer view;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001676
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001677 /* get pointer to string buffer */
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001678 view.len = -1;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001679 buffer = Py_Type(string)->tp_as_buffer;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001680 if (!buffer || !buffer->bf_getbuffer ||
1681 (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
1682 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1683 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001684 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001685
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001686 /* determine buffer size */
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001687 bytes = view.len;
1688 ptr = view.buf;
1689
1690 /* Release the buffer immediately --- possibly dangerous
1691 but doing something else would require some re-factoring
1692 */
1693 PyObject_ReleaseBuffer(string, &view);
1694
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001695 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001696 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1697 return NULL;
1698 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001699
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001700 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001701 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001702
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001703 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001704 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001705#if defined(HAVE_UNICODE)
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001706 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001707 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001708#endif
1709 else {
1710 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1711 return NULL;
1712 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001713
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001714 *p_length = size;
1715 *p_charsize = charsize;
1716
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001717 if (ptr == NULL) {
1718 PyErr_SetString(PyExc_ValueError,
1719 "Buffer is NULL");
1720 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001721 return ptr;
1722}
1723
1724LOCAL(PyObject*)
1725state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001726 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001727{
1728 /* prepare state object */
1729
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001730 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001731 int charsize;
1732 void* ptr;
1733
1734 memset(state, 0, sizeof(SRE_STATE));
1735
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001736 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001737 state->lastindex = -1;
1738
1739 ptr = getstring(string, &length, &charsize);
1740 if (!ptr)
1741 return NULL;
1742
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001743 /* adjust boundaries */
1744 if (start < 0)
1745 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001746 else if (start > length)
1747 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001748
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001749 if (end < 0)
1750 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001751 else if (end > length)
1752 end = length;
1753
1754 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001755
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001756 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001757
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001758 state->start = (void*) ((char*) ptr + start * state->charsize);
1759 state->end = (void*) ((char*) ptr + end * state->charsize);
1760
1761 Py_INCREF(string);
1762 state->string = string;
1763 state->pos = start;
1764 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001765
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001766 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001767 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001768 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001769#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001770 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001771#else
1772 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001773#endif
1774 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001775 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001776
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001777 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001778}
1779
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001780LOCAL(void)
1781state_fini(SRE_STATE* state)
1782{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001783 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001784 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001785}
1786
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001787/* convert a pointer into the subject buffer (`member`) into a character
   index: the byte distance from state->beginning divided by the character
   width state->charsize */
1788#define STATE_OFFSET(state, member)\
1789 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1790
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001791LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001792state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001793{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001794 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001795
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001796 index = (index - 1) * 2;
1797
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001798 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001799 if (empty)
1800 /* want empty string */
1801 i = j = 0;
1802 else {
1803 Py_INCREF(Py_None);
1804 return Py_None;
1805 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001806 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001807 i = STATE_OFFSET(state, state->mark[index]);
1808 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001809 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001810
Fredrik Lundh58100642000-08-09 09:14:35 +00001811 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001812}
1813
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001814static void
1815pattern_error(int status)
1816{
1817 switch (status) {
1818 case SRE_ERROR_RECURSION_LIMIT:
1819 PyErr_SetString(
1820 PyExc_RuntimeError,
1821 "maximum recursion limit exceeded"
1822 );
1823 break;
1824 case SRE_ERROR_MEMORY:
1825 PyErr_NoMemory();
1826 break;
1827 default:
1828 /* other error codes indicate compiler/engine bugs */
1829 PyErr_SetString(
1830 PyExc_RuntimeError,
1831 "internal error in regular expression engine"
1832 );
1833 }
1834}
1835
Guido van Rossumb700df92000-03-31 14:59:30 +00001836static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001837pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001838{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001839 if (self->weakreflist != NULL)
1840 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001841 Py_XDECREF(self->pattern);
1842 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001843 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001844 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001845}
1846
1847static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001848pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001849{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001850 SRE_STATE state;
1851 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001852
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001853 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001854 Py_ssize_t start = 0;
1855 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001856 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001857 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001858 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001859 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001860
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001861 string = state_init(&state, self, string, start, end);
1862 if (!string)
1863 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001864
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001865 state.ptr = state.start;
1866
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001867 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1868
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001869 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001870 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001871 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001872#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001873 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001874#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001875 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001876
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001877 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001878 if (PyErr_Occurred())
1879 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001880
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001881 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001882
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001883 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001884}
1885
1886static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001887pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001888{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001889 SRE_STATE state;
1890 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001891
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001892 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001893 Py_ssize_t start = 0;
1894 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001895 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001896 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001897 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001898 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001899
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001900 string = state_init(&state, self, string, start, end);
1901 if (!string)
1902 return NULL;
1903
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001904 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1905
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001906 if (state.charsize == 1) {
1907 status = sre_search(&state, PatternObject_GetCode(self));
1908 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001909#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001910 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001911#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001912 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001913
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001914 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1915
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001917
Thomas Wouters89f507f2006-12-13 04:49:30 +00001918 if (PyErr_Occurred())
1919 return NULL;
1920
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001921 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001922}
1923
1924static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001925call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001926{
1927 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001928 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001929 PyObject* func;
1930 PyObject* result;
1931
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001932 if (!args)
1933 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001934 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001935 if (!name)
1936 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001937 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001938 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001939 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001940 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001941 func = PyObject_GetAttrString(mod, function);
1942 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001943 if (!func)
1944 return NULL;
1945 result = PyObject_CallObject(func, args);
1946 Py_DECREF(func);
1947 Py_DECREF(args);
1948 return result;
1949}
1950
#ifdef USE_BUILTIN_COPY
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* Replace *object by the result of copy.deepcopy(*object, memo).

       Returns 1 on success and 0 on failure (exception set, *object
       left untouched).  An empty (NULL) slot is left as it is and
       counts as success: optional pattern fields such as groupindex
       and indexgroup may be NULL, and neither PyTuple_Pack nor
       Py_DECREF accepts a NULL object. */

    PyObject* copy;

    if (!*object)
        return 1; /* nothing to copy */

    copy = call(
        "copy", "deepcopy",
        PyTuple_Pack(2, *object, memo)
        );
    if (!copy)
        return 0;

    /* drop the shared reference and install the private copy */
    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
1970
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001971static PyObject*
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00001972join_list(PyObject* list, PyObject* pattern)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001973{
1974 /* join list elements */
1975
1976 PyObject* joiner;
1977#if PY_VERSION_HEX >= 0x01060000
1978 PyObject* function;
1979 PyObject* args;
1980#endif
1981 PyObject* result;
1982
1983 switch (PyList_GET_SIZE(list)) {
1984 case 0:
1985 Py_DECREF(list);
Fredrik Lundh09705f02002-11-22 12:46:35 +00001986 return PySequence_GetSlice(pattern, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001987 case 1:
1988 result = PyList_GET_ITEM(list, 0);
1989 Py_INCREF(result);
1990 Py_DECREF(list);
1991 return result;
1992 }
1993
1994 /* two or more elements: slice out a suitable separator from the
1995 first member, and use that to join the entire list */
1996
1997 joiner = PySequence_GetSlice(pattern, 0, 0);
1998 if (!joiner)
1999 return NULL;
2000
2001#if PY_VERSION_HEX >= 0x01060000
2002 function = PyObject_GetAttrString(joiner, "join");
2003 if (!function) {
2004 Py_DECREF(joiner);
2005 return NULL;
2006 }
2007 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002008 if (!args) {
2009 Py_DECREF(function);
2010 Py_DECREF(joiner);
2011 return NULL;
2012 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002013 PyTuple_SET_ITEM(args, 0, list);
2014 result = PyObject_CallObject(function, args);
2015 Py_DECREF(args); /* also removes list */
2016 Py_DECREF(function);
2017#else
2018 result = call(
2019 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002020 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002021 );
2022#endif
2023 Py_DECREF(joiner);
2024
2025 return result;
2026}
2027
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002028static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002029pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002030{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002031 SRE_STATE state;
2032 PyObject* list;
2033 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002034 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002035
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002036 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002037 Py_ssize_t start = 0;
2038 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002039 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002040 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002041 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002042 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002043
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002044 string = state_init(&state, self, string, start, end);
2045 if (!string)
2046 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002047
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002048 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002049 if (!list) {
2050 state_fini(&state);
2051 return NULL;
2052 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002053
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002054 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002055
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002056 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002057
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002058 state_reset(&state);
2059
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 state.ptr = state.start;
2061
2062 if (state.charsize == 1) {
2063 status = sre_search(&state, PatternObject_GetCode(self));
2064 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002065#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002066 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002067#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002069
Thomas Wouters89f507f2006-12-13 04:49:30 +00002070 if (PyErr_Occurred())
2071 goto error;
2072
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002073 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002074 if (status == 0)
2075 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002076 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002077 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002078 }
Tim Peters3d563502006-01-21 02:47:53 +00002079
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002080 /* don't bother to build a match object */
2081 switch (self->groups) {
2082 case 0:
2083 b = STATE_OFFSET(&state, state.start);
2084 e = STATE_OFFSET(&state, state.ptr);
2085 item = PySequence_GetSlice(string, b, e);
2086 if (!item)
2087 goto error;
2088 break;
2089 case 1:
2090 item = state_getslice(&state, 1, string, 1);
2091 if (!item)
2092 goto error;
2093 break;
2094 default:
2095 item = PyTuple_New(self->groups);
2096 if (!item)
2097 goto error;
2098 for (i = 0; i < self->groups; i++) {
2099 PyObject* o = state_getslice(&state, i+1, string, 1);
2100 if (!o) {
2101 Py_DECREF(item);
2102 goto error;
2103 }
2104 PyTuple_SET_ITEM(item, i, o);
2105 }
2106 break;
2107 }
2108
2109 status = PyList_Append(list, item);
2110 Py_DECREF(item);
2111 if (status < 0)
2112 goto error;
2113
2114 if (state.ptr == state.start)
2115 state.start = (void*) ((char*) state.ptr + state.charsize);
2116 else
2117 state.start = state.ptr;
2118
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002119 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002120
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002121 state_fini(&state);
2122 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002123
2124error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002125 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002126 state_fini(&state);
2127 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002128
Guido van Rossumb700df92000-03-31 14:59:30 +00002129}
2130
#if PY_VERSION_HEX >= 0x02020000
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args)
{
    /* Build a scanner for the arguments and wrap its bound "search"
       method in a callable-iterator that uses None as the sentinel
       value.  Returns the iterator, or NULL on failure. */

    PyObject* scanner;
    PyObject* search_method;
    PyObject* result;

    scanner = pattern_scanner(pattern, args);
    if (!scanner)
        return NULL;

    /* our scanner reference is dropped right after the lookup */
    search_method = PyObject_GetAttrString(scanner, "search");
    Py_DECREF(scanner);
    if (!search_method)
        return NULL;

    result = PyCallIter_New(search_method, Py_None);
    Py_DECREF(search_method);

    return result;
}
#endif
2154
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002155static PyObject*
2156pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2157{
2158 SRE_STATE state;
2159 PyObject* list;
2160 PyObject* item;
2161 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002162 Py_ssize_t n;
2163 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002164 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002165
2166 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002167 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002168 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002169 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002170 &string, &maxsplit))
2171 return NULL;
2172
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002173 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002174 if (!string)
2175 return NULL;
2176
2177 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002178 if (!list) {
2179 state_fini(&state);
2180 return NULL;
2181 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002182
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002183 n = 0;
2184 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002185
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002186 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002187
2188 state_reset(&state);
2189
2190 state.ptr = state.start;
2191
2192 if (state.charsize == 1) {
2193 status = sre_search(&state, PatternObject_GetCode(self));
2194 } else {
2195#if defined(HAVE_UNICODE)
2196 status = sre_usearch(&state, PatternObject_GetCode(self));
2197#endif
2198 }
2199
Thomas Wouters89f507f2006-12-13 04:49:30 +00002200 if (PyErr_Occurred())
2201 goto error;
2202
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002203 if (status <= 0) {
2204 if (status == 0)
2205 break;
2206 pattern_error(status);
2207 goto error;
2208 }
Tim Peters3d563502006-01-21 02:47:53 +00002209
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002210 if (state.start == state.ptr) {
2211 if (last == state.end)
2212 break;
2213 /* skip one character */
2214 state.start = (void*) ((char*) state.ptr + state.charsize);
2215 continue;
2216 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002217
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002218 /* get segment before this match */
2219 item = PySequence_GetSlice(
2220 string, STATE_OFFSET(&state, last),
2221 STATE_OFFSET(&state, state.start)
2222 );
2223 if (!item)
2224 goto error;
2225 status = PyList_Append(list, item);
2226 Py_DECREF(item);
2227 if (status < 0)
2228 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002229
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002230 /* add groups (if any) */
2231 for (i = 0; i < self->groups; i++) {
2232 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002233 if (!item)
2234 goto error;
2235 status = PyList_Append(list, item);
2236 Py_DECREF(item);
2237 if (status < 0)
2238 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002239 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002240
2241 n = n + 1;
2242
2243 last = state.start = state.ptr;
2244
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002245 }
2246
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002247 /* get segment following last match (even if empty) */
2248 item = PySequence_GetSlice(
2249 string, STATE_OFFSET(&state, last), state.endpos
2250 );
2251 if (!item)
2252 goto error;
2253 status = PyList_Append(list, item);
2254 Py_DECREF(item);
2255 if (status < 0)
2256 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002257
2258 state_fini(&state);
2259 return list;
2260
2261error:
2262 Py_DECREF(list);
2263 state_fini(&state);
2264 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002265
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002266}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002267
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002268static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002269pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002270 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002271{
2272 SRE_STATE state;
2273 PyObject* list;
2274 PyObject* item;
2275 PyObject* filter;
2276 PyObject* args;
2277 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002278 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002279 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002280 Py_ssize_t n;
2281 Py_ssize_t i, b, e;
2282 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002283 int filter_is_callable;
2284
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002285 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002286 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002287 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002288 Py_INCREF(filter);
2289 filter_is_callable = 1;
2290 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002291 /* if not callable, check if it's a literal string */
2292 int literal;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002293 ptr = getstring(ptemplate, &n, &bint);
2294 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002295 if (ptr) {
2296 if (b == 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002297 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002298 } else {
2299#if defined(HAVE_UNICODE)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002300 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002301#endif
2302 }
2303 } else {
2304 PyErr_Clear();
2305 literal = 0;
2306 }
2307 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002308 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002309 Py_INCREF(filter);
2310 filter_is_callable = 0;
2311 } else {
2312 /* not a literal; hand it over to the template compiler */
2313 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002314 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002315 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002316 );
2317 if (!filter)
2318 return NULL;
2319 filter_is_callable = PyCallable_Check(filter);
2320 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002321 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002322
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002323 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002324 if (!string) {
2325 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002326 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002327 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002328
2329 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002330 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002331 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002332 state_fini(&state);
2333 return NULL;
2334 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002335
2336 n = i = 0;
2337
2338 while (!count || n < count) {
2339
2340 state_reset(&state);
2341
2342 state.ptr = state.start;
2343
2344 if (state.charsize == 1) {
2345 status = sre_search(&state, PatternObject_GetCode(self));
2346 } else {
2347#if defined(HAVE_UNICODE)
2348 status = sre_usearch(&state, PatternObject_GetCode(self));
2349#endif
2350 }
2351
Thomas Wouters89f507f2006-12-13 04:49:30 +00002352 if (PyErr_Occurred())
2353 goto error;
2354
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002355 if (status <= 0) {
2356 if (status == 0)
2357 break;
2358 pattern_error(status);
2359 goto error;
2360 }
Tim Peters3d563502006-01-21 02:47:53 +00002361
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002362 b = STATE_OFFSET(&state, state.start);
2363 e = STATE_OFFSET(&state, state.ptr);
2364
2365 if (i < b) {
2366 /* get segment before this match */
2367 item = PySequence_GetSlice(string, i, b);
2368 if (!item)
2369 goto error;
2370 status = PyList_Append(list, item);
2371 Py_DECREF(item);
2372 if (status < 0)
2373 goto error;
2374
2375 } else if (i == b && i == e && n > 0)
2376 /* ignore empty match on latest position */
2377 goto next;
2378
2379 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002380 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002381 match = pattern_new_match(self, &state, 1);
2382 if (!match)
2383 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002384 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002385 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002386 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002387 goto error;
2388 }
2389 item = PyObject_CallObject(filter, args);
2390 Py_DECREF(args);
2391 Py_DECREF(match);
2392 if (!item)
2393 goto error;
2394 } else {
2395 /* filter is literal string */
2396 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002397 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002398 }
2399
2400 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002401 if (item != Py_None) {
2402 status = PyList_Append(list, item);
2403 Py_DECREF(item);
2404 if (status < 0)
2405 goto error;
2406 }
Tim Peters3d563502006-01-21 02:47:53 +00002407
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002408 i = e;
2409 n = n + 1;
2410
2411next:
2412 /* move on */
2413 if (state.ptr == state.start)
2414 state.start = (void*) ((char*) state.ptr + state.charsize);
2415 else
2416 state.start = state.ptr;
2417
2418 }
2419
2420 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002421 if (i < state.endpos) {
2422 item = PySequence_GetSlice(string, i, state.endpos);
2423 if (!item)
2424 goto error;
2425 status = PyList_Append(list, item);
2426 Py_DECREF(item);
2427 if (status < 0)
2428 goto error;
2429 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002430
2431 state_fini(&state);
2432
Guido van Rossum4e173842001-12-07 04:25:10 +00002433 Py_DECREF(filter);
2434
Fredrik Lundhdac58492001-10-21 21:48:30 +00002435 /* convert list to single string (also removes list) */
Michael W. Hudsonb6a45052002-07-31 09:54:24 +00002436 item = join_list(list, self->pattern);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002437
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002438 if (!item)
2439 return NULL;
2440
2441 if (subn)
2442 return Py_BuildValue("Ni", item, n);
2443
2444 return item;
2445
2446error:
2447 Py_DECREF(list);
2448 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002449 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002450 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002451
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002452}
2453
2454static PyObject*
2455pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2456{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002457 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002458 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002459 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002460 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002461 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002462 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002463 return NULL;
2464
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002465 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466}
2467
2468static PyObject*
2469pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2470{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002471 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002472 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002473 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002474 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002475 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002476 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002477 return NULL;
2478
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002479 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002480}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002481
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002482static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002483pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002484{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002485#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002486 PatternObject* copy;
2487 int offset;
2488
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002489 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2490 if (!copy)
2491 return NULL;
2492
2493 offset = offsetof(PatternObject, groups);
2494
2495 Py_XINCREF(self->groupindex);
2496 Py_XINCREF(self->indexgroup);
2497 Py_XINCREF(self->pattern);
2498
2499 memcpy((char*) copy + offset, (char*) self + offset,
2500 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002501 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002502
2503 return (PyObject*) copy;
2504#else
2505 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2506 return NULL;
2507#endif
2508}
2509
2510static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002511pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002512{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002513#ifdef USE_BUILTIN_COPY
2514 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002515
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002516 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002517 if (!copy)
2518 return NULL;
2519
2520 if (!deepcopy(&copy->groupindex, memo) ||
2521 !deepcopy(&copy->indexgroup, memo) ||
2522 !deepcopy(&copy->pattern, memo)) {
2523 Py_DECREF(copy);
2524 return NULL;
2525 }
2526
2527#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002528 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2529 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002530#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002531}
2532
/* Docstrings for the pattern object: one per documented method
   (referenced from the pattern_methods table) plus pattern_doc, which
   is used as the tp_doc entry of Pattern_Type. */
Raymond Hettinger94478742004-09-24 04:31:19 +00002533PyDoc_STRVAR(pattern_match_doc,
2534"match(string[, pos[, endpos]]) --> match object or None.\n\
2535 Matches zero or more characters at the beginning of the string");
2536
2537PyDoc_STRVAR(pattern_search_doc,
2538"search(string[, pos[, endpos]]) --> match object or None.\n\
2539 Scan through string looking for a match, and return a corresponding\n\
2540 MatchObject instance. Return None if no position in the string matches.");
2541
2542PyDoc_STRVAR(pattern_split_doc,
2543"split(string[, maxsplit = 0]) --> list.\n\
2544 Split string by the occurrences of pattern.");
2545
2546PyDoc_STRVAR(pattern_findall_doc,
2547"findall(string[, pos[, endpos]]) --> list.\n\
2548 Return a list of all non-overlapping matches of pattern in string.");
2549
2550PyDoc_STRVAR(pattern_finditer_doc,
2551"finditer(string[, pos[, endpos]]) --> iterator.\n\
2552 Return an iterator over all non-overlapping matches for the \n\
2553 RE pattern in string. For each match, the iterator returns a\n\
2554 match object.");
2555
2556PyDoc_STRVAR(pattern_sub_doc,
2557"sub(repl, string[, count = 0]) --> newstring\n\
2558 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002559 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002560
2561PyDoc_STRVAR(pattern_subn_doc,
2562"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2563 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2564 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002565 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002566
2567PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2568
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002569static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002570 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002571 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002572 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002573 pattern_search_doc},
2574 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2575 pattern_sub_doc},
2576 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2577 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002578 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002579 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002580 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002581 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002582#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002583 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2584 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002585#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002586 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002587 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2588 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002589 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002590};
2591
Tim Peters3d563502006-01-21 02:47:53 +00002592static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002593pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002594{
2595 PyObject* res;
2596
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002597 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002598
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002599 if (res)
2600 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002601
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002602 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002603
2604 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002605 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002606 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002607 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002608 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002609
2610 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002611 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002612
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002613 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002614 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002615
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002616 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002617 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002618 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002619 }
2620
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002621 PyErr_SetString(PyExc_AttributeError, name);
2622 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002623}
2624
Neal Norwitz57c179c2006-03-22 07:18:02 +00002625static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002626 PyVarObject_HEAD_INIT(NULL, 0)
2627 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002628 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002629 (destructor)pattern_dealloc, /*tp_dealloc*/
2630 0, /*tp_print*/
Raymond Hettinger027bb632004-05-31 03:09:25 +00002631 (getattrfunc)pattern_getattr, /*tp_getattr*/
2632 0, /* tp_setattr */
2633 0, /* tp_compare */
2634 0, /* tp_repr */
2635 0, /* tp_as_number */
2636 0, /* tp_as_sequence */
2637 0, /* tp_as_mapping */
2638 0, /* tp_hash */
2639 0, /* tp_call */
2640 0, /* tp_str */
2641 0, /* tp_getattro */
2642 0, /* tp_setattro */
2643 0, /* tp_as_buffer */
Guido van Rossum3cf5b1e2006-07-27 21:53:35 +00002644 Py_TPFLAGS_DEFAULT, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002645 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002646 0, /* tp_traverse */
2647 0, /* tp_clear */
2648 0, /* tp_richcompare */
2649 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002650};
2651
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002652static PyObject *
2653_compile(PyObject* self_, PyObject* args)
2654{
2655 /* "compile" pattern descriptor to pattern object */
2656
2657 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002658 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002659
2660 PyObject* pattern;
2661 int flags = 0;
2662 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002663 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002664 PyObject* groupindex = NULL;
2665 PyObject* indexgroup = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002666 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002667 &PyList_Type, &code, &groups,
2668 &groupindex, &indexgroup))
2669 return NULL;
2670
2671 n = PyList_GET_SIZE(code);
2672
2673 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2674 if (!self)
2675 return NULL;
2676
2677 self->codesize = n;
2678
2679 for (i = 0; i < n; i++) {
2680 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002681 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002682 self->code[i] = (SRE_CODE) value;
2683 if ((unsigned long) self->code[i] != value) {
2684 PyErr_SetString(PyExc_OverflowError,
2685 "regular expression code size limit exceeded");
2686 break;
2687 }
2688 }
2689
2690 if (PyErr_Occurred()) {
2691 PyObject_DEL(self);
2692 return NULL;
2693 }
2694
2695 Py_INCREF(pattern);
2696 self->pattern = pattern;
2697
2698 self->flags = flags;
2699
2700 self->groups = groups;
2701
2702 Py_XINCREF(groupindex);
2703 self->groupindex = groupindex;
2704
2705 Py_XINCREF(indexgroup);
2706 self->indexgroup = indexgroup;
2707
2708 self->weakreflist = NULL;
2709
2710 return (PyObject*) self;
2711}
2712
Guido van Rossumb700df92000-03-31 14:59:30 +00002713/* -------------------------------------------------------------------- */
2714/* match methods */
2715
2716static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002717match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002718{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002719 Py_XDECREF(self->regs);
2720 Py_XDECREF(self->string);
2721 Py_DECREF(self->pattern);
2722 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002723}
2724
2725static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002726match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002727{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002728 if (index < 0 || index >= self->groups) {
2729 /* raise IndexError if we were given a bad group number */
2730 PyErr_SetString(
2731 PyExc_IndexError,
2732 "no such group"
2733 );
2734 return NULL;
2735 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002736
Fredrik Lundh6f013982000-07-03 18:44:21 +00002737 index *= 2;
2738
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002739 if (self->string == Py_None || self->mark[index] < 0) {
2740 /* return default value if the string or group is undefined */
2741 Py_INCREF(def);
2742 return def;
2743 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002744
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002745 return PySequence_GetSlice(
2746 self->string, self->mark[index], self->mark[index+1]
2747 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002748}
2749
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002750static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002751match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002752{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002753 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002754
Guido van Rossumddefaf32007-01-14 03:31:43 +00002755 if (index == NULL)
2756 /* Default value */
2757 return 0;
2758
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002759 if (PyInt_Check(index))
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002760 return PyInt_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002761
Fredrik Lundh6f013982000-07-03 18:44:21 +00002762 i = -1;
2763
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002764 if (self->pattern->groupindex) {
2765 index = PyObject_GetItem(self->pattern->groupindex, index);
2766 if (index) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002767 if (PyInt_Check(index) || PyLong_Check(index))
2768 i = PyInt_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00002769 Py_DECREF(index);
2770 } else
2771 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002772 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002773
2774 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002775}
2776
2777static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002778match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002779{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002780 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002781}
2782
2783static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002784match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002785{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002786 /* delegate to Python code */
2787 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002788 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002789 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002790 );
2791}
2792
2793static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002794match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002795{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002796 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002797 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002798
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002799 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002800
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002801 switch (size) {
2802 case 0:
2803 result = match_getslice(self, Py_False, Py_None);
2804 break;
2805 case 1:
2806 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2807 break;
2808 default:
2809 /* fetch multiple items */
2810 result = PyTuple_New(size);
2811 if (!result)
2812 return NULL;
2813 for (i = 0; i < size; i++) {
2814 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002815 self, PyTuple_GET_ITEM(args, i), Py_None
2816 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002817 if (!item) {
2818 Py_DECREF(result);
2819 return NULL;
2820 }
2821 PyTuple_SET_ITEM(result, i, item);
2822 }
2823 break;
2824 }
2825 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002826}
2827
2828static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002829match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002830{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002831 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002832 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002833
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002834 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002835 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002836 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002837 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002838
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002839 result = PyTuple_New(self->groups-1);
2840 if (!result)
2841 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002842
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002843 for (index = 1; index < self->groups; index++) {
2844 PyObject* item;
2845 item = match_getslice_by_index(self, index, def);
2846 if (!item) {
2847 Py_DECREF(result);
2848 return NULL;
2849 }
2850 PyTuple_SET_ITEM(result, index-1, item);
2851 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002852
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002853 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002854}
2855
2856static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002857match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002858{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002859 PyObject* result;
2860 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002861 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002862
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002863 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002864 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002865 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002866 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002867
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002868 result = PyDict_New();
2869 if (!result || !self->pattern->groupindex)
2870 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002871
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002872 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002873 if (!keys)
2874 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002875
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002876 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002877 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002878 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002879 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002880 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002881 if (!key)
2882 goto failed;
2883 value = match_getslice(self, key, def);
2884 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002885 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002886 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002887 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002888 status = PyDict_SetItem(result, key, value);
2889 Py_DECREF(value);
2890 if (status < 0)
2891 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002892 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002893
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002894 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002895
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002896 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002897
2898failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00002899 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002900 Py_DECREF(result);
2901 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002902}
2903
2904static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002905match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002906{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002907 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002908
Guido van Rossumddefaf32007-01-14 03:31:43 +00002909 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002910 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002911 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002912
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002913 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002914
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002915 if (index < 0 || index >= self->groups) {
2916 PyErr_SetString(
2917 PyExc_IndexError,
2918 "no such group"
2919 );
2920 return NULL;
2921 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002922
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002923 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002924 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002925}
2926
2927static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002928match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002929{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002930 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002931
Guido van Rossumddefaf32007-01-14 03:31:43 +00002932 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002933 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002934 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002935
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002936 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002937
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002938 if (index < 0 || index >= self->groups) {
2939 PyErr_SetString(
2940 PyExc_IndexError,
2941 "no such group"
2942 );
2943 return NULL;
2944 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002945
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002946 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002947 return Py_BuildValue("i", self->mark[index*2+1]);
2948}
2949
2950LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002951_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002952{
2953 PyObject* pair;
2954 PyObject* item;
2955
2956 pair = PyTuple_New(2);
2957 if (!pair)
2958 return NULL;
2959
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002960 item = PyInt_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002961 if (!item)
2962 goto error;
2963 PyTuple_SET_ITEM(pair, 0, item);
2964
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002965 item = PyInt_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002966 if (!item)
2967 goto error;
2968 PyTuple_SET_ITEM(pair, 1, item);
2969
2970 return pair;
2971
2972 error:
2973 Py_DECREF(pair);
2974 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002975}
2976
2977static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002978match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002979{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002980 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002981
Guido van Rossumddefaf32007-01-14 03:31:43 +00002982 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002983 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002984 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002985
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002986 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002987
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002988 if (index < 0 || index >= self->groups) {
2989 PyErr_SetString(
2990 PyExc_IndexError,
2991 "no such group"
2992 );
2993 return NULL;
2994 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002995
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002996 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002997 return _pair(self->mark[index*2], self->mark[index*2+1]);
2998}
2999
3000static PyObject*
3001match_regs(MatchObject* self)
3002{
3003 PyObject* regs;
3004 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003005 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003006
3007 regs = PyTuple_New(self->groups);
3008 if (!regs)
3009 return NULL;
3010
3011 for (index = 0; index < self->groups; index++) {
3012 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3013 if (!item) {
3014 Py_DECREF(regs);
3015 return NULL;
3016 }
3017 PyTuple_SET_ITEM(regs, index, item);
3018 }
3019
3020 Py_INCREF(regs);
3021 self->regs = regs;
3022
3023 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003024}
3025
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003026static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003027match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003028{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003029#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003030 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003031 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003032
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003033 slots = 2 * (self->pattern->groups+1);
3034
3035 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3036 if (!copy)
3037 return NULL;
3038
3039 /* this value a constant, but any compiler should be able to
3040 figure that out all by itself */
3041 offset = offsetof(MatchObject, string);
3042
3043 Py_XINCREF(self->pattern);
3044 Py_XINCREF(self->string);
3045 Py_XINCREF(self->regs);
3046
3047 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003048 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003049
3050 return (PyObject*) copy;
3051#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003052 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003053 return NULL;
3054#endif
3055}
3056
3057static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003058match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003059{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003060#ifdef USE_BUILTIN_COPY
3061 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003062
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003063 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003064 if (!copy)
3065 return NULL;
3066
3067 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3068 !deepcopy(&copy->string, memo) ||
3069 !deepcopy(&copy->regs, memo)) {
3070 Py_DECREF(copy);
3071 return NULL;
3072 }
3073
3074#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003075 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3076 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003077#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003078}
3079
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003080static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003081 {"group", (PyCFunction) match_group, METH_VARARGS},
3082 {"start", (PyCFunction) match_start, METH_VARARGS},
3083 {"end", (PyCFunction) match_end, METH_VARARGS},
3084 {"span", (PyCFunction) match_span, METH_VARARGS},
3085 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3086 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003087 {"expand", (PyCFunction) match_expand, METH_O},
3088 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3089 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003090 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003091};
3092
Tim Peters3d563502006-01-21 02:47:53 +00003093static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003094match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003095{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003096 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003097
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003098 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3099 if (res)
3100 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003101
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003102 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003103
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003104 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003105 if (self->lastindex >= 0)
3106 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003107 Py_INCREF(Py_None);
3108 return Py_None;
3109 }
3110
3111 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003112 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003113 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003114 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003115 );
3116 if (result)
3117 return result;
3118 PyErr_Clear();
3119 }
3120 Py_INCREF(Py_None);
3121 return Py_None;
3122 }
3123
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003124 if (!strcmp(name, "string")) {
3125 if (self->string) {
3126 Py_INCREF(self->string);
3127 return self->string;
3128 } else {
3129 Py_INCREF(Py_None);
3130 return Py_None;
3131 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003132 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003133
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003134 if (!strcmp(name, "regs")) {
3135 if (self->regs) {
3136 Py_INCREF(self->regs);
3137 return self->regs;
3138 } else
3139 return match_regs(self);
3140 }
3141
3142 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003143 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003144 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003145 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003146
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003147 if (!strcmp(name, "pos"))
3148 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003149
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003150 if (!strcmp(name, "endpos"))
3151 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003152
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003153 PyErr_SetString(PyExc_AttributeError, name);
3154 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003155}
3156
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3159
Neal Norwitz57c179c2006-03-22 07:18:02 +00003160static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003161 PyVarObject_HEAD_INIT(NULL,0)
3162 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003163 sizeof(MatchObject), sizeof(Py_ssize_t),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003164 (destructor)match_dealloc, /*tp_dealloc*/
3165 0, /*tp_print*/
3166 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003167};
3168
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003169static PyObject*
3170pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3171{
3172 /* create match object (from state object) */
3173
3174 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003175 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003176 char* base;
3177 int n;
3178
3179 if (status > 0) {
3180
3181 /* create match object (with room for extra group marks) */
3182 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3183 2*(pattern->groups+1));
3184 if (!match)
3185 return NULL;
3186
3187 Py_INCREF(pattern);
3188 match->pattern = pattern;
3189
3190 Py_INCREF(state->string);
3191 match->string = state->string;
3192
3193 match->regs = NULL;
3194 match->groups = pattern->groups+1;
3195
3196 /* fill in group slices */
3197
3198 base = (char*) state->beginning;
3199 n = state->charsize;
3200
3201 match->mark[0] = ((char*) state->start - base) / n;
3202 match->mark[1] = ((char*) state->ptr - base) / n;
3203
3204 for (i = j = 0; i < pattern->groups; i++, j+=2)
3205 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3206 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3207 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3208 } else
3209 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3210
3211 match->pos = state->pos;
3212 match->endpos = state->endpos;
3213
3214 match->lastindex = state->lastindex;
3215
3216 return (PyObject*) match;
3217
3218 } else if (status == 0) {
3219
3220 /* no match */
3221 Py_INCREF(Py_None);
3222 return Py_None;
3223
3224 }
3225
3226 /* internal error */
3227 pattern_error(status);
3228 return NULL;
3229}
3230
3231
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003232/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003233/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003234
3235static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003236scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003237{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003238 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003239 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003240 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003241}
3242
3243static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003244scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003245{
3246 SRE_STATE* state = &self->state;
3247 PyObject* match;
3248 int status;
3249
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003250 state_reset(state);
3251
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003252 state->ptr = state->start;
3253
3254 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003255 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003256 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003257#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003258 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003259#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003260 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003261 if (PyErr_Occurred())
3262 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003263
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003264 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003265 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003266
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003267 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003268 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003269 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003270 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003271
3272 return match;
3273}
3274
3275
3276static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003277scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003278{
3279 SRE_STATE* state = &self->state;
3280 PyObject* match;
3281 int status;
3282
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003283 state_reset(state);
3284
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003285 state->ptr = state->start;
3286
3287 if (state->charsize == 1) {
3288 status = sre_search(state, PatternObject_GetCode(self->pattern));
3289 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003290#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003291 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003292#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003293 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003294 if (PyErr_Occurred())
3295 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003296
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003297 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003298 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003299
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003300 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003301 state->start = (void*) ((char*) state->ptr + state->charsize);
3302 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003303 state->start = state->ptr;
3304
3305 return match;
3306}
3307
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003308static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003309 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3310 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003311 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003312};
3313
Tim Peters3d563502006-01-21 02:47:53 +00003314static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003315scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003316{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003317 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003319 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3320 if (res)
3321 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003322
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003323 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003324
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003325 /* attributes */
3326 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003327 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003328 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003329 }
3330
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003331 PyErr_SetString(PyExc_AttributeError, name);
3332 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003333}
3334
Neal Norwitz57c179c2006-03-22 07:18:02 +00003335static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003336 PyVarObject_HEAD_INIT(NULL, 0)
3337 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003338 sizeof(ScannerObject), 0,
3339 (destructor)scanner_dealloc, /*tp_dealloc*/
3340 0, /*tp_print*/
3341 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003342};
3343
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003344static PyObject*
3345pattern_scanner(PatternObject* pattern, PyObject* args)
3346{
3347 /* create search state object */
3348
3349 ScannerObject* self;
3350
3351 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003352 Py_ssize_t start = 0;
3353 Py_ssize_t end = PY_SSIZE_T_MAX;
3354 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003355 return NULL;
3356
3357 /* create scanner object */
3358 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3359 if (!self)
3360 return NULL;
3361
3362 string = state_init(&self->state, pattern, string, start, end);
3363 if (!string) {
3364 PyObject_DEL(self);
3365 return NULL;
3366 }
3367
3368 Py_INCREF(pattern);
3369 self->pattern = (PyObject*) pattern;
3370
3371 return (PyObject*) self;
3372}
3373
Guido van Rossumb700df92000-03-31 14:59:30 +00003374static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003375 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003376 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003377 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003378 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003379};
3380
Mark Hammond8235ea12002-07-19 06:55:41 +00003381PyMODINIT_FUNC init_sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003382{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003383 PyObject* m;
3384 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003385 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003386
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003387 /* Initialize object types */
3388 if (PyType_Ready(&Pattern_Type) < 0)
3389 return;
3390 if (PyType_Ready(&Match_Type) < 0)
3391 return;
3392 if (PyType_Ready(&Scanner_Type) < 0)
3393 return;
Guido van Rossumb700df92000-03-31 14:59:30 +00003394
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003395 m = Py_InitModule("_" SRE_MODULE, _functions);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003396 if (m == NULL)
3397 return;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003398 d = PyModule_GetDict(m);
3399
Fredrik Lundh21009b92001-09-18 18:47:09 +00003400 x = PyInt_FromLong(SRE_MAGIC);
3401 if (x) {
3402 PyDict_SetItemString(d, "MAGIC", x);
3403 Py_DECREF(x);
3404 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003405
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003406 x = PyInt_FromLong(sizeof(SRE_CODE));
3407 if (x) {
3408 PyDict_SetItemString(d, "CODESIZE", x);
3409 Py_DECREF(x);
3410 }
3411
Neal Norwitzfe537132007-08-26 03:55:15 +00003412 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003413 if (x) {
3414 PyDict_SetItemString(d, "copyright", x);
3415 Py_DECREF(x);
3416 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003417}
3418
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003419#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003420
3421/* vim:ts=4:sw=4:et
3422*/