blob: a0e974a570a7207c8549542c2b7fba39591ecae9 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000071#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000072
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000073/* enables copy/deepcopy handling (work in progress) */
74#undef USE_BUILTIN_COPY
75
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000076#if PY_VERSION_HEX < 0x01060000
77#define PyObject_DEL(op) PyMem_DEL((op))
78#endif
79
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000080/* -------------------------------------------------------------------- */
81
Fredrik Lundh80946112000-06-29 18:03:25 +000082#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000083#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000084#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000085/* fastest possible local call under MSVC */
86#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000087#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000088#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089#else
90#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000091#endif
92
93/* error codes */
94#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000095#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000096#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000097#define SRE_ERROR_MEMORY -9 /* out of memory */
Christian Heimes2380ac72008-01-09 00:17:24 +000098#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000099
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000100#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000102#else
103#define TRACE(v)
104#endif
105
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000106/* -------------------------------------------------------------------- */
107/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000108
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000109/* default character predicates (run sre_chars.py to regenerate tables) */
110
111#define SRE_DIGIT_MASK 1
112#define SRE_SPACE_MASK 2
113#define SRE_LINEBREAK_MASK 4
114#define SRE_ALNUM_MASK 8
115#define SRE_WORD_MASK 16
116
Fredrik Lundh21009b92001-09-18 18:47:09 +0000117/* FIXME: this assumes ASCII. create tables in init_sre() instead */
118
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000119static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1202, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
12225, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12324, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1240, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12524, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
126
Fredrik Lundhb389df32000-06-29 12:48:37 +0000127static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d582000-06-29 08:58:44 +000012810, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
12927, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
13044, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
13161, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
132108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
133122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
134106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
135120, 121, 122, 123, 124, 125, 126, 127 };
136
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000137#define SRE_IS_DIGIT(ch)\
138 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
139#define SRE_IS_SPACE(ch)\
140 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
141#define SRE_IS_LINEBREAK(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
143#define SRE_IS_ALNUM(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
145#define SRE_IS_WORD(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000147
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000148static unsigned int sre_lower(unsigned int ch)
149{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000150 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000151}
152
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000153/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000154/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
155 * warnings when c's type supports only numbers < N+1 */
156#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
157#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000158#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000159#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000160#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
161
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000162static unsigned int sre_lower_locale(unsigned int ch)
163{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000164 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000165}
166
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000167/* unicode-specific character predicates */
168
169#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000170
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000171#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
172#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
173#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000174#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000175#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000176
177static unsigned int sre_lower_unicode(unsigned int ch)
178{
179 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
180}
181
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000182#endif
183
Guido van Rossumb700df92000-03-31 14:59:30 +0000184LOCAL(int)
185sre_category(SRE_CODE category, unsigned int ch)
186{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000187 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000188
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000189 case SRE_CATEGORY_DIGIT:
190 return SRE_IS_DIGIT(ch);
191 case SRE_CATEGORY_NOT_DIGIT:
192 return !SRE_IS_DIGIT(ch);
193 case SRE_CATEGORY_SPACE:
194 return SRE_IS_SPACE(ch);
195 case SRE_CATEGORY_NOT_SPACE:
196 return !SRE_IS_SPACE(ch);
197 case SRE_CATEGORY_WORD:
198 return SRE_IS_WORD(ch);
199 case SRE_CATEGORY_NOT_WORD:
200 return !SRE_IS_WORD(ch);
201 case SRE_CATEGORY_LINEBREAK:
202 return SRE_IS_LINEBREAK(ch);
203 case SRE_CATEGORY_NOT_LINEBREAK:
204 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000205
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000206 case SRE_CATEGORY_LOC_WORD:
207 return SRE_LOC_IS_WORD(ch);
208 case SRE_CATEGORY_LOC_NOT_WORD:
209 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000210
211#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000212 case SRE_CATEGORY_UNI_DIGIT:
213 return SRE_UNI_IS_DIGIT(ch);
214 case SRE_CATEGORY_UNI_NOT_DIGIT:
215 return !SRE_UNI_IS_DIGIT(ch);
216 case SRE_CATEGORY_UNI_SPACE:
217 return SRE_UNI_IS_SPACE(ch);
218 case SRE_CATEGORY_UNI_NOT_SPACE:
219 return !SRE_UNI_IS_SPACE(ch);
220 case SRE_CATEGORY_UNI_WORD:
221 return SRE_UNI_IS_WORD(ch);
222 case SRE_CATEGORY_UNI_NOT_WORD:
223 return !SRE_UNI_IS_WORD(ch);
224 case SRE_CATEGORY_UNI_LINEBREAK:
225 return SRE_UNI_IS_LINEBREAK(ch);
226 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
227 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000228#else
229 case SRE_CATEGORY_UNI_DIGIT:
230 return SRE_IS_DIGIT(ch);
231 case SRE_CATEGORY_UNI_NOT_DIGIT:
232 return !SRE_IS_DIGIT(ch);
233 case SRE_CATEGORY_UNI_SPACE:
234 return SRE_IS_SPACE(ch);
235 case SRE_CATEGORY_UNI_NOT_SPACE:
236 return !SRE_IS_SPACE(ch);
237 case SRE_CATEGORY_UNI_WORD:
238 return SRE_LOC_IS_WORD(ch);
239 case SRE_CATEGORY_UNI_NOT_WORD:
240 return !SRE_LOC_IS_WORD(ch);
241 case SRE_CATEGORY_UNI_LINEBREAK:
242 return SRE_IS_LINEBREAK(ch);
243 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
244 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000245#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000246 }
247 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000248}
249
250/* helpers */
251
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000252static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000253data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000254{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000255 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000256 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000257 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000258 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000259 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000260}
261
262static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000263data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000264{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000265 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000266 minsize = state->data_stack_base+size;
267 cursize = state->data_stack_size;
268 if (cursize < minsize) {
269 void* stack;
270 cursize = minsize+minsize/4+1024;
271 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000273 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000274 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000275 return SRE_ERROR_MEMORY;
276 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000278 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000279 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000280 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000281}
282
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000283/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000284
285#define SRE_CHAR unsigned char
286#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000287#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000288#define SRE_CHARSET sre_charset
289#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000290#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000291#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000292#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000293#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000294
295#if defined(HAVE_UNICODE)
296
Guido van Rossumb700df92000-03-31 14:59:30 +0000297#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000298#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000299#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000300
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000301#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000302#undef SRE_SEARCH
303#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000304#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000305#undef SRE_INFO
306#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000307#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000308#undef SRE_AT
309#undef SRE_CHAR
310
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000311/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000312
313#define SRE_CHAR Py_UNICODE
314#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000315#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000316#define SRE_CHARSET sre_ucharset
317#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000318#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000319#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000320#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000321#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000322#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000323
324#endif /* SRE_RECURSIVE */
325
326/* -------------------------------------------------------------------- */
327/* String matching engine */
328
329/* the following section is compiled twice, with different character
330 settings */
331
332LOCAL(int)
333SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
334{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000335 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000336
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000337 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000338
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000339 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000340
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000341 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000342 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000343 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000345 case SRE_AT_BEGINNING_LINE:
346 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000347 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000349 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000350 return (((void*) (ptr+1) == state->end &&
351 SRE_IS_LINEBREAK((int) ptr[0])) ||
352 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000353
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000354 case SRE_AT_END_LINE:
355 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000356 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000357
Fredrik Lundh770617b2001-01-14 15:06:11 +0000358 case SRE_AT_END_STRING:
359 return ((void*) ptr == state->end);
360
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000361 case SRE_AT_BOUNDARY:
362 if (state->beginning == state->end)
363 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000364 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000365 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000367 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000368 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000369
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000370 case SRE_AT_NON_BOUNDARY:
371 if (state->beginning == state->end)
372 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000373 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000374 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000376 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000378
379 case SRE_AT_LOC_BOUNDARY:
380 if (state->beginning == state->end)
381 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000382 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000383 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000384 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000385 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000386 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000387
388 case SRE_AT_LOC_NON_BOUNDARY:
389 if (state->beginning == state->end)
390 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000391 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000392 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000393 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000394 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000395 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000396
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000397#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000398 case SRE_AT_UNI_BOUNDARY:
399 if (state->beginning == state->end)
400 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000401 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000402 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000403 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000404 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000405 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000406
407 case SRE_AT_UNI_NON_BOUNDARY:
408 if (state->beginning == state->end)
409 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000410 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000411 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000412 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000413 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000414 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000415#endif
416
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000417 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000418
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000419 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000420}
421
422LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000423SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000424{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000425 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000426
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000427 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000428
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000429 for (;;) {
430 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000431
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000432 case SRE_OP_FAILURE:
433 return !ok;
434
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000435 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000436 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000437 if (ch == set[0])
438 return ok;
439 set++;
440 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000441
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000442 case SRE_OP_CATEGORY:
443 /* <CATEGORY> <code> */
444 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000445 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000446 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000447 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000448
Fredrik Lundh3562f112000-07-02 12:00:07 +0000449 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000450 if (sizeof(SRE_CODE) == 2) {
451 /* <CHARSET> <bitmap> (16 bits per code word) */
452 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
453 return ok;
454 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000455 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000456 else {
457 /* <CHARSET> <bitmap> (32 bits per code word) */
458 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
459 return ok;
460 set += 8;
461 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000462 break;
463
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000464 case SRE_OP_RANGE:
465 /* <RANGE> <lower> <upper> */
466 if (set[0] <= ch && ch <= set[1])
467 return ok;
468 set += 2;
469 break;
470
471 case SRE_OP_NEGATE:
472 ok = !ok;
473 break;
474
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000475 case SRE_OP_BIGCHARSET:
476 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
477 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000478 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000479 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000480
481 if (sizeof(SRE_CODE) == 2) {
482 block = ((unsigned char*)set)[ch >> 8];
483 set += 128;
484 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
485 return ok;
486 set += count*16;
487 }
488 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000489 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
490 * warnings when c's type supports only numbers < N+1 */
491 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000492 block = ((unsigned char*)set)[ch >> 8];
493 else
494 block = -1;
495 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000496 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000497 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
498 return ok;
499 set += count*8;
500 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000501 break;
502 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000503
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000504 default:
505 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000506 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000507 return 0;
508 }
509 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000510}
511
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000512LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000514LOCAL(Py_ssize_t)
515SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000516{
517 SRE_CODE chr;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000518 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
519 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000520 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000521
522 /* adjust end */
523 if (maxcount < end - ptr && maxcount != 65535)
524 end = ptr + maxcount;
525
526 switch (pattern[0]) {
527
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000528 case SRE_OP_IN:
529 /* repeated set */
530 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
531 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
532 ptr++;
533 break;
534
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000535 case SRE_OP_ANY:
536 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000537 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000538 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
539 ptr++;
540 break;
541
542 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000543 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000544 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000545 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000546 ptr = end;
547 break;
548
549 case SRE_OP_LITERAL:
550 /* repeated literal */
551 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000552 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000553 while (ptr < end && (SRE_CODE) *ptr == chr)
554 ptr++;
555 break;
556
557 case SRE_OP_LITERAL_IGNORE:
558 /* repeated literal */
559 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000560 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000561 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
562 ptr++;
563 break;
564
565 case SRE_OP_NOT_LITERAL:
566 /* repeated non-literal */
567 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000568 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000569 while (ptr < end && (SRE_CODE) *ptr != chr)
570 ptr++;
571 break;
Tim Peters3d563502006-01-21 02:47:53 +0000572
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000573 case SRE_OP_NOT_LITERAL_IGNORE:
574 /* repeated non-literal */
575 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000576 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000577 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
578 ptr++;
579 break;
580
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581 default:
582 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000583 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000584 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000585 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000586 if (i < 0)
587 return i;
588 if (!i)
589 break;
590 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000591 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
592 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000593 return (SRE_CHAR*) state->ptr - ptr;
594 }
595
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000596 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000597 return ptr - (SRE_CHAR*) state->ptr;
598}
599
Fredrik Lundh33accc12000-08-27 20:59:47 +0000600#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000601LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000602SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
603{
604 /* check if an SRE_OP_INFO block matches at the current position.
605 returns the number of SRE_CODE objects to skip if successful, 0
606 if no match */
607
608 SRE_CHAR* end = state->end;
609 SRE_CHAR* ptr = state->ptr;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000610 Py_ssize_t i;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000611
612 /* check minimal length */
613 if (pattern[3] && (end - ptr) < pattern[3])
614 return 0;
615
616 /* check known prefix */
617 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
618 /* <length> <skip> <prefix data> <overlap data> */
619 for (i = 0; i < pattern[5]; i++)
620 if ((SRE_CODE) ptr[i] != pattern[7 + i])
621 return 0;
622 return pattern[0] + 2 * pattern[6];
623 }
624 return pattern[0];
625}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000626#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000627
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000628/* The macros below should be used to protect recursive SRE_MATCH()
629 * calls that *failed* and do *not* return immediately (IOW, those
630 * that will backtrack). Explaining:
631 *
632 * - Recursive SRE_MATCH() returned true: that's usually a success
633 * (besides atypical cases like ASSERT_NOT), therefore there's no
634 * reason to restore lastmark;
635 *
636 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
637 * is returning to the caller: If the current SRE_MATCH() is the
638 * top function of the recursion, returning false will be a matching
639 * failure, and it doesn't matter where lastmark is pointing to.
640 * If it's *not* the top function, it will be a recursive SRE_MATCH()
641 * failure by itself, and the calling SRE_MATCH() will have to deal
642 * with the failure by the same rules explained here (it will restore
643 * lastmark by itself if necessary);
644 *
645 * - Recursive SRE_MATCH() returned false, and will continue the
646 * outside 'for' loop: must be protected when breaking, since the next
647 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000648 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000649 * - Recursive SRE_MATCH() returned false, and will be called again
650 * inside a local for/while loop: must be protected between each
651 * loop iteration, since the recursive SRE_MATCH() could do anything,
652 * and could potentially depend on lastmark.
653 *
654 * For more information, check the discussion at SF patch #712900.
655 */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000656#define LASTMARK_SAVE() \
657 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000658 ctx->lastmark = state->lastmark; \
659 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000660 } while (0)
661#define LASTMARK_RESTORE() \
662 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000663 state->lastmark = ctx->lastmark; \
664 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000665 } while (0)
666
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000667#define RETURN_ERROR(i) do { return i; } while(0)
668#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
669#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
670
671#define RETURN_ON_ERROR(i) \
672 do { if (i < 0) RETURN_ERROR(i); } while (0)
673#define RETURN_ON_SUCCESS(i) \
674 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
675#define RETURN_ON_FAILURE(i) \
676 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
677
678#define SFY(x) #x
679
680#define DATA_STACK_ALLOC(state, type, ptr) \
681do { \
682 alloc_pos = state->data_stack_base; \
683 TRACE(("allocating %s in %d (%d)\n", \
684 SFY(type), alloc_pos, sizeof(type))); \
685 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
686 int j = data_stack_grow(state, sizeof(type)); \
687 if (j < 0) return j; \
688 if (ctx_pos != -1) \
689 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
690 } \
691 ptr = (type*)(state->data_stack+alloc_pos); \
692 state->data_stack_base += sizeof(type); \
693} while (0)
694
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000695#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
696do { \
697 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
698 ptr = (type*)(state->data_stack+pos); \
699} while (0)
700
701#define DATA_STACK_PUSH(state, data, size) \
702do { \
703 TRACE(("copy data in %p to %d (%d)\n", \
704 data, state->data_stack_base, size)); \
705 if (state->data_stack_size < state->data_stack_base+size) { \
706 int j = data_stack_grow(state, size); \
707 if (j < 0) return j; \
708 if (ctx_pos != -1) \
709 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
710 } \
711 memcpy(state->data_stack+state->data_stack_base, data, size); \
712 state->data_stack_base += size; \
713} while (0)
714
715#define DATA_STACK_POP(state, data, size, discard) \
716do { \
717 TRACE(("copy data to %p from %d (%d)\n", \
718 data, state->data_stack_base-size, size)); \
719 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
720 if (discard) \
721 state->data_stack_base -= size; \
722} while (0)
723
724#define DATA_STACK_POP_DISCARD(state, size) \
725do { \
726 TRACE(("discard data from %d (%d)\n", \
727 state->data_stack_base-size, size)); \
728 state->data_stack_base -= size; \
729} while(0)
730
731#define DATA_PUSH(x) \
732 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
733#define DATA_POP(x) \
734 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000735#define DATA_POP_DISCARD(x) \
736 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
737#define DATA_ALLOC(t,p) \
738 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000739#define DATA_LOOKUP_AT(t,p,pos) \
740 DATA_STACK_LOOKUP_AT(state,t,p,pos)
741
742#define MARK_PUSH(lastmark) \
743 do if (lastmark > 0) { \
744 i = lastmark; /* ctx->lastmark may change if reallocated */ \
745 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
746 } while (0)
747#define MARK_POP(lastmark) \
748 do if (lastmark > 0) { \
749 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
750 } while (0)
751#define MARK_POP_KEEP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
754 } while (0)
755#define MARK_POP_DISCARD(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
758 } while (0)
759
760#define JUMP_NONE 0
761#define JUMP_MAX_UNTIL_1 1
762#define JUMP_MAX_UNTIL_2 2
763#define JUMP_MAX_UNTIL_3 3
764#define JUMP_MIN_UNTIL_1 4
765#define JUMP_MIN_UNTIL_2 5
766#define JUMP_MIN_UNTIL_3 6
767#define JUMP_REPEAT 7
768#define JUMP_REPEAT_ONE_1 8
769#define JUMP_REPEAT_ONE_2 9
770#define JUMP_MIN_REPEAT_ONE 10
771#define JUMP_BRANCH 11
772#define JUMP_ASSERT 12
773#define JUMP_ASSERT_NOT 13
774
775#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
776 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
777 nextctx->last_ctx_pos = ctx_pos; \
778 nextctx->jump = jumpvalue; \
779 nextctx->pattern = nextpattern; \
780 ctx_pos = alloc_pos; \
781 ctx = nextctx; \
782 goto entrance; \
783 jumplabel: \
784 while (0) /* gcc doesn't like labels at end of scopes */ \
785
786typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000787 Py_ssize_t last_ctx_pos;
788 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000789 SRE_CHAR* ptr;
790 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000791 Py_ssize_t count;
792 Py_ssize_t lastmark;
793 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000794 union {
795 SRE_CODE chr;
796 SRE_REPEAT* rep;
797 } u;
798} SRE_MATCH_CONTEXT;
799
800/* check if string matches the given pattern. returns <0 for
801 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000802LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000803SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000804{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000805 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000806 Py_ssize_t alloc_pos, ctx_pos = -1;
807 Py_ssize_t i, ret = 0;
808 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000809 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000810
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000811 SRE_MATCH_CONTEXT* ctx;
812 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000813
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000814 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000815
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000816 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
817 ctx->last_ctx_pos = -1;
818 ctx->jump = JUMP_NONE;
819 ctx->pattern = pattern;
820 ctx_pos = alloc_pos;
821
822entrance:
823
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000824 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000825
826 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000827 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000828 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000830 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 (end - ctx->ptr), ctx->pattern[3]));
832 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000833 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000834 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000835 }
836
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000837 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000838 ++sigcount;
839 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
840 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000841
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000842 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000843
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000844 case SRE_OP_MARK:
845 /* set mark */
846 /* <MARK> <gid> */
847 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
848 ctx->ptr, ctx->pattern[0]));
849 i = ctx->pattern[0];
850 if (i & 1)
851 state->lastindex = i/2 + 1;
852 if (i > state->lastmark) {
853 /* state->lastmark is the highest valid index in the
854 state->mark array. If it is increased by more than 1,
855 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000856 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000857 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000858 while (j < i)
859 state->mark[j++] = NULL;
860 state->lastmark = i;
861 }
862 state->mark[i] = ctx->ptr;
863 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000864 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000865
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000866 case SRE_OP_LITERAL:
867 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000868 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000869 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
870 ctx->ptr, *ctx->pattern));
871 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
872 RETURN_FAILURE;
873 ctx->pattern++;
874 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000875 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000876
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000877 case SRE_OP_NOT_LITERAL:
878 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000879 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000880 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
881 ctx->ptr, *ctx->pattern));
882 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
883 RETURN_FAILURE;
884 ctx->pattern++;
885 ctx->ptr++;
886 break;
887
888 case SRE_OP_SUCCESS:
889 /* end of pattern */
890 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
891 state->ptr = ctx->ptr;
892 RETURN_SUCCESS;
893
894 case SRE_OP_AT:
895 /* match at given position */
896 /* <AT> <code> */
897 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
898 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
899 RETURN_FAILURE;
900 ctx->pattern++;
901 break;
902
903 case SRE_OP_CATEGORY:
904 /* match at given category */
905 /* <CATEGORY> <code> */
906 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
907 ctx->ptr, *ctx->pattern));
908 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
909 RETURN_FAILURE;
910 ctx->pattern++;
911 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000912 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000913
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000914 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000915 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000916 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000917 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
918 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
919 RETURN_FAILURE;
920 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000921 break;
922
923 case SRE_OP_ANY_ALL:
924 /* match anything */
925 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000926 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
927 if (ctx->ptr >= end)
928 RETURN_FAILURE;
929 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000930 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000931
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000932 case SRE_OP_IN:
933 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000934 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000935 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
936 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
937 RETURN_FAILURE;
938 ctx->pattern += ctx->pattern[0];
939 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000940 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000941
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000942 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000943 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
944 ctx->pattern, ctx->ptr, ctx->pattern[0]));
945 if (ctx->ptr >= end ||
946 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
947 RETURN_FAILURE;
948 ctx->pattern++;
949 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000950 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000951
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000952 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000953 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
954 ctx->pattern, ctx->ptr, *ctx->pattern));
955 if (ctx->ptr >= end ||
956 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
957 RETURN_FAILURE;
958 ctx->pattern++;
959 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000960 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000961
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000962 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000963 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
964 if (ctx->ptr >= end
965 || !SRE_CHARSET(ctx->pattern+1,
966 (SRE_CODE)state->lower(*ctx->ptr)))
967 RETURN_FAILURE;
968 ctx->pattern += ctx->pattern[0];
969 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000970 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000971
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000972 case SRE_OP_JUMP:
973 case SRE_OP_INFO:
974 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000975 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000976 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
977 ctx->ptr, ctx->pattern[0]));
978 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000979 break;
980
981 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000982 /* alternation */
983 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000984 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000985 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000986 ctx->u.rep = state->repeat;
987 if (ctx->u.rep)
988 MARK_PUSH(ctx->lastmark);
989 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
990 if (ctx->pattern[1] == SRE_OP_LITERAL &&
991 (ctx->ptr >= end ||
992 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000993 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000994 if (ctx->pattern[1] == SRE_OP_IN &&
995 (ctx->ptr >= end ||
996 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000997 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000998 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000999 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001000 if (ret) {
1001 if (ctx->u.rep)
1002 MARK_POP_DISCARD(ctx->lastmark);
1003 RETURN_ON_ERROR(ret);
1004 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001005 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001006 if (ctx->u.rep)
1007 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001008 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001009 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001010 if (ctx->u.rep)
1011 MARK_POP_DISCARD(ctx->lastmark);
1012 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001013
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001014 case SRE_OP_REPEAT_ONE:
1015 /* match repeated sequence (maximizing regexp) */
1016
1017 /* this operator only works if the repeated item is
1018 exactly one character wide, and we're not already
1019 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001020 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001021
1022 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1023
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1025 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 if (ctx->ptr + ctx->pattern[1] > end)
1028 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001029
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001030 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001032 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1033 RETURN_ON_ERROR(ret);
1034 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1035 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037
1038 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001039 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040 string. check if the rest of the pattern matches,
1041 and backtrack if not. */
1042
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001043 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001044 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001045
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001046 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001047 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001048 state->ptr = ctx->ptr;
1049 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001050 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001051
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001052 LASTMARK_SAVE();
1053
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055 /* tail starts with a literal. skip positions where
1056 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001057 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001058 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001059 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001060 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1061 ctx->ptr--;
1062 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001063 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001064 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001065 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001066 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001067 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1068 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001069 if (ret) {
1070 RETURN_ON_ERROR(ret);
1071 RETURN_SUCCESS;
1072 }
Tim Peters3d563502006-01-21 02:47:53 +00001073
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001074 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001075
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001076 ctx->ptr--;
1077 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001078 }
1079
1080 } else {
1081 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001082 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001083 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001084 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1085 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001086 if (ret) {
1087 RETURN_ON_ERROR(ret);
1088 RETURN_SUCCESS;
1089 }
1090 ctx->ptr--;
1091 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001092 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001093 }
1094 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001095 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001096
Guido van Rossum41c99e72003-04-14 17:59:34 +00001097 case SRE_OP_MIN_REPEAT_ONE:
1098 /* match repeated sequence (minimizing regexp) */
1099
1100 /* this operator only works if the repeated item is
1101 exactly one character wide, and we're not already
1102 collecting backtracking points. for other cases,
1103 use the MIN_REPEAT operator */
1104
1105 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1106
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001107 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1108 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 if (ctx->ptr + ctx->pattern[1] > end)
1111 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001112
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001113 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001114
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001115 if (ctx->pattern[1] == 0)
1116 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001117 else {
1118 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001119 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1120 RETURN_ON_ERROR(ret);
1121 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001122 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001123 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 RETURN_FAILURE;
1125 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001126 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 }
1129
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001130 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001132 state->ptr = ctx->ptr;
1133 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001134
1135 } else {
1136 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001137 LASTMARK_SAVE();
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001138 while ((Py_ssize_t)ctx->pattern[2] == 65535
1139 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001141 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1142 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001143 if (ret) {
1144 RETURN_ON_ERROR(ret);
1145 RETURN_SUCCESS;
1146 }
1147 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001148 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001149 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001150 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001151 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001152 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001153 assert(ret == 1);
1154 ctx->ptr++;
1155 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001156 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001157 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001158 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001159 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001160
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001161 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001162 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001163 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001165 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1166 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001167
1168 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001169 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001170 if (!ctx->u.rep) {
1171 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001172 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001173 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001174 ctx->u.rep->count = -1;
1175 ctx->u.rep->pattern = ctx->pattern;
1176 ctx->u.rep->prev = state->repeat;
1177 ctx->u.rep->last_ptr = NULL;
1178 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001179
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001180 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001181 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001182 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001183 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001184
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001185 if (ret) {
1186 RETURN_ON_ERROR(ret);
1187 RETURN_SUCCESS;
1188 }
1189 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001190
1191 case SRE_OP_MAX_UNTIL:
1192 /* maximizing repeat */
1193 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1194
1195 /* FIXME: we probably need to deal with zero-width
1196 matches in here... */
1197
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001198 ctx->u.rep = state->repeat;
1199 if (!ctx->u.rep)
1200 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001201
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1207 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001208
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001210 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001212 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1213 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001214 if (ret) {
1215 RETURN_ON_ERROR(ret);
1216 RETURN_SUCCESS;
1217 }
1218 ctx->u.rep->count = ctx->count-1;
1219 state->ptr = ctx->ptr;
1220 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001221 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001222
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001223 if ((ctx->count < ctx->u.rep->pattern[2] ||
1224 ctx->u.rep->pattern[2] == 65535) &&
1225 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001226 /* we may have enough matches, but if we can
1227 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001228 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001229 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001230 MARK_PUSH(ctx->lastmark);
1231 /* zero-width match protection */
1232 DATA_PUSH(&ctx->u.rep->last_ptr);
1233 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001234 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1235 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001236 DATA_POP(&ctx->u.rep->last_ptr);
1237 if (ret) {
1238 MARK_POP_DISCARD(ctx->lastmark);
1239 RETURN_ON_ERROR(ret);
1240 RETURN_SUCCESS;
1241 }
1242 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001243 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001244 ctx->u.rep->count = ctx->count-1;
1245 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001246 }
1247
1248 /* cannot match more repeated items here. make sure the
1249 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001250 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001251 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001252 RETURN_ON_SUCCESS(ret);
1253 state->repeat = ctx->u.rep;
1254 state->ptr = ctx->ptr;
1255 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001256
1257 case SRE_OP_MIN_UNTIL:
1258 /* minimizing repeat */
1259 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1260
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001261 ctx->u.rep = state->repeat;
1262 if (!ctx->u.rep)
1263 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001264
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001266
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001267 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1270 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001271
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001272 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001273 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001274 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001275 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1276 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001277 if (ret) {
1278 RETURN_ON_ERROR(ret);
1279 RETURN_SUCCESS;
1280 }
1281 ctx->u.rep->count = ctx->count-1;
1282 state->ptr = ctx->ptr;
1283 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001284 }
1285
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001286 LASTMARK_SAVE();
1287
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001288 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001289 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001290 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001291 if (ret) {
1292 RETURN_ON_ERROR(ret);
1293 RETURN_SUCCESS;
1294 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001295
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001296 state->repeat = ctx->u.rep;
1297 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001298
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001299 LASTMARK_RESTORE();
1300
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001301 if (ctx->count >= ctx->u.rep->pattern[2]
1302 && ctx->u.rep->pattern[2] != 65535)
1303 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001304
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001305 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001306 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1307 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001308 if (ret) {
1309 RETURN_ON_ERROR(ret);
1310 RETURN_SUCCESS;
1311 }
1312 ctx->u.rep->count = ctx->count-1;
1313 state->ptr = ctx->ptr;
1314 RETURN_FAILURE;
1315
1316 case SRE_OP_GROUPREF:
1317 /* match backreference */
1318 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1319 ctx->ptr, ctx->pattern[0]));
1320 i = ctx->pattern[0];
1321 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001322 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001323 if (groupref >= state->lastmark) {
1324 RETURN_FAILURE;
1325 } else {
1326 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1327 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1328 if (!p || !e || e < p)
1329 RETURN_FAILURE;
1330 while (p < e) {
1331 if (ctx->ptr >= end || *ctx->ptr != *p)
1332 RETURN_FAILURE;
1333 p++; ctx->ptr++;
1334 }
1335 }
1336 }
1337 ctx->pattern++;
1338 break;
1339
1340 case SRE_OP_GROUPREF_IGNORE:
1341 /* match backreference */
1342 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1343 ctx->ptr, ctx->pattern[0]));
1344 i = ctx->pattern[0];
1345 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001346 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001347 if (groupref >= state->lastmark) {
1348 RETURN_FAILURE;
1349 } else {
1350 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1351 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1352 if (!p || !e || e < p)
1353 RETURN_FAILURE;
1354 while (p < e) {
1355 if (ctx->ptr >= end ||
1356 state->lower(*ctx->ptr) != state->lower(*p))
1357 RETURN_FAILURE;
1358 p++; ctx->ptr++;
1359 }
1360 }
1361 }
1362 ctx->pattern++;
1363 break;
1364
1365 case SRE_OP_GROUPREF_EXISTS:
1366 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1367 ctx->ptr, ctx->pattern[0]));
1368 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1369 i = ctx->pattern[0];
1370 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001371 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001372 if (groupref >= state->lastmark) {
1373 ctx->pattern += ctx->pattern[1];
1374 break;
1375 } else {
1376 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1377 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1378 if (!p || !e || e < p) {
1379 ctx->pattern += ctx->pattern[1];
1380 break;
1381 }
1382 }
1383 }
1384 ctx->pattern += 2;
1385 break;
1386
1387 case SRE_OP_ASSERT:
1388 /* assert subpattern */
1389 /* <ASSERT> <skip> <back> <pattern> */
1390 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1391 ctx->ptr, ctx->pattern[1]));
1392 state->ptr = ctx->ptr - ctx->pattern[1];
1393 if (state->ptr < state->beginning)
1394 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001395 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001396 RETURN_ON_FAILURE(ret);
1397 ctx->pattern += ctx->pattern[0];
1398 break;
1399
1400 case SRE_OP_ASSERT_NOT:
1401 /* assert not subpattern */
1402 /* <ASSERT_NOT> <skip> <back> <pattern> */
1403 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1404 ctx->ptr, ctx->pattern[1]));
1405 state->ptr = ctx->ptr - ctx->pattern[1];
1406 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001407 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001408 if (ret) {
1409 RETURN_ON_ERROR(ret);
1410 RETURN_FAILURE;
1411 }
1412 }
1413 ctx->pattern += ctx->pattern[0];
1414 break;
1415
1416 case SRE_OP_FAILURE:
1417 /* immediate failure */
1418 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1419 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001420
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001421 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001422 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1423 ctx->pattern[-1]));
1424 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001425 }
1426 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001427
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001428exit:
1429 ctx_pos = ctx->last_ctx_pos;
1430 jump = ctx->jump;
1431 DATA_POP_DISCARD(ctx);
1432 if (ctx_pos == -1)
1433 return ret;
1434 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001436 switch (jump) {
1437 case JUMP_MAX_UNTIL_2:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1439 goto jump_max_until_2;
1440 case JUMP_MAX_UNTIL_3:
1441 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1442 goto jump_max_until_3;
1443 case JUMP_MIN_UNTIL_2:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1445 goto jump_min_until_2;
1446 case JUMP_MIN_UNTIL_3:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1448 goto jump_min_until_3;
1449 case JUMP_BRANCH:
1450 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1451 goto jump_branch;
1452 case JUMP_MAX_UNTIL_1:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_max_until_1;
1455 case JUMP_MIN_UNTIL_1:
1456 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1457 goto jump_min_until_1;
1458 case JUMP_REPEAT:
1459 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1460 goto jump_repeat;
1461 case JUMP_REPEAT_ONE_1:
1462 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1463 goto jump_repeat_one_1;
1464 case JUMP_REPEAT_ONE_2:
1465 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1466 goto jump_repeat_one_2;
1467 case JUMP_MIN_REPEAT_ONE:
1468 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1469 goto jump_min_repeat_one;
1470 case JUMP_ASSERT:
1471 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1472 goto jump_assert;
1473 case JUMP_ASSERT_NOT:
1474 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1475 goto jump_assert_not;
1476 case JUMP_NONE:
1477 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1478 break;
1479 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001480
1481 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001482}
1483
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001484LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001485SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1486{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001487 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1488 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001489 Py_ssize_t status = 0;
1490 Py_ssize_t prefix_len = 0;
1491 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001492 SRE_CODE* prefix = NULL;
1493 SRE_CODE* charset = NULL;
1494 SRE_CODE* overlap = NULL;
1495 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001496
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001497 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001498 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001499 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001500
1501 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001502
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001503 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001504 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001505 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001506 end -= pattern[3]-1;
1507 if (end <= ptr)
1508 end = ptr+1;
1509 }
1510
Fredrik Lundh3562f112000-07-02 12:00:07 +00001511 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001512 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001513 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001514 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001515 prefix_skip = pattern[6];
1516 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001517 overlap = prefix + prefix_len - 1;
1518 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001519 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001520 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001521 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001522
1523 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001524 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001525
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001526 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1527 TRACE(("charset = %p\n", charset));
1528
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001529#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001530 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001531 /* pattern starts with a known prefix. use the overlap
1532 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001533 Py_ssize_t i = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001534 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001535 while (ptr < end) {
1536 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001537 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001538 if (!i)
1539 break;
1540 else
1541 i = overlap[i];
1542 } else {
1543 if (++i == prefix_len) {
1544 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001545 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1546 state->start = ptr + 1 - prefix_len;
1547 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001548 if (flags & SRE_INFO_LITERAL)
1549 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001550 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001551 if (status != 0)
1552 return status;
1553 /* close but no cigar -- try again */
1554 i = overlap[i];
1555 }
1556 break;
1557 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001558 }
1559 ptr++;
1560 }
1561 return 0;
1562 }
1563#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001564
Fredrik Lundh3562f112000-07-02 12:00:07 +00001565 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001566 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001567 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001568 SRE_CODE chr = pattern[1];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001569 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001570 for (;;) {
1571 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1572 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001573 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001574 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001575 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 state->start = ptr;
1577 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001578 if (flags & SRE_INFO_LITERAL)
1579 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001580 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001581 if (status != 0)
1582 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001583 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 } else if (charset) {
1585 /* pattern starts with a character from a known set */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001586 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001587 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001588 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001589 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001590 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001592 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 state->start = ptr;
1594 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001595 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 if (status != 0)
1597 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001598 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 }
1600 } else
1601 /* general case */
1602 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001603 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001604 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001605 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001606 if (status != 0)
1607 break;
1608 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001609
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001610 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001611}
Tim Peters3d563502006-01-21 02:47:53 +00001612
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001613LOCAL(int)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001614SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001615{
1616 /* check if given string is a literal template (i.e. no escapes) */
1617 while (len-- > 0)
1618 if (*ptr++ == '\\')
1619 return 0;
1620 return 1;
1621}
Guido van Rossumb700df92000-03-31 14:59:30 +00001622
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001623#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001624
1625/* -------------------------------------------------------------------- */
1626/* factories and destructors */
1627
1628/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001629static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1630static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001631
1632static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001633sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001634{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001635 return Py_BuildValue("l", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001636}
1637
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001638static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001639sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001640{
1641 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001642 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001643 return NULL;
1644 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001645 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001646 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001647#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001648 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001649#else
1650 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001651#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001652 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001653}
1654
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655LOCAL(void)
1656state_reset(SRE_STATE* state)
1657{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001658 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001659 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001660
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001661 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001662 state->lastindex = -1;
1663
1664 state->repeat = NULL;
1665
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001666 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001667}
1668
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001669static void*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001670getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001671{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001672 /* given a python object, return a data pointer, a length (in
1673 characters), and a character size. return NULL if the object
1674 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001675
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001677 Py_ssize_t size, bytes;
1678 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001679 void* ptr;
Travis E. Oliphant8ae62b62007-09-23 02:00:13 +00001680 Py_buffer view;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001681
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001682 /* Unicode objects do not support the buffer API. So, get the data
1683 directly instead. */
1684 if (PyUnicode_Check(string)) {
1685 ptr = (void *)PyUnicode_AS_DATA(string);
1686 *p_length = PyUnicode_GET_SIZE(string);
1687 *p_charsize = sizeof(Py_UNICODE);
1688 return ptr;
1689 }
1690
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001691 /* get pointer to string buffer */
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001692 view.len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001693 buffer = Py_TYPE(string)->tp_as_buffer;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001694 if (!buffer || !buffer->bf_getbuffer ||
1695 (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
1696 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1697 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001698 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001699
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001700 /* determine buffer size */
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001701 bytes = view.len;
1702 ptr = view.buf;
1703
1704 /* Release the buffer immediately --- possibly dangerous
1705 but doing something else would require some re-factoring
1706 */
1707 PyObject_ReleaseBuffer(string, &view);
1708
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001709 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001710 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1711 return NULL;
1712 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001713
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001714 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001715 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001716
Christian Heimes72b710a2008-05-26 13:28:38 +00001717 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001718 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001719#if defined(HAVE_UNICODE)
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001720 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001721 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001722#endif
1723 else {
1724 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1725 return NULL;
1726 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001727
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001728 *p_length = size;
1729 *p_charsize = charsize;
1730
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001731 if (ptr == NULL) {
1732 PyErr_SetString(PyExc_ValueError,
1733 "Buffer is NULL");
1734 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001735 return ptr;
1736}
1737
1738LOCAL(PyObject*)
1739state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001740 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001741{
1742 /* prepare state object */
1743
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001744 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001745 int charsize;
1746 void* ptr;
1747
1748 memset(state, 0, sizeof(SRE_STATE));
1749
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001750 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001751 state->lastindex = -1;
1752
1753 ptr = getstring(string, &length, &charsize);
1754 if (!ptr)
1755 return NULL;
1756
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001757 /* adjust boundaries */
1758 if (start < 0)
1759 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001760 else if (start > length)
1761 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001762
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001763 if (end < 0)
1764 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001765 else if (end > length)
1766 end = length;
1767
1768 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001769
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001770 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001771
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001772 state->start = (void*) ((char*) ptr + start * state->charsize);
1773 state->end = (void*) ((char*) ptr + end * state->charsize);
1774
1775 Py_INCREF(string);
1776 state->string = string;
1777 state->pos = start;
1778 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001779
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001780 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001781 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001782 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001783#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001784 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001785#else
1786 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001787#endif
1788 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001789 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001790
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001791 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001792}
1793
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001794LOCAL(void)
1795state_fini(SRE_STATE* state)
1796{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001797 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001798 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001799}
1800
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001801/* calculate offset from start of string */
1802#define STATE_OFFSET(state, member)\
1803 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1804
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001805LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001806state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001807{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001808 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001809
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001810 index = (index - 1) * 2;
1811
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001812 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001813 if (empty)
1814 /* want empty string */
1815 i = j = 0;
1816 else {
1817 Py_INCREF(Py_None);
1818 return Py_None;
1819 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001820 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001821 i = STATE_OFFSET(state, state->mark[index]);
1822 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001823 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001824
Fredrik Lundh58100642000-08-09 09:14:35 +00001825 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001826}
1827
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001828static void
1829pattern_error(int status)
1830{
1831 switch (status) {
1832 case SRE_ERROR_RECURSION_LIMIT:
1833 PyErr_SetString(
1834 PyExc_RuntimeError,
1835 "maximum recursion limit exceeded"
1836 );
1837 break;
1838 case SRE_ERROR_MEMORY:
1839 PyErr_NoMemory();
1840 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001841 case SRE_ERROR_INTERRUPTED:
1842 /* An exception has already been raised, so let it fly */
1843 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001844 default:
1845 /* other error codes indicate compiler/engine bugs */
1846 PyErr_SetString(
1847 PyExc_RuntimeError,
1848 "internal error in regular expression engine"
1849 );
1850 }
1851}
1852
Guido van Rossumb700df92000-03-31 14:59:30 +00001853static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001854pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001855{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001856 if (self->weakreflist != NULL)
1857 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001858 Py_XDECREF(self->pattern);
1859 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001860 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001861 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001862}
1863
1864static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001865pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001866{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001867 SRE_STATE state;
1868 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001869
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001870 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001871 Py_ssize_t start = 0;
1872 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001873 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001874 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001875 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001876 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001877
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001878 string = state_init(&state, self, string, start, end);
1879 if (!string)
1880 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001881
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001882 state.ptr = state.start;
1883
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001884 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1885
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001886 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001887 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001888 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001889#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001890 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001891#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001892 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001893
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001894 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001895 if (PyErr_Occurred())
1896 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001897
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001898 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001899
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001900 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001901}
1902
1903static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001904pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001905{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001906 SRE_STATE state;
1907 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001909 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001910 Py_ssize_t start = 0;
1911 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001912 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001913 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001914 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001915 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001916
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001917 string = state_init(&state, self, string, start, end);
1918 if (!string)
1919 return NULL;
1920
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001921 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1922
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001923 if (state.charsize == 1) {
1924 status = sre_search(&state, PatternObject_GetCode(self));
1925 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001926#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001927 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001928#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001929 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001930
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001931 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1932
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001933 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001934
Thomas Wouters89f507f2006-12-13 04:49:30 +00001935 if (PyErr_Occurred())
1936 return NULL;
1937
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001938 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001939}
1940
1941static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001942call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001943{
1944 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001945 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001946 PyObject* func;
1947 PyObject* result;
1948
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001949 if (!args)
1950 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001951 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001952 if (!name)
1953 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001954 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001955 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001956 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001958 func = PyObject_GetAttrString(mod, function);
1959 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001960 if (!func)
1961 return NULL;
1962 result = PyObject_CallObject(func, args);
1963 Py_DECREF(func);
1964 Py_DECREF(args);
1965 return result;
1966}
1967
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001968#ifdef USE_BUILTIN_COPY
1969static int
1970deepcopy(PyObject** object, PyObject* memo)
1971{
1972 PyObject* copy;
1973
1974 copy = call(
1975 "copy", "deepcopy",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001976 PyTuple_Pack(2, *object, memo)
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001977 );
1978 if (!copy)
1979 return 0;
1980
1981 Py_DECREF(*object);
1982 *object = copy;
1983
1984 return 1; /* success */
1985}
1986#endif
1987
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001988static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00001989join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001990{
1991 /* join list elements */
1992
1993 PyObject* joiner;
1994#if PY_VERSION_HEX >= 0x01060000
1995 PyObject* function;
1996 PyObject* args;
1997#endif
1998 PyObject* result;
1999
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002000 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002001 if (!joiner)
2002 return NULL;
2003
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002004 if (PyList_GET_SIZE(list) == 0) {
2005 Py_DECREF(list);
2006 return joiner;
2007 }
2008
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002009#if PY_VERSION_HEX >= 0x01060000
2010 function = PyObject_GetAttrString(joiner, "join");
2011 if (!function) {
2012 Py_DECREF(joiner);
2013 return NULL;
2014 }
2015 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002016 if (!args) {
2017 Py_DECREF(function);
2018 Py_DECREF(joiner);
2019 return NULL;
2020 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002021 PyTuple_SET_ITEM(args, 0, list);
2022 result = PyObject_CallObject(function, args);
2023 Py_DECREF(args); /* also removes list */
2024 Py_DECREF(function);
2025#else
2026 result = call(
2027 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002028 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002029 );
2030#endif
2031 Py_DECREF(joiner);
2032
2033 return result;
2034}
2035
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002036static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002037pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002038{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002039 SRE_STATE state;
2040 PyObject* list;
2041 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002042 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002043
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002044 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002045 Py_ssize_t start = 0;
2046 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002047 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002048 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002049 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002051
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002052 string = state_init(&state, self, string, start, end);
2053 if (!string)
2054 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002055
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002056 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002057 if (!list) {
2058 state_fini(&state);
2059 return NULL;
2060 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002061
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002062 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002064 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002065
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002066 state_reset(&state);
2067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 state.ptr = state.start;
2069
2070 if (state.charsize == 1) {
2071 status = sre_search(&state, PatternObject_GetCode(self));
2072 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002073#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002074 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002075#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002076 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002077
Thomas Wouters89f507f2006-12-13 04:49:30 +00002078 if (PyErr_Occurred())
2079 goto error;
2080
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002081 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002082 if (status == 0)
2083 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002084 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002085 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002086 }
Tim Peters3d563502006-01-21 02:47:53 +00002087
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002088 /* don't bother to build a match object */
2089 switch (self->groups) {
2090 case 0:
2091 b = STATE_OFFSET(&state, state.start);
2092 e = STATE_OFFSET(&state, state.ptr);
2093 item = PySequence_GetSlice(string, b, e);
2094 if (!item)
2095 goto error;
2096 break;
2097 case 1:
2098 item = state_getslice(&state, 1, string, 1);
2099 if (!item)
2100 goto error;
2101 break;
2102 default:
2103 item = PyTuple_New(self->groups);
2104 if (!item)
2105 goto error;
2106 for (i = 0; i < self->groups; i++) {
2107 PyObject* o = state_getslice(&state, i+1, string, 1);
2108 if (!o) {
2109 Py_DECREF(item);
2110 goto error;
2111 }
2112 PyTuple_SET_ITEM(item, i, o);
2113 }
2114 break;
2115 }
2116
2117 status = PyList_Append(list, item);
2118 Py_DECREF(item);
2119 if (status < 0)
2120 goto error;
2121
2122 if (state.ptr == state.start)
2123 state.start = (void*) ((char*) state.ptr + state.charsize);
2124 else
2125 state.start = state.ptr;
2126
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002127 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002128
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002129 state_fini(&state);
2130 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002131
2132error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002133 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002134 state_fini(&state);
2135 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002136
Guido van Rossumb700df92000-03-31 14:59:30 +00002137}
2138
Fredrik Lundh703ce812001-10-24 22:16:30 +00002139#if PY_VERSION_HEX >= 0x02020000
2140static PyObject*
2141pattern_finditer(PatternObject* pattern, PyObject* args)
2142{
2143 PyObject* scanner;
2144 PyObject* search;
2145 PyObject* iterator;
2146
2147 scanner = pattern_scanner(pattern, args);
2148 if (!scanner)
2149 return NULL;
2150
2151 search = PyObject_GetAttrString(scanner, "search");
2152 Py_DECREF(scanner);
2153 if (!search)
2154 return NULL;
2155
2156 iterator = PyCallIter_New(search, Py_None);
2157 Py_DECREF(search);
2158
2159 return iterator;
2160}
2161#endif
2162
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002163static PyObject*
2164pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2165{
2166 SRE_STATE state;
2167 PyObject* list;
2168 PyObject* item;
2169 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002170 Py_ssize_t n;
2171 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002172 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002173
2174 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002175 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002176 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002177 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002178 &string, &maxsplit))
2179 return NULL;
2180
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002181 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002182 if (!string)
2183 return NULL;
2184
2185 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002186 if (!list) {
2187 state_fini(&state);
2188 return NULL;
2189 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002190
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002191 n = 0;
2192 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002193
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002194 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002195
2196 state_reset(&state);
2197
2198 state.ptr = state.start;
2199
2200 if (state.charsize == 1) {
2201 status = sre_search(&state, PatternObject_GetCode(self));
2202 } else {
2203#if defined(HAVE_UNICODE)
2204 status = sre_usearch(&state, PatternObject_GetCode(self));
2205#endif
2206 }
2207
Thomas Wouters89f507f2006-12-13 04:49:30 +00002208 if (PyErr_Occurred())
2209 goto error;
2210
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002211 if (status <= 0) {
2212 if (status == 0)
2213 break;
2214 pattern_error(status);
2215 goto error;
2216 }
Tim Peters3d563502006-01-21 02:47:53 +00002217
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002218 if (state.start == state.ptr) {
2219 if (last == state.end)
2220 break;
2221 /* skip one character */
2222 state.start = (void*) ((char*) state.ptr + state.charsize);
2223 continue;
2224 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002225
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002226 /* get segment before this match */
2227 item = PySequence_GetSlice(
2228 string, STATE_OFFSET(&state, last),
2229 STATE_OFFSET(&state, state.start)
2230 );
2231 if (!item)
2232 goto error;
2233 status = PyList_Append(list, item);
2234 Py_DECREF(item);
2235 if (status < 0)
2236 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002237
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002238 /* add groups (if any) */
2239 for (i = 0; i < self->groups; i++) {
2240 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002241 if (!item)
2242 goto error;
2243 status = PyList_Append(list, item);
2244 Py_DECREF(item);
2245 if (status < 0)
2246 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002247 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002248
2249 n = n + 1;
2250
2251 last = state.start = state.ptr;
2252
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002253 }
2254
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002255 /* get segment following last match (even if empty) */
2256 item = PySequence_GetSlice(
2257 string, STATE_OFFSET(&state, last), state.endpos
2258 );
2259 if (!item)
2260 goto error;
2261 status = PyList_Append(list, item);
2262 Py_DECREF(item);
2263 if (status < 0)
2264 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002265
2266 state_fini(&state);
2267 return list;
2268
2269error:
2270 Py_DECREF(list);
2271 state_fini(&state);
2272 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002273
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002274}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002275
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002276static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002277pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002278 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002279{
2280 SRE_STATE state;
2281 PyObject* list;
2282 PyObject* item;
2283 PyObject* filter;
2284 PyObject* args;
2285 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002286 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002287 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002288 Py_ssize_t n;
2289 Py_ssize_t i, b, e;
2290 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002291 int filter_is_callable;
2292
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002293 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002294 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002295 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002296 Py_INCREF(filter);
2297 filter_is_callable = 1;
2298 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002299 /* if not callable, check if it's a literal string */
2300 int literal;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002301 ptr = getstring(ptemplate, &n, &bint);
2302 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002303 if (ptr) {
2304 if (b == 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002305 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002306 } else {
2307#if defined(HAVE_UNICODE)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002308 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002309#endif
2310 }
2311 } else {
2312 PyErr_Clear();
2313 literal = 0;
2314 }
2315 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002316 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002317 Py_INCREF(filter);
2318 filter_is_callable = 0;
2319 } else {
2320 /* not a literal; hand it over to the template compiler */
2321 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002322 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002323 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002324 );
2325 if (!filter)
2326 return NULL;
2327 filter_is_callable = PyCallable_Check(filter);
2328 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002329 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002330
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002331 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002332 if (!string) {
2333 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002334 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002335 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002336
2337 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002338 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002339 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002340 state_fini(&state);
2341 return NULL;
2342 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002343
2344 n = i = 0;
2345
2346 while (!count || n < count) {
2347
2348 state_reset(&state);
2349
2350 state.ptr = state.start;
2351
2352 if (state.charsize == 1) {
2353 status = sre_search(&state, PatternObject_GetCode(self));
2354 } else {
2355#if defined(HAVE_UNICODE)
2356 status = sre_usearch(&state, PatternObject_GetCode(self));
2357#endif
2358 }
2359
Thomas Wouters89f507f2006-12-13 04:49:30 +00002360 if (PyErr_Occurred())
2361 goto error;
2362
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002363 if (status <= 0) {
2364 if (status == 0)
2365 break;
2366 pattern_error(status);
2367 goto error;
2368 }
Tim Peters3d563502006-01-21 02:47:53 +00002369
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002370 b = STATE_OFFSET(&state, state.start);
2371 e = STATE_OFFSET(&state, state.ptr);
2372
2373 if (i < b) {
2374 /* get segment before this match */
2375 item = PySequence_GetSlice(string, i, b);
2376 if (!item)
2377 goto error;
2378 status = PyList_Append(list, item);
2379 Py_DECREF(item);
2380 if (status < 0)
2381 goto error;
2382
2383 } else if (i == b && i == e && n > 0)
2384 /* ignore empty match on latest position */
2385 goto next;
2386
2387 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002388 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002389 match = pattern_new_match(self, &state, 1);
2390 if (!match)
2391 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002392 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002393 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002394 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002395 goto error;
2396 }
2397 item = PyObject_CallObject(filter, args);
2398 Py_DECREF(args);
2399 Py_DECREF(match);
2400 if (!item)
2401 goto error;
2402 } else {
2403 /* filter is literal string */
2404 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002405 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002406 }
2407
2408 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002409 if (item != Py_None) {
2410 status = PyList_Append(list, item);
2411 Py_DECREF(item);
2412 if (status < 0)
2413 goto error;
2414 }
Tim Peters3d563502006-01-21 02:47:53 +00002415
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002416 i = e;
2417 n = n + 1;
2418
2419next:
2420 /* move on */
2421 if (state.ptr == state.start)
2422 state.start = (void*) ((char*) state.ptr + state.charsize);
2423 else
2424 state.start = state.ptr;
2425
2426 }
2427
2428 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002429 if (i < state.endpos) {
2430 item = PySequence_GetSlice(string, i, state.endpos);
2431 if (!item)
2432 goto error;
2433 status = PyList_Append(list, item);
2434 Py_DECREF(item);
2435 if (status < 0)
2436 goto error;
2437 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002438
2439 state_fini(&state);
2440
Guido van Rossum4e173842001-12-07 04:25:10 +00002441 Py_DECREF(filter);
2442
Fredrik Lundhdac58492001-10-21 21:48:30 +00002443 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002444 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002445
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002446 if (!item)
2447 return NULL;
2448
2449 if (subn)
2450 return Py_BuildValue("Ni", item, n);
2451
2452 return item;
2453
2454error:
2455 Py_DECREF(list);
2456 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002457 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002458 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002459
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002460}
2461
2462static PyObject*
2463pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2464{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002465 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002467 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002468 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002469 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002470 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002471 return NULL;
2472
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002473 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002474}
2475
2476static PyObject*
2477pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2478{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002479 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002480 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002481 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002482 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002483 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002484 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002485 return NULL;
2486
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002487 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002488}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002489
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002490static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002491pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002492{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002493#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002494 PatternObject* copy;
2495 int offset;
2496
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002497 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2498 if (!copy)
2499 return NULL;
2500
2501 offset = offsetof(PatternObject, groups);
2502
2503 Py_XINCREF(self->groupindex);
2504 Py_XINCREF(self->indexgroup);
2505 Py_XINCREF(self->pattern);
2506
2507 memcpy((char*) copy + offset, (char*) self + offset,
2508 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002509 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002510
2511 return (PyObject*) copy;
2512#else
2513 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2514 return NULL;
2515#endif
2516}
2517
2518static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002519pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002520{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002521#ifdef USE_BUILTIN_COPY
2522 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002523
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002524 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002525 if (!copy)
2526 return NULL;
2527
2528 if (!deepcopy(&copy->groupindex, memo) ||
2529 !deepcopy(&copy->indexgroup, memo) ||
2530 !deepcopy(&copy->pattern, memo)) {
2531 Py_DECREF(copy);
2532 return NULL;
2533 }
2534
2535#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002536 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2537 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002538#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002539}
2540
Raymond Hettinger94478742004-09-24 04:31:19 +00002541PyDoc_STRVAR(pattern_match_doc,
2542"match(string[, pos[, endpos]]) --> match object or None.\n\
2543 Matches zero or more characters at the beginning of the string");
2544
2545PyDoc_STRVAR(pattern_search_doc,
2546"search(string[, pos[, endpos]]) --> match object or None.\n\
2547 Scan through string looking for a match, and return a corresponding\n\
2548 MatchObject instance. Return None if no position in the string matches.");
2549
2550PyDoc_STRVAR(pattern_split_doc,
2551"split(string[, maxsplit = 0]) --> list.\n\
2552 Split string by the occurrences of pattern.");
2553
2554PyDoc_STRVAR(pattern_findall_doc,
2555"findall(string[, pos[, endpos]]) --> list.\n\
2556 Return a list of all non-overlapping matches of pattern in string.");
2557
2558PyDoc_STRVAR(pattern_finditer_doc,
2559"finditer(string[, pos[, endpos]]) --> iterator.\n\
2560 Return an iterator over all non-overlapping matches for the \n\
2561 RE pattern in string. For each match, the iterator returns a\n\
2562 match object.");
2563
2564PyDoc_STRVAR(pattern_sub_doc,
2565"sub(repl, string[, count = 0]) --> newstring\n\
2566 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002567 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002568
2569PyDoc_STRVAR(pattern_subn_doc,
2570"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2571 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2572 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002573 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002574
2575PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2576
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002577static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002578 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002579 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002580 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002581 pattern_search_doc},
2582 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2583 pattern_sub_doc},
2584 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2585 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002586 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002587 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002588 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002589 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002590#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002591 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2592 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002593#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002594 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002595 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2596 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002597 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002598};
2599
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002600#define PAT_OFF(x) offsetof(PatternObject, x)
2601static PyMemberDef pattern_members[] = {
2602 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2603 {"flags", T_INT, PAT_OFF(flags), READONLY},
2604 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2605 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2606 {NULL} /* Sentinel */
2607};
Guido van Rossumb700df92000-03-31 14:59:30 +00002608
Neal Norwitz57c179c2006-03-22 07:18:02 +00002609static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002610 PyVarObject_HEAD_INIT(NULL, 0)
2611 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002612 sizeof(PatternObject), sizeof(SRE_CODE),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002613 (destructor)pattern_dealloc, /* tp_dealloc */
2614 0, /* tp_print */
2615 0, /* tp_getattr */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002616 0, /* tp_setattr */
2617 0, /* tp_compare */
2618 0, /* tp_repr */
2619 0, /* tp_as_number */
2620 0, /* tp_as_sequence */
2621 0, /* tp_as_mapping */
2622 0, /* tp_hash */
2623 0, /* tp_call */
2624 0, /* tp_str */
2625 0, /* tp_getattro */
2626 0, /* tp_setattro */
2627 0, /* tp_as_buffer */
Guido van Rossum3cf5b1e2006-07-27 21:53:35 +00002628 Py_TPFLAGS_DEFAULT, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002629 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002630 0, /* tp_traverse */
2631 0, /* tp_clear */
2632 0, /* tp_richcompare */
2633 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002634 0, /* tp_iter */
2635 0, /* tp_iternext */
2636 pattern_methods, /* tp_methods */
2637 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002638};
2639
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002640static PyObject *
2641_compile(PyObject* self_, PyObject* args)
2642{
2643 /* "compile" pattern descriptor to pattern object */
2644
2645 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002646 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002647
2648 PyObject* pattern;
2649 int flags = 0;
2650 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002651 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002652 PyObject* groupindex = NULL;
2653 PyObject* indexgroup = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002654 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002655 &PyList_Type, &code, &groups,
2656 &groupindex, &indexgroup))
2657 return NULL;
2658
2659 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002660 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002661 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2662 if (!self)
2663 return NULL;
2664
2665 self->codesize = n;
2666
2667 for (i = 0; i < n; i++) {
2668 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002669 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002670 self->code[i] = (SRE_CODE) value;
2671 if ((unsigned long) self->code[i] != value) {
2672 PyErr_SetString(PyExc_OverflowError,
2673 "regular expression code size limit exceeded");
2674 break;
2675 }
2676 }
2677
2678 if (PyErr_Occurred()) {
2679 PyObject_DEL(self);
2680 return NULL;
2681 }
2682
2683 Py_INCREF(pattern);
2684 self->pattern = pattern;
2685
2686 self->flags = flags;
2687
2688 self->groups = groups;
2689
2690 Py_XINCREF(groupindex);
2691 self->groupindex = groupindex;
2692
2693 Py_XINCREF(indexgroup);
2694 self->indexgroup = indexgroup;
2695
2696 self->weakreflist = NULL;
2697
2698 return (PyObject*) self;
2699}
2700
Guido van Rossumb700df92000-03-31 14:59:30 +00002701/* -------------------------------------------------------------------- */
2702/* match methods */
2703
2704static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002705match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002706{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002707 Py_XDECREF(self->regs);
2708 Py_XDECREF(self->string);
2709 Py_DECREF(self->pattern);
2710 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00002711}
2712
2713static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002714match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00002715{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002716 if (index < 0 || index >= self->groups) {
2717 /* raise IndexError if we were given a bad group number */
2718 PyErr_SetString(
2719 PyExc_IndexError,
2720 "no such group"
2721 );
2722 return NULL;
2723 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002724
Fredrik Lundh6f013982000-07-03 18:44:21 +00002725 index *= 2;
2726
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002727 if (self->string == Py_None || self->mark[index] < 0) {
2728 /* return default value if the string or group is undefined */
2729 Py_INCREF(def);
2730 return def;
2731 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002732
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002733 return PySequence_GetSlice(
2734 self->string, self->mark[index], self->mark[index+1]
2735 );
Guido van Rossumb700df92000-03-31 14:59:30 +00002736}
2737
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002738static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002739match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002740{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002741 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002742
Guido van Rossumddefaf32007-01-14 03:31:43 +00002743 if (index == NULL)
2744 /* Default value */
2745 return 0;
2746
Christian Heimes217cfd12007-12-02 14:31:20 +00002747 if (PyLong_Check(index))
2748 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002749
Fredrik Lundh6f013982000-07-03 18:44:21 +00002750 i = -1;
2751
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002752 if (self->pattern->groupindex) {
2753 index = PyObject_GetItem(self->pattern->groupindex, index);
2754 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00002755 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00002756 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00002757 Py_DECREF(index);
2758 } else
2759 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002760 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002761
2762 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002763}
2764
2765static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002766match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002767{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002768 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002769}
2770
2771static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002772match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002773{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002774 /* delegate to Python code */
2775 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002776 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002777 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002778 );
2779}
2780
2781static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002782match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002783{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002784 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002785 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002786
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002787 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002788
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002789 switch (size) {
2790 case 0:
2791 result = match_getslice(self, Py_False, Py_None);
2792 break;
2793 case 1:
2794 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2795 break;
2796 default:
2797 /* fetch multiple items */
2798 result = PyTuple_New(size);
2799 if (!result)
2800 return NULL;
2801 for (i = 0; i < size; i++) {
2802 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002803 self, PyTuple_GET_ITEM(args, i), Py_None
2804 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002805 if (!item) {
2806 Py_DECREF(result);
2807 return NULL;
2808 }
2809 PyTuple_SET_ITEM(result, i, item);
2810 }
2811 break;
2812 }
2813 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002814}
2815
2816static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002817match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002818{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002819 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002820 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002821
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002822 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002823 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002824 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002825 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002826
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002827 result = PyTuple_New(self->groups-1);
2828 if (!result)
2829 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002830
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002831 for (index = 1; index < self->groups; index++) {
2832 PyObject* item;
2833 item = match_getslice_by_index(self, index, def);
2834 if (!item) {
2835 Py_DECREF(result);
2836 return NULL;
2837 }
2838 PyTuple_SET_ITEM(result, index-1, item);
2839 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002840
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002841 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002842}
2843
2844static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002845match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002846{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002847 PyObject* result;
2848 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002849 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002850
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002851 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002852 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002853 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002854 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002855
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002856 result = PyDict_New();
2857 if (!result || !self->pattern->groupindex)
2858 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002859
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002860 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002861 if (!keys)
2862 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002863
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002864 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002865 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002866 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002867 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002868 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002869 if (!key)
2870 goto failed;
2871 value = match_getslice(self, key, def);
2872 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002873 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002874 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002875 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002876 status = PyDict_SetItem(result, key, value);
2877 Py_DECREF(value);
2878 if (status < 0)
2879 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002880 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002881
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002882 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002883
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002884 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002885
2886failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00002887 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002888 Py_DECREF(result);
2889 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002890}
2891
2892static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002893match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002894{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002895 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002896
Guido van Rossumddefaf32007-01-14 03:31:43 +00002897 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002898 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002899 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002900
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002901 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002902
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002903 if (index < 0 || index >= self->groups) {
2904 PyErr_SetString(
2905 PyExc_IndexError,
2906 "no such group"
2907 );
2908 return NULL;
2909 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002910
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002911 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002912 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002913}
2914
2915static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002916match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002917{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002918 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002919
Guido van Rossumddefaf32007-01-14 03:31:43 +00002920 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002921 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002922 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002923
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002924 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002925
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002926 if (index < 0 || index >= self->groups) {
2927 PyErr_SetString(
2928 PyExc_IndexError,
2929 "no such group"
2930 );
2931 return NULL;
2932 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002933
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002934 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002935 return Py_BuildValue("i", self->mark[index*2+1]);
2936}
2937
2938LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002939_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002940{
2941 PyObject* pair;
2942 PyObject* item;
2943
2944 pair = PyTuple_New(2);
2945 if (!pair)
2946 return NULL;
2947
Christian Heimes217cfd12007-12-02 14:31:20 +00002948 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002949 if (!item)
2950 goto error;
2951 PyTuple_SET_ITEM(pair, 0, item);
2952
Christian Heimes217cfd12007-12-02 14:31:20 +00002953 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002954 if (!item)
2955 goto error;
2956 PyTuple_SET_ITEM(pair, 1, item);
2957
2958 return pair;
2959
2960 error:
2961 Py_DECREF(pair);
2962 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002963}
2964
2965static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002966match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002967{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002968 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002969
Guido van Rossumddefaf32007-01-14 03:31:43 +00002970 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002971 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002972 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002973
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002974 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002975
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002976 if (index < 0 || index >= self->groups) {
2977 PyErr_SetString(
2978 PyExc_IndexError,
2979 "no such group"
2980 );
2981 return NULL;
2982 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002983
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002984 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002985 return _pair(self->mark[index*2], self->mark[index*2+1]);
2986}
2987
2988static PyObject*
2989match_regs(MatchObject* self)
2990{
2991 PyObject* regs;
2992 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002993 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002994
2995 regs = PyTuple_New(self->groups);
2996 if (!regs)
2997 return NULL;
2998
2999 for (index = 0; index < self->groups; index++) {
3000 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3001 if (!item) {
3002 Py_DECREF(regs);
3003 return NULL;
3004 }
3005 PyTuple_SET_ITEM(regs, index, item);
3006 }
3007
3008 Py_INCREF(regs);
3009 self->regs = regs;
3010
3011 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003012}
3013
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003014static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003015match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003016{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003017#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003018 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003019 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003020
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003021 slots = 2 * (self->pattern->groups+1);
3022
3023 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3024 if (!copy)
3025 return NULL;
3026
3027 /* this value a constant, but any compiler should be able to
3028 figure that out all by itself */
3029 offset = offsetof(MatchObject, string);
3030
3031 Py_XINCREF(self->pattern);
3032 Py_XINCREF(self->string);
3033 Py_XINCREF(self->regs);
3034
3035 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003036 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003037
3038 return (PyObject*) copy;
3039#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003040 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003041 return NULL;
3042#endif
3043}
3044
3045static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003046match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003047{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003048#ifdef USE_BUILTIN_COPY
3049 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003050
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003051 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003052 if (!copy)
3053 return NULL;
3054
3055 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3056 !deepcopy(&copy->string, memo) ||
3057 !deepcopy(&copy->regs, memo)) {
3058 Py_DECREF(copy);
3059 return NULL;
3060 }
3061
3062#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003063 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3064 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003065#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003066}
3067
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003068static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003069 {"group", (PyCFunction) match_group, METH_VARARGS},
3070 {"start", (PyCFunction) match_start, METH_VARARGS},
3071 {"end", (PyCFunction) match_end, METH_VARARGS},
3072 {"span", (PyCFunction) match_span, METH_VARARGS},
3073 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3074 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003075 {"expand", (PyCFunction) match_expand, METH_O},
3076 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3077 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003078 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003079};
3080
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003081static PyObject *
3082match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003083{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003084 if (self->lastindex >= 0)
3085 return Py_BuildValue("i", self->lastindex);
3086 Py_INCREF(Py_None);
3087 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003088}
3089
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003090static PyObject *
3091match_lastgroup_get(MatchObject *self)
3092{
3093 if (self->pattern->indexgroup && self->lastindex >= 0) {
3094 PyObject* result = PySequence_GetItem(
3095 self->pattern->indexgroup, self->lastindex
3096 );
3097 if (result)
3098 return result;
3099 PyErr_Clear();
3100 }
3101 Py_INCREF(Py_None);
3102 return Py_None;
3103}
3104
3105static PyObject *
3106match_regs_get(MatchObject *self)
3107{
3108 if (self->regs) {
3109 Py_INCREF(self->regs);
3110 return self->regs;
3111 } else
3112 return match_regs(self);
3113}
3114
3115static PyGetSetDef match_getset[] = {
3116 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3117 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3118 {"regs", (getter)match_regs_get, (setter)NULL},
3119 {NULL}
3120};
3121
3122#define MATCH_OFF(x) offsetof(MatchObject, x)
3123static PyMemberDef match_members[] = {
3124 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3125 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3126 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3127 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3128 {NULL}
3129};
3130
Guido van Rossumb700df92000-03-31 14:59:30 +00003131/* FIXME: implement setattr("string", None) as a special case (to
3132 detach the associated string, if any */
3133
Neal Norwitz57c179c2006-03-22 07:18:02 +00003134static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003135 PyVarObject_HEAD_INIT(NULL,0)
3136 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003137 sizeof(MatchObject), sizeof(Py_ssize_t),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003138 (destructor)match_dealloc, /* tp_dealloc */
3139 0, /* tp_print */
3140 0, /* tp_getattr */
3141 0, /* tp_setattr */
3142 0, /* tp_compare */
3143 0, /* tp_repr */
3144 0, /* tp_as_number */
3145 0, /* tp_as_sequence */
3146 0, /* tp_as_mapping */
3147 0, /* tp_hash */
3148 0, /* tp_call */
3149 0, /* tp_str */
3150 0, /* tp_getattro */
3151 0, /* tp_setattro */
3152 0, /* tp_as_buffer */
3153 Py_TPFLAGS_DEFAULT, /* tp_flags */
3154 0, /* tp_doc */
3155 0, /* tp_traverse */
3156 0, /* tp_clear */
3157 0, /* tp_richcompare */
3158 0, /* tp_weaklistoffset */
3159 0, /* tp_iter */
3160 0, /* tp_iternext */
3161 match_methods, /* tp_methods */
3162 match_members, /* tp_members */
3163 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003164};
3165
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003166static PyObject*
3167pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3168{
3169 /* create match object (from state object) */
3170
3171 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003172 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003173 char* base;
3174 int n;
3175
3176 if (status > 0) {
3177
3178 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003179 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003180 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3181 2*(pattern->groups+1));
3182 if (!match)
3183 return NULL;
3184
3185 Py_INCREF(pattern);
3186 match->pattern = pattern;
3187
3188 Py_INCREF(state->string);
3189 match->string = state->string;
3190
3191 match->regs = NULL;
3192 match->groups = pattern->groups+1;
3193
3194 /* fill in group slices */
3195
3196 base = (char*) state->beginning;
3197 n = state->charsize;
3198
3199 match->mark[0] = ((char*) state->start - base) / n;
3200 match->mark[1] = ((char*) state->ptr - base) / n;
3201
3202 for (i = j = 0; i < pattern->groups; i++, j+=2)
3203 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3204 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3205 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3206 } else
3207 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3208
3209 match->pos = state->pos;
3210 match->endpos = state->endpos;
3211
3212 match->lastindex = state->lastindex;
3213
3214 return (PyObject*) match;
3215
3216 } else if (status == 0) {
3217
3218 /* no match */
3219 Py_INCREF(Py_None);
3220 return Py_None;
3221
3222 }
3223
3224 /* internal error */
3225 pattern_error(status);
3226 return NULL;
3227}
3228
3229
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003230/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003231/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003232
3233static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003234scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003235{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003236 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003237 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003238 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003239}
3240
3241static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003242scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003243{
3244 SRE_STATE* state = &self->state;
3245 PyObject* match;
3246 int status;
3247
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003248 state_reset(state);
3249
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003250 state->ptr = state->start;
3251
3252 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003253 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003254 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003255#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003256 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003257#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003258 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003259 if (PyErr_Occurred())
3260 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003261
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003262 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003263 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003264
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003265 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003266 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003267 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003268 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003269
3270 return match;
3271}
3272
3273
3274static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003275scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003276{
3277 SRE_STATE* state = &self->state;
3278 PyObject* match;
3279 int status;
3280
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003281 state_reset(state);
3282
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003283 state->ptr = state->start;
3284
3285 if (state->charsize == 1) {
3286 status = sre_search(state, PatternObject_GetCode(self->pattern));
3287 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003288#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003289 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003290#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003291 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003292 if (PyErr_Occurred())
3293 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003294
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003295 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003296 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003297
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003298 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003299 state->start = (void*) ((char*) state->ptr + state->charsize);
3300 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003301 state->start = state->ptr;
3302
3303 return match;
3304}
3305
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003306static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003307 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3308 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003309 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003310};
3311
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003312#define SCAN_OFF(x) offsetof(ScannerObject, x)
3313static PyMemberDef scanner_members[] = {
3314 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3315 {NULL} /* Sentinel */
3316};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003317
Neal Norwitz57c179c2006-03-22 07:18:02 +00003318static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003319 PyVarObject_HEAD_INIT(NULL, 0)
3320 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003321 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003322 (destructor)scanner_dealloc,/* tp_dealloc */
3323 0, /* tp_print */
3324 0, /* tp_getattr */
3325 0, /* tp_setattr */
3326 0, /* tp_compare */
3327 0, /* tp_repr */
3328 0, /* tp_as_number */
3329 0, /* tp_as_sequence */
3330 0, /* tp_as_mapping */
3331 0, /* tp_hash */
3332 0, /* tp_call */
3333 0, /* tp_str */
3334 0, /* tp_getattro */
3335 0, /* tp_setattro */
3336 0, /* tp_as_buffer */
3337 Py_TPFLAGS_DEFAULT, /* tp_flags */
3338 0, /* tp_doc */
3339 0, /* tp_traverse */
3340 0, /* tp_clear */
3341 0, /* tp_richcompare */
3342 0, /* tp_weaklistoffset */
3343 0, /* tp_iter */
3344 0, /* tp_iternext */
3345 scanner_methods, /* tp_methods */
3346 scanner_members, /* tp_members */
3347 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003348};
3349
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003350static PyObject*
3351pattern_scanner(PatternObject* pattern, PyObject* args)
3352{
3353 /* create search state object */
3354
3355 ScannerObject* self;
3356
3357 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003358 Py_ssize_t start = 0;
3359 Py_ssize_t end = PY_SSIZE_T_MAX;
3360 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003361 return NULL;
3362
3363 /* create scanner object */
3364 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3365 if (!self)
3366 return NULL;
3367
3368 string = state_init(&self->state, pattern, string, start, end);
3369 if (!string) {
3370 PyObject_DEL(self);
3371 return NULL;
3372 }
3373
3374 Py_INCREF(pattern);
3375 self->pattern = (PyObject*) pattern;
3376
3377 return (PyObject*) self;
3378}
3379
Guido van Rossumb700df92000-03-31 14:59:30 +00003380static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003381 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003382 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003383 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003384 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003385};
3386
Martin v. Löwis1a214512008-06-11 05:26:20 +00003387static struct PyModuleDef sremodule = {
3388 PyModuleDef_HEAD_INIT,
3389 "_" SRE_MODULE,
3390 NULL,
3391 -1,
3392 _functions,
3393 NULL,
3394 NULL,
3395 NULL,
3396 NULL
3397};
3398
3399PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003400{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003401 PyObject* m;
3402 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003403 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003404
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003405 /* Initialize object types */
3406 if (PyType_Ready(&Pattern_Type) < 0)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003407 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003408 if (PyType_Ready(&Match_Type) < 0)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003409 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003410 if (PyType_Ready(&Scanner_Type) < 0)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003411 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003412
Martin v. Löwis1a214512008-06-11 05:26:20 +00003413 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003414 if (m == NULL)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003415 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003416 d = PyModule_GetDict(m);
3417
Christian Heimes217cfd12007-12-02 14:31:20 +00003418 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003419 if (x) {
3420 PyDict_SetItemString(d, "MAGIC", x);
3421 Py_DECREF(x);
3422 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003423
Christian Heimes217cfd12007-12-02 14:31:20 +00003424 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003425 if (x) {
3426 PyDict_SetItemString(d, "CODESIZE", x);
3427 Py_DECREF(x);
3428 }
3429
Neal Norwitzfe537132007-08-26 03:55:15 +00003430 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003431 if (x) {
3432 PyDict_SetItemString(d, "copyright", x);
3433 Py_DECREF(x);
3434 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003435 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003436}
3437
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003438#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003439
3440/* vim:ts=4:sw=4:et
3441*/