blob: 45b92f319d6c5f94ce7ae5bfa1dab45854a313aa [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000071#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000072
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000073/* enables copy/deepcopy handling (work in progress) */
74#undef USE_BUILTIN_COPY
75
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000076#if PY_VERSION_HEX < 0x01060000
77#define PyObject_DEL(op) PyMem_DEL((op))
78#endif
79
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000080/* -------------------------------------------------------------------- */
81
Fredrik Lundh80946112000-06-29 18:03:25 +000082#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000083#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000084#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000085/* fastest possible local call under MSVC */
86#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000087#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000088#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089#else
90#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000091#endif
92
93/* error codes */
94#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000095#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000096#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000097#define SRE_ERROR_MEMORY -9 /* out of memory */
Christian Heimes2380ac72008-01-09 00:17:24 +000098#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000099
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000100#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000102#else
103#define TRACE(v)
104#endif
105
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000106/* -------------------------------------------------------------------- */
107/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000108
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000109/* default character predicates (run sre_chars.py to regenerate tables) */
110
111#define SRE_DIGIT_MASK 1
112#define SRE_SPACE_MASK 2
113#define SRE_LINEBREAK_MASK 4
114#define SRE_ALNUM_MASK 8
115#define SRE_WORD_MASK 16
116
Fredrik Lundh21009b92001-09-18 18:47:09 +0000117/* FIXME: this assumes ASCII. create tables in init_sre() instead */
118
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000119static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1202, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
12225, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12324, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1240, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12524, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
126
Fredrik Lundhb389df32000-06-29 12:48:37 +0000127static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d582000-06-29 08:58:44 +000012810, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
12927, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
13044, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
13161, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
132108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
133122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
134106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
135120, 121, 122, 123, 124, 125, 126, 127 };
136
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000137#define SRE_IS_DIGIT(ch)\
138 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
139#define SRE_IS_SPACE(ch)\
140 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
141#define SRE_IS_LINEBREAK(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
143#define SRE_IS_ALNUM(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
145#define SRE_IS_WORD(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000147
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000148static unsigned int sre_lower(unsigned int ch)
149{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000150 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000151}
152
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000153/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000154/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
155 * warnings when c's type supports only numbers < N+1 */
156#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
157#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000158#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000159#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000160#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
161
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000162static unsigned int sre_lower_locale(unsigned int ch)
163{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000164 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000165}
166
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000167/* unicode-specific character predicates */
168
169#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000170
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000171#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
172#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
173#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000174#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000175#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000176
177static unsigned int sre_lower_unicode(unsigned int ch)
178{
179 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
180}
181
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000182#endif
183
Guido van Rossumb700df92000-03-31 14:59:30 +0000184LOCAL(int)
185sre_category(SRE_CODE category, unsigned int ch)
186{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000187 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000188
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000189 case SRE_CATEGORY_DIGIT:
190 return SRE_IS_DIGIT(ch);
191 case SRE_CATEGORY_NOT_DIGIT:
192 return !SRE_IS_DIGIT(ch);
193 case SRE_CATEGORY_SPACE:
194 return SRE_IS_SPACE(ch);
195 case SRE_CATEGORY_NOT_SPACE:
196 return !SRE_IS_SPACE(ch);
197 case SRE_CATEGORY_WORD:
198 return SRE_IS_WORD(ch);
199 case SRE_CATEGORY_NOT_WORD:
200 return !SRE_IS_WORD(ch);
201 case SRE_CATEGORY_LINEBREAK:
202 return SRE_IS_LINEBREAK(ch);
203 case SRE_CATEGORY_NOT_LINEBREAK:
204 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000205
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000206 case SRE_CATEGORY_LOC_WORD:
207 return SRE_LOC_IS_WORD(ch);
208 case SRE_CATEGORY_LOC_NOT_WORD:
209 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000210
211#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000212 case SRE_CATEGORY_UNI_DIGIT:
213 return SRE_UNI_IS_DIGIT(ch);
214 case SRE_CATEGORY_UNI_NOT_DIGIT:
215 return !SRE_UNI_IS_DIGIT(ch);
216 case SRE_CATEGORY_UNI_SPACE:
217 return SRE_UNI_IS_SPACE(ch);
218 case SRE_CATEGORY_UNI_NOT_SPACE:
219 return !SRE_UNI_IS_SPACE(ch);
220 case SRE_CATEGORY_UNI_WORD:
221 return SRE_UNI_IS_WORD(ch);
222 case SRE_CATEGORY_UNI_NOT_WORD:
223 return !SRE_UNI_IS_WORD(ch);
224 case SRE_CATEGORY_UNI_LINEBREAK:
225 return SRE_UNI_IS_LINEBREAK(ch);
226 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
227 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000228#else
229 case SRE_CATEGORY_UNI_DIGIT:
230 return SRE_IS_DIGIT(ch);
231 case SRE_CATEGORY_UNI_NOT_DIGIT:
232 return !SRE_IS_DIGIT(ch);
233 case SRE_CATEGORY_UNI_SPACE:
234 return SRE_IS_SPACE(ch);
235 case SRE_CATEGORY_UNI_NOT_SPACE:
236 return !SRE_IS_SPACE(ch);
237 case SRE_CATEGORY_UNI_WORD:
238 return SRE_LOC_IS_WORD(ch);
239 case SRE_CATEGORY_UNI_NOT_WORD:
240 return !SRE_LOC_IS_WORD(ch);
241 case SRE_CATEGORY_UNI_LINEBREAK:
242 return SRE_IS_LINEBREAK(ch);
243 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
244 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000245#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000246 }
247 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000248}
249
250/* helpers */
251
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000252static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000253data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000254{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000255 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000256 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000257 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000258 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000259 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000260}
261
262static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000263data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000264{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000265 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000266 minsize = state->data_stack_base+size;
267 cursize = state->data_stack_size;
268 if (cursize < minsize) {
269 void* stack;
270 cursize = minsize+minsize/4+1024;
271 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000273 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000274 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000275 return SRE_ERROR_MEMORY;
276 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000278 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000279 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000280 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000281}
282
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000283/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000284
285#define SRE_CHAR unsigned char
286#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000287#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000288#define SRE_CHARSET sre_charset
289#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000290#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000291#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000292#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000293#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000294
295#if defined(HAVE_UNICODE)
296
Guido van Rossumb700df92000-03-31 14:59:30 +0000297#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000298#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000299#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000300
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000301#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000302#undef SRE_SEARCH
303#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000304#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000305#undef SRE_INFO
306#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000307#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000308#undef SRE_AT
309#undef SRE_CHAR
310
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000311/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000312
313#define SRE_CHAR Py_UNICODE
314#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000315#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000316#define SRE_CHARSET sre_ucharset
317#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000318#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000319#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000320#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000321#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000322#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000323
324#endif /* SRE_RECURSIVE */
325
326/* -------------------------------------------------------------------- */
327/* String matching engine */
328
329/* the following section is compiled twice, with different character
330 settings */
331
332LOCAL(int)
333SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
334{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000335 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000336
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000337 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000338
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000339 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000340
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000341 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000342 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000343 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000345 case SRE_AT_BEGINNING_LINE:
346 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000347 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000349 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000350 return (((void*) (ptr+1) == state->end &&
351 SRE_IS_LINEBREAK((int) ptr[0])) ||
352 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000353
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000354 case SRE_AT_END_LINE:
355 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000356 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000357
Fredrik Lundh770617b2001-01-14 15:06:11 +0000358 case SRE_AT_END_STRING:
359 return ((void*) ptr == state->end);
360
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000361 case SRE_AT_BOUNDARY:
362 if (state->beginning == state->end)
363 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000364 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000365 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000367 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000368 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000369
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000370 case SRE_AT_NON_BOUNDARY:
371 if (state->beginning == state->end)
372 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000373 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000374 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000376 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000378
379 case SRE_AT_LOC_BOUNDARY:
380 if (state->beginning == state->end)
381 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000382 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000383 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000384 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000385 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000386 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000387
388 case SRE_AT_LOC_NON_BOUNDARY:
389 if (state->beginning == state->end)
390 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000391 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000392 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000393 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000394 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000395 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000396
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000397#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000398 case SRE_AT_UNI_BOUNDARY:
399 if (state->beginning == state->end)
400 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000401 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000402 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000403 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000404 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000405 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000406
407 case SRE_AT_UNI_NON_BOUNDARY:
408 if (state->beginning == state->end)
409 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000410 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000411 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000412 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000413 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000414 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000415#endif
416
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000417 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000418
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000419 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000420}
421
422LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000423SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000424{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000425 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000426
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000427 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000428
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000429 for (;;) {
430 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000431
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000432 case SRE_OP_FAILURE:
433 return !ok;
434
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000435 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000436 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000437 if (ch == set[0])
438 return ok;
439 set++;
440 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000441
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000442 case SRE_OP_CATEGORY:
443 /* <CATEGORY> <code> */
444 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000445 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000446 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000447 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000448
Fredrik Lundh3562f112000-07-02 12:00:07 +0000449 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000450 if (sizeof(SRE_CODE) == 2) {
451 /* <CHARSET> <bitmap> (16 bits per code word) */
452 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
453 return ok;
454 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000455 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000456 else {
457 /* <CHARSET> <bitmap> (32 bits per code word) */
458 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
459 return ok;
460 set += 8;
461 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000462 break;
463
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000464 case SRE_OP_RANGE:
465 /* <RANGE> <lower> <upper> */
466 if (set[0] <= ch && ch <= set[1])
467 return ok;
468 set += 2;
469 break;
470
471 case SRE_OP_NEGATE:
472 ok = !ok;
473 break;
474
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000475 case SRE_OP_BIGCHARSET:
476 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
477 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000478 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000479 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000480
481 if (sizeof(SRE_CODE) == 2) {
482 block = ((unsigned char*)set)[ch >> 8];
483 set += 128;
484 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
485 return ok;
486 set += count*16;
487 }
488 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000489 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
490 * warnings when c's type supports only numbers < N+1 */
491 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000492 block = ((unsigned char*)set)[ch >> 8];
493 else
494 block = -1;
495 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000496 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000497 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
498 return ok;
499 set += count*8;
500 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000501 break;
502 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000503
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000504 default:
505 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000506 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000507 return 0;
508 }
509 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000510}
511
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000512LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000514LOCAL(Py_ssize_t)
515SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000516{
517 SRE_CODE chr;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000518 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
519 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000520 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000521
522 /* adjust end */
523 if (maxcount < end - ptr && maxcount != 65535)
524 end = ptr + maxcount;
525
526 switch (pattern[0]) {
527
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000528 case SRE_OP_IN:
529 /* repeated set */
530 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
531 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
532 ptr++;
533 break;
534
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000535 case SRE_OP_ANY:
536 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000537 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000538 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
539 ptr++;
540 break;
541
542 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000543 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000544 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000545 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000546 ptr = end;
547 break;
548
549 case SRE_OP_LITERAL:
550 /* repeated literal */
551 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000552 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000553 while (ptr < end && (SRE_CODE) *ptr == chr)
554 ptr++;
555 break;
556
557 case SRE_OP_LITERAL_IGNORE:
558 /* repeated literal */
559 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000560 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000561 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
562 ptr++;
563 break;
564
565 case SRE_OP_NOT_LITERAL:
566 /* repeated non-literal */
567 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000568 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000569 while (ptr < end && (SRE_CODE) *ptr != chr)
570 ptr++;
571 break;
Tim Peters3d563502006-01-21 02:47:53 +0000572
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000573 case SRE_OP_NOT_LITERAL_IGNORE:
574 /* repeated non-literal */
575 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000576 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000577 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
578 ptr++;
579 break;
580
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581 default:
582 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000583 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000584 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000585 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000586 if (i < 0)
587 return i;
588 if (!i)
589 break;
590 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000591 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
592 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000593 return (SRE_CHAR*) state->ptr - ptr;
594 }
595
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000596 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000597 return ptr - (SRE_CHAR*) state->ptr;
598}
599
Fredrik Lundh33accc12000-08-27 20:59:47 +0000600#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000601LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000602SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
603{
604 /* check if an SRE_OP_INFO block matches at the current position.
605 returns the number of SRE_CODE objects to skip if successful, 0
606 if no match */
607
608 SRE_CHAR* end = state->end;
609 SRE_CHAR* ptr = state->ptr;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000610 Py_ssize_t i;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000611
612 /* check minimal length */
613 if (pattern[3] && (end - ptr) < pattern[3])
614 return 0;
615
616 /* check known prefix */
617 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
618 /* <length> <skip> <prefix data> <overlap data> */
619 for (i = 0; i < pattern[5]; i++)
620 if ((SRE_CODE) ptr[i] != pattern[7 + i])
621 return 0;
622 return pattern[0] + 2 * pattern[6];
623 }
624 return pattern[0];
625}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000626#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000627
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000628/* The macros below should be used to protect recursive SRE_MATCH()
629 * calls that *failed* and do *not* return immediately (IOW, those
630 * that will backtrack). Explaining:
631 *
632 * - Recursive SRE_MATCH() returned true: that's usually a success
633 * (besides atypical cases like ASSERT_NOT), therefore there's no
634 * reason to restore lastmark;
635 *
636 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
637 * is returning to the caller: If the current SRE_MATCH() is the
638 * top function of the recursion, returning false will be a matching
639 * failure, and it doesn't matter where lastmark is pointing to.
640 * If it's *not* the top function, it will be a recursive SRE_MATCH()
641 * failure by itself, and the calling SRE_MATCH() will have to deal
642 * with the failure by the same rules explained here (it will restore
643 * lastmark by itself if necessary);
644 *
645 * - Recursive SRE_MATCH() returned false, and will continue the
646 * outside 'for' loop: must be protected when breaking, since the next
647 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000648 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000649 * - Recursive SRE_MATCH() returned false, and will be called again
650 * inside a local for/while loop: must be protected between each
651 * loop iteration, since the recursive SRE_MATCH() could do anything,
652 * and could potentially depend on lastmark.
653 *
654 * For more information, check the discussion at SF patch #712900.
655 */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000656#define LASTMARK_SAVE() \
657 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000658 ctx->lastmark = state->lastmark; \
659 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000660 } while (0)
661#define LASTMARK_RESTORE() \
662 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000663 state->lastmark = ctx->lastmark; \
664 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000665 } while (0)
666
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000667#define RETURN_ERROR(i) do { return i; } while(0)
668#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
669#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
670
671#define RETURN_ON_ERROR(i) \
672 do { if (i < 0) RETURN_ERROR(i); } while (0)
673#define RETURN_ON_SUCCESS(i) \
674 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
675#define RETURN_ON_FAILURE(i) \
676 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
677
678#define SFY(x) #x
679
680#define DATA_STACK_ALLOC(state, type, ptr) \
681do { \
682 alloc_pos = state->data_stack_base; \
683 TRACE(("allocating %s in %d (%d)\n", \
684 SFY(type), alloc_pos, sizeof(type))); \
685 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
686 int j = data_stack_grow(state, sizeof(type)); \
687 if (j < 0) return j; \
688 if (ctx_pos != -1) \
689 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
690 } \
691 ptr = (type*)(state->data_stack+alloc_pos); \
692 state->data_stack_base += sizeof(type); \
693} while (0)
694
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000695#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
696do { \
697 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
698 ptr = (type*)(state->data_stack+pos); \
699} while (0)
700
701#define DATA_STACK_PUSH(state, data, size) \
702do { \
703 TRACE(("copy data in %p to %d (%d)\n", \
704 data, state->data_stack_base, size)); \
705 if (state->data_stack_size < state->data_stack_base+size) { \
706 int j = data_stack_grow(state, size); \
707 if (j < 0) return j; \
708 if (ctx_pos != -1) \
709 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
710 } \
711 memcpy(state->data_stack+state->data_stack_base, data, size); \
712 state->data_stack_base += size; \
713} while (0)
714
715#define DATA_STACK_POP(state, data, size, discard) \
716do { \
717 TRACE(("copy data to %p from %d (%d)\n", \
718 data, state->data_stack_base-size, size)); \
719 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
720 if (discard) \
721 state->data_stack_base -= size; \
722} while (0)
723
724#define DATA_STACK_POP_DISCARD(state, size) \
725do { \
726 TRACE(("discard data from %d (%d)\n", \
727 state->data_stack_base-size, size)); \
728 state->data_stack_base -= size; \
729} while(0)
730
731#define DATA_PUSH(x) \
732 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
733#define DATA_POP(x) \
734 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000735#define DATA_POP_DISCARD(x) \
736 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
737#define DATA_ALLOC(t,p) \
738 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000739#define DATA_LOOKUP_AT(t,p,pos) \
740 DATA_STACK_LOOKUP_AT(state,t,p,pos)
741
742#define MARK_PUSH(lastmark) \
743 do if (lastmark > 0) { \
744 i = lastmark; /* ctx->lastmark may change if reallocated */ \
745 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
746 } while (0)
747#define MARK_POP(lastmark) \
748 do if (lastmark > 0) { \
749 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
750 } while (0)
751#define MARK_POP_KEEP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
754 } while (0)
755#define MARK_POP_DISCARD(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
758 } while (0)
759
760#define JUMP_NONE 0
761#define JUMP_MAX_UNTIL_1 1
762#define JUMP_MAX_UNTIL_2 2
763#define JUMP_MAX_UNTIL_3 3
764#define JUMP_MIN_UNTIL_1 4
765#define JUMP_MIN_UNTIL_2 5
766#define JUMP_MIN_UNTIL_3 6
767#define JUMP_REPEAT 7
768#define JUMP_REPEAT_ONE_1 8
769#define JUMP_REPEAT_ONE_2 9
770#define JUMP_MIN_REPEAT_ONE 10
771#define JUMP_BRANCH 11
772#define JUMP_ASSERT 12
773#define JUMP_ASSERT_NOT 13
774
775#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
776 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
777 nextctx->last_ctx_pos = ctx_pos; \
778 nextctx->jump = jumpvalue; \
779 nextctx->pattern = nextpattern; \
780 ctx_pos = alloc_pos; \
781 ctx = nextctx; \
782 goto entrance; \
783 jumplabel: \
784 while (0) /* gcc doesn't like labels at end of scopes */ \
785
786typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000787 Py_ssize_t last_ctx_pos;
788 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000789 SRE_CHAR* ptr;
790 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000791 Py_ssize_t count;
792 Py_ssize_t lastmark;
793 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000794 union {
795 SRE_CODE chr;
796 SRE_REPEAT* rep;
797 } u;
798} SRE_MATCH_CONTEXT;
799
800/* check if string matches the given pattern. returns <0 for
801 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000802LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000803SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000804{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000805 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000806 Py_ssize_t alloc_pos, ctx_pos = -1;
807 Py_ssize_t i, ret = 0;
808 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000809 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000810
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000811 SRE_MATCH_CONTEXT* ctx;
812 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000813
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000814 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000815
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000816 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
817 ctx->last_ctx_pos = -1;
818 ctx->jump = JUMP_NONE;
819 ctx->pattern = pattern;
820 ctx_pos = alloc_pos;
821
822entrance:
823
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000824 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000825
826 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000827 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000828 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000830 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 (end - ctx->ptr), ctx->pattern[3]));
832 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000833 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000834 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000835 }
836
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000837 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000838 ++sigcount;
839 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
840 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000841
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000842 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000843
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000844 case SRE_OP_MARK:
845 /* set mark */
846 /* <MARK> <gid> */
847 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
848 ctx->ptr, ctx->pattern[0]));
849 i = ctx->pattern[0];
850 if (i & 1)
851 state->lastindex = i/2 + 1;
852 if (i > state->lastmark) {
853 /* state->lastmark is the highest valid index in the
854 state->mark array. If it is increased by more than 1,
855 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000856 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000857 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000858 while (j < i)
859 state->mark[j++] = NULL;
860 state->lastmark = i;
861 }
862 state->mark[i] = ctx->ptr;
863 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000864 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000865
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000866 case SRE_OP_LITERAL:
867 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000868 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000869 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
870 ctx->ptr, *ctx->pattern));
871 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
872 RETURN_FAILURE;
873 ctx->pattern++;
874 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000875 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000876
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000877 case SRE_OP_NOT_LITERAL:
878 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000879 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000880 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
881 ctx->ptr, *ctx->pattern));
882 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
883 RETURN_FAILURE;
884 ctx->pattern++;
885 ctx->ptr++;
886 break;
887
888 case SRE_OP_SUCCESS:
889 /* end of pattern */
890 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
891 state->ptr = ctx->ptr;
892 RETURN_SUCCESS;
893
894 case SRE_OP_AT:
895 /* match at given position */
896 /* <AT> <code> */
897 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
898 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
899 RETURN_FAILURE;
900 ctx->pattern++;
901 break;
902
903 case SRE_OP_CATEGORY:
904 /* match at given category */
905 /* <CATEGORY> <code> */
906 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
907 ctx->ptr, *ctx->pattern));
908 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
909 RETURN_FAILURE;
910 ctx->pattern++;
911 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000912 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000913
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000914 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000915 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000916 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000917 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
918 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
919 RETURN_FAILURE;
920 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000921 break;
922
923 case SRE_OP_ANY_ALL:
924 /* match anything */
925 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000926 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
927 if (ctx->ptr >= end)
928 RETURN_FAILURE;
929 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000930 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000931
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000932 case SRE_OP_IN:
933 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000934 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000935 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
936 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
937 RETURN_FAILURE;
938 ctx->pattern += ctx->pattern[0];
939 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000940 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000941
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000942 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000943 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
944 ctx->pattern, ctx->ptr, ctx->pattern[0]));
945 if (ctx->ptr >= end ||
946 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
947 RETURN_FAILURE;
948 ctx->pattern++;
949 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000950 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000951
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000952 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000953 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
954 ctx->pattern, ctx->ptr, *ctx->pattern));
955 if (ctx->ptr >= end ||
956 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
957 RETURN_FAILURE;
958 ctx->pattern++;
959 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000960 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000961
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000962 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000963 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
964 if (ctx->ptr >= end
965 || !SRE_CHARSET(ctx->pattern+1,
966 (SRE_CODE)state->lower(*ctx->ptr)))
967 RETURN_FAILURE;
968 ctx->pattern += ctx->pattern[0];
969 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000970 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000971
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000972 case SRE_OP_JUMP:
973 case SRE_OP_INFO:
974 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000975 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000976 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
977 ctx->ptr, ctx->pattern[0]));
978 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000979 break;
980
981 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000982 /* alternation */
983 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000984 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000985 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000986 ctx->u.rep = state->repeat;
987 if (ctx->u.rep)
988 MARK_PUSH(ctx->lastmark);
989 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
990 if (ctx->pattern[1] == SRE_OP_LITERAL &&
991 (ctx->ptr >= end ||
992 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000993 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000994 if (ctx->pattern[1] == SRE_OP_IN &&
995 (ctx->ptr >= end ||
996 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000997 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000998 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000999 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001000 if (ret) {
1001 if (ctx->u.rep)
1002 MARK_POP_DISCARD(ctx->lastmark);
1003 RETURN_ON_ERROR(ret);
1004 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001005 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001006 if (ctx->u.rep)
1007 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001008 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001009 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001010 if (ctx->u.rep)
1011 MARK_POP_DISCARD(ctx->lastmark);
1012 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001013
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001014 case SRE_OP_REPEAT_ONE:
1015 /* match repeated sequence (maximizing regexp) */
1016
1017 /* this operator only works if the repeated item is
1018 exactly one character wide, and we're not already
1019 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001020 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001021
1022 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1023
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1025 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 if (ctx->ptr + ctx->pattern[1] > end)
1028 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001029
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001030 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001032 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1033 RETURN_ON_ERROR(ret);
1034 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1035 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037
1038 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001039 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040 string. check if the rest of the pattern matches,
1041 and backtrack if not. */
1042
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001043 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001044 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001045
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001046 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001047 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001048 state->ptr = ctx->ptr;
1049 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001050 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001051
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001052 LASTMARK_SAVE();
1053
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055 /* tail starts with a literal. skip positions where
1056 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001057 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001058 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001059 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001060 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1061 ctx->ptr--;
1062 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001063 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001064 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001065 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001066 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001067 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1068 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001069 if (ret) {
1070 RETURN_ON_ERROR(ret);
1071 RETURN_SUCCESS;
1072 }
Tim Peters3d563502006-01-21 02:47:53 +00001073
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001074 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001075
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001076 ctx->ptr--;
1077 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001078 }
1079
1080 } else {
1081 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001082 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001083 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001084 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1085 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001086 if (ret) {
1087 RETURN_ON_ERROR(ret);
1088 RETURN_SUCCESS;
1089 }
1090 ctx->ptr--;
1091 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001092 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001093 }
1094 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001095 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001096
Guido van Rossum41c99e72003-04-14 17:59:34 +00001097 case SRE_OP_MIN_REPEAT_ONE:
1098 /* match repeated sequence (minimizing regexp) */
1099
1100 /* this operator only works if the repeated item is
1101 exactly one character wide, and we're not already
1102 collecting backtracking points. for other cases,
1103 use the MIN_REPEAT operator */
1104
1105 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1106
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001107 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1108 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 if (ctx->ptr + ctx->pattern[1] > end)
1111 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001112
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001113 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001114
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001115 if (ctx->pattern[1] == 0)
1116 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001117 else {
1118 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001119 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1120 RETURN_ON_ERROR(ret);
1121 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001122 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001123 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 RETURN_FAILURE;
1125 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001126 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 }
1129
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001130 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001132 state->ptr = ctx->ptr;
1133 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001134
1135 } else {
1136 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001137 LASTMARK_SAVE();
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001138 while ((Py_ssize_t)ctx->pattern[2] == 65535
1139 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001141 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1142 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001143 if (ret) {
1144 RETURN_ON_ERROR(ret);
1145 RETURN_SUCCESS;
1146 }
1147 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001148 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001149 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001150 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001151 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001152 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001153 assert(ret == 1);
1154 ctx->ptr++;
1155 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001156 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001157 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001158 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001159 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001160
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001161 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001162 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001163 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001165 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1166 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001167
1168 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001169 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001170 if (!ctx->u.rep) {
1171 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001172 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001173 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001174 ctx->u.rep->count = -1;
1175 ctx->u.rep->pattern = ctx->pattern;
1176 ctx->u.rep->prev = state->repeat;
1177 ctx->u.rep->last_ptr = NULL;
1178 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001179
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001180 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001181 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001182 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001183 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001184
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001185 if (ret) {
1186 RETURN_ON_ERROR(ret);
1187 RETURN_SUCCESS;
1188 }
1189 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001190
1191 case SRE_OP_MAX_UNTIL:
1192 /* maximizing repeat */
1193 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1194
1195 /* FIXME: we probably need to deal with zero-width
1196 matches in here... */
1197
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001198 ctx->u.rep = state->repeat;
1199 if (!ctx->u.rep)
1200 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001201
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1207 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001208
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001210 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001212 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1213 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001214 if (ret) {
1215 RETURN_ON_ERROR(ret);
1216 RETURN_SUCCESS;
1217 }
1218 ctx->u.rep->count = ctx->count-1;
1219 state->ptr = ctx->ptr;
1220 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001221 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001222
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001223 if ((ctx->count < ctx->u.rep->pattern[2] ||
1224 ctx->u.rep->pattern[2] == 65535) &&
1225 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001226 /* we may have enough matches, but if we can
1227 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001228 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001229 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001230 MARK_PUSH(ctx->lastmark);
1231 /* zero-width match protection */
1232 DATA_PUSH(&ctx->u.rep->last_ptr);
1233 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001234 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1235 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001236 DATA_POP(&ctx->u.rep->last_ptr);
1237 if (ret) {
1238 MARK_POP_DISCARD(ctx->lastmark);
1239 RETURN_ON_ERROR(ret);
1240 RETURN_SUCCESS;
1241 }
1242 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001243 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001244 ctx->u.rep->count = ctx->count-1;
1245 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001246 }
1247
1248 /* cannot match more repeated items here. make sure the
1249 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001250 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001251 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001252 RETURN_ON_SUCCESS(ret);
1253 state->repeat = ctx->u.rep;
1254 state->ptr = ctx->ptr;
1255 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001256
1257 case SRE_OP_MIN_UNTIL:
1258 /* minimizing repeat */
1259 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1260
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001261 ctx->u.rep = state->repeat;
1262 if (!ctx->u.rep)
1263 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001264
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001266
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001267 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1270 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001271
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001272 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001273 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001274 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001275 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1276 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001277 if (ret) {
1278 RETURN_ON_ERROR(ret);
1279 RETURN_SUCCESS;
1280 }
1281 ctx->u.rep->count = ctx->count-1;
1282 state->ptr = ctx->ptr;
1283 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001284 }
1285
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001286 LASTMARK_SAVE();
1287
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001288 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001289 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001290 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001291 if (ret) {
1292 RETURN_ON_ERROR(ret);
1293 RETURN_SUCCESS;
1294 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001295
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001296 state->repeat = ctx->u.rep;
1297 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001298
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001299 LASTMARK_RESTORE();
1300
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001301 if (ctx->count >= ctx->u.rep->pattern[2]
1302 && ctx->u.rep->pattern[2] != 65535)
1303 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001304
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001305 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001306 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1307 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001308 if (ret) {
1309 RETURN_ON_ERROR(ret);
1310 RETURN_SUCCESS;
1311 }
1312 ctx->u.rep->count = ctx->count-1;
1313 state->ptr = ctx->ptr;
1314 RETURN_FAILURE;
1315
1316 case SRE_OP_GROUPREF:
1317 /* match backreference */
1318 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1319 ctx->ptr, ctx->pattern[0]));
1320 i = ctx->pattern[0];
1321 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001322 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001323 if (groupref >= state->lastmark) {
1324 RETURN_FAILURE;
1325 } else {
1326 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1327 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1328 if (!p || !e || e < p)
1329 RETURN_FAILURE;
1330 while (p < e) {
1331 if (ctx->ptr >= end || *ctx->ptr != *p)
1332 RETURN_FAILURE;
1333 p++; ctx->ptr++;
1334 }
1335 }
1336 }
1337 ctx->pattern++;
1338 break;
1339
1340 case SRE_OP_GROUPREF_IGNORE:
1341 /* match backreference */
1342 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1343 ctx->ptr, ctx->pattern[0]));
1344 i = ctx->pattern[0];
1345 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001346 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001347 if (groupref >= state->lastmark) {
1348 RETURN_FAILURE;
1349 } else {
1350 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1351 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1352 if (!p || !e || e < p)
1353 RETURN_FAILURE;
1354 while (p < e) {
1355 if (ctx->ptr >= end ||
1356 state->lower(*ctx->ptr) != state->lower(*p))
1357 RETURN_FAILURE;
1358 p++; ctx->ptr++;
1359 }
1360 }
1361 }
1362 ctx->pattern++;
1363 break;
1364
1365 case SRE_OP_GROUPREF_EXISTS:
1366 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1367 ctx->ptr, ctx->pattern[0]));
1368 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1369 i = ctx->pattern[0];
1370 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001371 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001372 if (groupref >= state->lastmark) {
1373 ctx->pattern += ctx->pattern[1];
1374 break;
1375 } else {
1376 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1377 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1378 if (!p || !e || e < p) {
1379 ctx->pattern += ctx->pattern[1];
1380 break;
1381 }
1382 }
1383 }
1384 ctx->pattern += 2;
1385 break;
1386
1387 case SRE_OP_ASSERT:
1388 /* assert subpattern */
1389 /* <ASSERT> <skip> <back> <pattern> */
1390 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1391 ctx->ptr, ctx->pattern[1]));
1392 state->ptr = ctx->ptr - ctx->pattern[1];
1393 if (state->ptr < state->beginning)
1394 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001395 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001396 RETURN_ON_FAILURE(ret);
1397 ctx->pattern += ctx->pattern[0];
1398 break;
1399
1400 case SRE_OP_ASSERT_NOT:
1401 /* assert not subpattern */
1402 /* <ASSERT_NOT> <skip> <back> <pattern> */
1403 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1404 ctx->ptr, ctx->pattern[1]));
1405 state->ptr = ctx->ptr - ctx->pattern[1];
1406 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001407 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001408 if (ret) {
1409 RETURN_ON_ERROR(ret);
1410 RETURN_FAILURE;
1411 }
1412 }
1413 ctx->pattern += ctx->pattern[0];
1414 break;
1415
1416 case SRE_OP_FAILURE:
1417 /* immediate failure */
1418 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1419 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001420
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001421 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001422 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1423 ctx->pattern[-1]));
1424 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001425 }
1426 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001427
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001428exit:
1429 ctx_pos = ctx->last_ctx_pos;
1430 jump = ctx->jump;
1431 DATA_POP_DISCARD(ctx);
1432 if (ctx_pos == -1)
1433 return ret;
1434 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001436 switch (jump) {
1437 case JUMP_MAX_UNTIL_2:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1439 goto jump_max_until_2;
1440 case JUMP_MAX_UNTIL_3:
1441 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1442 goto jump_max_until_3;
1443 case JUMP_MIN_UNTIL_2:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1445 goto jump_min_until_2;
1446 case JUMP_MIN_UNTIL_3:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1448 goto jump_min_until_3;
1449 case JUMP_BRANCH:
1450 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1451 goto jump_branch;
1452 case JUMP_MAX_UNTIL_1:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_max_until_1;
1455 case JUMP_MIN_UNTIL_1:
1456 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1457 goto jump_min_until_1;
1458 case JUMP_REPEAT:
1459 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1460 goto jump_repeat;
1461 case JUMP_REPEAT_ONE_1:
1462 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1463 goto jump_repeat_one_1;
1464 case JUMP_REPEAT_ONE_2:
1465 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1466 goto jump_repeat_one_2;
1467 case JUMP_MIN_REPEAT_ONE:
1468 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1469 goto jump_min_repeat_one;
1470 case JUMP_ASSERT:
1471 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1472 goto jump_assert;
1473 case JUMP_ASSERT_NOT:
1474 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1475 goto jump_assert_not;
1476 case JUMP_NONE:
1477 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1478 break;
1479 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001480
1481 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001482}
1483
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001484LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001485SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1486{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001487 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1488 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001489 Py_ssize_t status = 0;
1490 Py_ssize_t prefix_len = 0;
1491 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001492 SRE_CODE* prefix = NULL;
1493 SRE_CODE* charset = NULL;
1494 SRE_CODE* overlap = NULL;
1495 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001496
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001497 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001498 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001499 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001500
1501 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001502
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001503 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001504 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001505 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001506 end -= pattern[3]-1;
1507 if (end <= ptr)
1508 end = ptr+1;
1509 }
1510
Fredrik Lundh3562f112000-07-02 12:00:07 +00001511 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001512 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001513 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001514 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001515 prefix_skip = pattern[6];
1516 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001517 overlap = prefix + prefix_len - 1;
1518 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001519 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001520 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001521 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001522
1523 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001524 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001525
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001526 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1527 TRACE(("charset = %p\n", charset));
1528
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001529#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001530 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001531 /* pattern starts with a known prefix. use the overlap
1532 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001533 Py_ssize_t i = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001534 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001535 while (ptr < end) {
1536 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001537 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001538 if (!i)
1539 break;
1540 else
1541 i = overlap[i];
1542 } else {
1543 if (++i == prefix_len) {
1544 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001545 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1546 state->start = ptr + 1 - prefix_len;
1547 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001548 if (flags & SRE_INFO_LITERAL)
1549 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001550 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001551 if (status != 0)
1552 return status;
1553 /* close but no cigar -- try again */
1554 i = overlap[i];
1555 }
1556 break;
1557 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001558 }
1559 ptr++;
1560 }
1561 return 0;
1562 }
1563#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001564
Fredrik Lundh3562f112000-07-02 12:00:07 +00001565 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001566 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001567 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001568 SRE_CODE chr = pattern[1];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001569 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001570 for (;;) {
1571 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1572 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001573 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001574 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001575 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 state->start = ptr;
1577 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001578 if (flags & SRE_INFO_LITERAL)
1579 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001580 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001581 if (status != 0)
1582 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001583 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 } else if (charset) {
1585 /* pattern starts with a character from a known set */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001586 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001587 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001588 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001589 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001590 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001592 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 state->start = ptr;
1594 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001595 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 if (status != 0)
1597 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001598 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 }
1600 } else
1601 /* general case */
1602 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001603 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001604 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001605 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001606 if (status != 0)
1607 break;
1608 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001609
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001610 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001611}
Tim Peters3d563502006-01-21 02:47:53 +00001612
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001613LOCAL(int)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001614SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001615{
1616 /* check if given string is a literal template (i.e. no escapes) */
1617 while (len-- > 0)
1618 if (*ptr++ == '\\')
1619 return 0;
1620 return 1;
1621}
Guido van Rossumb700df92000-03-31 14:59:30 +00001622
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001623#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001624
1625/* -------------------------------------------------------------------- */
1626/* factories and destructors */
1627
1628/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001629static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1630static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001631
1632static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001633sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001634{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001635 return Py_BuildValue("l", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001636}
1637
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001638static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001639sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001640{
1641 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001642 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001643 return NULL;
1644 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001645 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001646 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001647#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001648 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001649#else
1650 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001651#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001652 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001653}
1654
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655LOCAL(void)
1656state_reset(SRE_STATE* state)
1657{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001658 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001659 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001660
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001661 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001662 state->lastindex = -1;
1663
1664 state->repeat = NULL;
1665
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001666 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001667}
1668
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001669static void*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001670getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001671{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001672 /* given a python object, return a data pointer, a length (in
1673 characters), and a character size. return NULL if the object
1674 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001675
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001677 Py_ssize_t size, bytes;
1678 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001679 void* ptr;
Travis E. Oliphant8ae62b62007-09-23 02:00:13 +00001680 Py_buffer view;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001681
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001682 /* Unicode objects do not support the buffer API. So, get the data
1683 directly instead. */
1684 if (PyUnicode_Check(string)) {
1685 ptr = (void *)PyUnicode_AS_DATA(string);
1686 *p_length = PyUnicode_GET_SIZE(string);
1687 *p_charsize = sizeof(Py_UNICODE);
1688 return ptr;
1689 }
1690
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001691 /* get pointer to string buffer */
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001692 view.len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001693 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001694 if (!buffer || !buffer->bf_getbuffer ||
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001695 (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
1696 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1697 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001698 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001699
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001700 /* determine buffer size */
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001701 bytes = view.len;
1702 ptr = view.buf;
1703
1704 /* Release the buffer immediately --- possibly dangerous
1705 but doing something else would require some re-factoring
1706 */
Martin v. Löwis423be952008-08-13 15:53:07 +00001707 PyBuffer_Release(&view);
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001708
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001709 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001710 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1711 return NULL;
1712 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001713
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001714 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001715 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001716
Christian Heimes72b710a2008-05-26 13:28:38 +00001717 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001718 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001719#if defined(HAVE_UNICODE)
Antoine Pitroufd036452008-08-19 17:56:33 +00001720 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001721 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001722#endif
1723 else {
1724 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1725 return NULL;
1726 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001727
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001728 *p_length = size;
1729 *p_charsize = charsize;
1730
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001731 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001732 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001733 "Buffer is NULL");
1734 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001735 return ptr;
1736}
1737
1738LOCAL(PyObject*)
1739state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001740 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001741{
1742 /* prepare state object */
1743
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001744 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001745 int charsize;
1746 void* ptr;
1747
1748 memset(state, 0, sizeof(SRE_STATE));
1749
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001750 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001751 state->lastindex = -1;
1752
1753 ptr = getstring(string, &length, &charsize);
1754 if (!ptr)
1755 return NULL;
1756
Antoine Pitroufd036452008-08-19 17:56:33 +00001757 if (charsize == 1 && pattern->charsize > 1) {
1758 PyErr_SetString(PyExc_TypeError,
1759 "can't use a string pattern on a bytes-like object");
1760 return NULL;
1761 }
1762 if (charsize > 1 && pattern->charsize == 1) {
1763 PyErr_SetString(PyExc_TypeError,
1764 "can't use a bytes pattern on a string-like object");
1765 return NULL;
1766 }
1767
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001768 /* adjust boundaries */
1769 if (start < 0)
1770 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001771 else if (start > length)
1772 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001773
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001774 if (end < 0)
1775 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001776 else if (end > length)
1777 end = length;
1778
1779 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001780
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001781 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001782
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001783 state->start = (void*) ((char*) ptr + start * state->charsize);
1784 state->end = (void*) ((char*) ptr + end * state->charsize);
1785
1786 Py_INCREF(string);
1787 state->string = string;
1788 state->pos = start;
1789 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001790
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001791 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001792 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001793 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001794#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001795 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001796#else
1797 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001798#endif
1799 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001800 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001801
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001802 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001803}
1804
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001805LOCAL(void)
1806state_fini(SRE_STATE* state)
1807{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001808 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001809 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001810}
1811
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001812/* calculate offset from start of string */
1813#define STATE_OFFSET(state, member)\
1814 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1815
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001816LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001817state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001818{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001819 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001820
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001821 index = (index - 1) * 2;
1822
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001823 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001824 if (empty)
1825 /* want empty string */
1826 i = j = 0;
1827 else {
1828 Py_INCREF(Py_None);
1829 return Py_None;
1830 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001831 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001832 i = STATE_OFFSET(state, state->mark[index]);
1833 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001834 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001835
Fredrik Lundh58100642000-08-09 09:14:35 +00001836 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001837}
1838
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001839static void
1840pattern_error(int status)
1841{
1842 switch (status) {
1843 case SRE_ERROR_RECURSION_LIMIT:
1844 PyErr_SetString(
1845 PyExc_RuntimeError,
1846 "maximum recursion limit exceeded"
1847 );
1848 break;
1849 case SRE_ERROR_MEMORY:
1850 PyErr_NoMemory();
1851 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001852 case SRE_ERROR_INTERRUPTED:
1853 /* An exception has already been raised, so let it fly */
1854 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001855 default:
1856 /* other error codes indicate compiler/engine bugs */
1857 PyErr_SetString(
1858 PyExc_RuntimeError,
1859 "internal error in regular expression engine"
1860 );
1861 }
1862}
1863
Guido van Rossumb700df92000-03-31 14:59:30 +00001864static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001865pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001866{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001867 if (self->weakreflist != NULL)
1868 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001869 Py_XDECREF(self->pattern);
1870 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001871 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001872 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001873}
1874
1875static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001876pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001877{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001878 SRE_STATE state;
1879 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001880
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001881 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001882 Py_ssize_t start = 0;
1883 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001884 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001885 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001886 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001887 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001888
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001889 string = state_init(&state, self, string, start, end);
1890 if (!string)
1891 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001892
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001893 state.ptr = state.start;
1894
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001895 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1896
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001897 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001898 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001899 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001900#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001901 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001902#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001903 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001904
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001905 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001906 if (PyErr_Occurred())
1907 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001909 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001911 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001912}
1913
1914static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001915pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001916{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001917 SRE_STATE state;
1918 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001919
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001920 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001921 Py_ssize_t start = 0;
1922 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001923 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001924 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001925 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001926 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001927
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001928 string = state_init(&state, self, string, start, end);
1929 if (!string)
1930 return NULL;
1931
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001932 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1933
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001934 if (state.charsize == 1) {
1935 status = sre_search(&state, PatternObject_GetCode(self));
1936 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001937#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001938 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001939#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001940 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001941
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001942 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1943
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001944 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001945
Thomas Wouters89f507f2006-12-13 04:49:30 +00001946 if (PyErr_Occurred())
1947 return NULL;
1948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001949 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001950}
1951
1952static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001953call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001954{
1955 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001956 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957 PyObject* func;
1958 PyObject* result;
1959
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001960 if (!args)
1961 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001962 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001963 if (!name)
1964 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001965 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001966 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001967 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001968 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001969 func = PyObject_GetAttrString(mod, function);
1970 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001971 if (!func)
1972 return NULL;
1973 result = PyObject_CallObject(func, args);
1974 Py_DECREF(func);
1975 Py_DECREF(args);
1976 return result;
1977}
1978
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001979#ifdef USE_BUILTIN_COPY
1980static int
1981deepcopy(PyObject** object, PyObject* memo)
1982{
1983 PyObject* copy;
1984
1985 copy = call(
1986 "copy", "deepcopy",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001987 PyTuple_Pack(2, *object, memo)
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001988 );
1989 if (!copy)
1990 return 0;
1991
1992 Py_DECREF(*object);
1993 *object = copy;
1994
1995 return 1; /* success */
1996}
1997#endif
1998
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001999static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002000join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002001{
2002 /* join list elements */
2003
2004 PyObject* joiner;
2005#if PY_VERSION_HEX >= 0x01060000
2006 PyObject* function;
2007 PyObject* args;
2008#endif
2009 PyObject* result;
2010
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002011 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002012 if (!joiner)
2013 return NULL;
2014
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002015 if (PyList_GET_SIZE(list) == 0) {
2016 Py_DECREF(list);
2017 return joiner;
2018 }
2019
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002020#if PY_VERSION_HEX >= 0x01060000
2021 function = PyObject_GetAttrString(joiner, "join");
2022 if (!function) {
2023 Py_DECREF(joiner);
2024 return NULL;
2025 }
2026 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002027 if (!args) {
2028 Py_DECREF(function);
2029 Py_DECREF(joiner);
2030 return NULL;
2031 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002032 PyTuple_SET_ITEM(args, 0, list);
2033 result = PyObject_CallObject(function, args);
2034 Py_DECREF(args); /* also removes list */
2035 Py_DECREF(function);
2036#else
2037 result = call(
2038 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002039 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002040 );
2041#endif
2042 Py_DECREF(joiner);
2043
2044 return result;
2045}
2046
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002047static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002048pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002049{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 SRE_STATE state;
2051 PyObject* list;
2052 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002053 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002054
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002056 Py_ssize_t start = 0;
2057 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002058 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002059 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002060 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002061 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002062
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 string = state_init(&state, self, string, start, end);
2064 if (!string)
2065 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002066
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002067 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002068 if (!list) {
2069 state_fini(&state);
2070 return NULL;
2071 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002072
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002074
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002075 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002076
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002077 state_reset(&state);
2078
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002079 state.ptr = state.start;
2080
2081 if (state.charsize == 1) {
2082 status = sre_search(&state, PatternObject_GetCode(self));
2083 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002084#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002085 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002086#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002087 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002088
Thomas Wouters89f507f2006-12-13 04:49:30 +00002089 if (PyErr_Occurred())
2090 goto error;
2091
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002092 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002093 if (status == 0)
2094 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002095 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002096 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002097 }
Tim Peters3d563502006-01-21 02:47:53 +00002098
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002099 /* don't bother to build a match object */
2100 switch (self->groups) {
2101 case 0:
2102 b = STATE_OFFSET(&state, state.start);
2103 e = STATE_OFFSET(&state, state.ptr);
2104 item = PySequence_GetSlice(string, b, e);
2105 if (!item)
2106 goto error;
2107 break;
2108 case 1:
2109 item = state_getslice(&state, 1, string, 1);
2110 if (!item)
2111 goto error;
2112 break;
2113 default:
2114 item = PyTuple_New(self->groups);
2115 if (!item)
2116 goto error;
2117 for (i = 0; i < self->groups; i++) {
2118 PyObject* o = state_getslice(&state, i+1, string, 1);
2119 if (!o) {
2120 Py_DECREF(item);
2121 goto error;
2122 }
2123 PyTuple_SET_ITEM(item, i, o);
2124 }
2125 break;
2126 }
2127
2128 status = PyList_Append(list, item);
2129 Py_DECREF(item);
2130 if (status < 0)
2131 goto error;
2132
2133 if (state.ptr == state.start)
2134 state.start = (void*) ((char*) state.ptr + state.charsize);
2135 else
2136 state.start = state.ptr;
2137
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002138 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002139
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002140 state_fini(&state);
2141 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002142
2143error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002144 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002145 state_fini(&state);
2146 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002147
Guido van Rossumb700df92000-03-31 14:59:30 +00002148}
2149
Fredrik Lundh703ce812001-10-24 22:16:30 +00002150#if PY_VERSION_HEX >= 0x02020000
2151static PyObject*
2152pattern_finditer(PatternObject* pattern, PyObject* args)
2153{
2154 PyObject* scanner;
2155 PyObject* search;
2156 PyObject* iterator;
2157
2158 scanner = pattern_scanner(pattern, args);
2159 if (!scanner)
2160 return NULL;
2161
2162 search = PyObject_GetAttrString(scanner, "search");
2163 Py_DECREF(scanner);
2164 if (!search)
2165 return NULL;
2166
2167 iterator = PyCallIter_New(search, Py_None);
2168 Py_DECREF(search);
2169
2170 return iterator;
2171}
2172#endif
2173
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002174static PyObject*
2175pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2176{
2177 SRE_STATE state;
2178 PyObject* list;
2179 PyObject* item;
2180 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002181 Py_ssize_t n;
2182 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002183 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002184
2185 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002186 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002187 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002188 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002189 &string, &maxsplit))
2190 return NULL;
2191
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002192 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002193 if (!string)
2194 return NULL;
2195
2196 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002197 if (!list) {
2198 state_fini(&state);
2199 return NULL;
2200 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002201
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002202 n = 0;
2203 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002204
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002205 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002206
2207 state_reset(&state);
2208
2209 state.ptr = state.start;
2210
2211 if (state.charsize == 1) {
2212 status = sre_search(&state, PatternObject_GetCode(self));
2213 } else {
2214#if defined(HAVE_UNICODE)
2215 status = sre_usearch(&state, PatternObject_GetCode(self));
2216#endif
2217 }
2218
Thomas Wouters89f507f2006-12-13 04:49:30 +00002219 if (PyErr_Occurred())
2220 goto error;
2221
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002222 if (status <= 0) {
2223 if (status == 0)
2224 break;
2225 pattern_error(status);
2226 goto error;
2227 }
Tim Peters3d563502006-01-21 02:47:53 +00002228
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002229 if (state.start == state.ptr) {
2230 if (last == state.end)
2231 break;
2232 /* skip one character */
2233 state.start = (void*) ((char*) state.ptr + state.charsize);
2234 continue;
2235 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002236
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002237 /* get segment before this match */
2238 item = PySequence_GetSlice(
2239 string, STATE_OFFSET(&state, last),
2240 STATE_OFFSET(&state, state.start)
2241 );
2242 if (!item)
2243 goto error;
2244 status = PyList_Append(list, item);
2245 Py_DECREF(item);
2246 if (status < 0)
2247 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002248
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002249 /* add groups (if any) */
2250 for (i = 0; i < self->groups; i++) {
2251 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002252 if (!item)
2253 goto error;
2254 status = PyList_Append(list, item);
2255 Py_DECREF(item);
2256 if (status < 0)
2257 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002258 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002259
2260 n = n + 1;
2261
2262 last = state.start = state.ptr;
2263
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002264 }
2265
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002266 /* get segment following last match (even if empty) */
2267 item = PySequence_GetSlice(
2268 string, STATE_OFFSET(&state, last), state.endpos
2269 );
2270 if (!item)
2271 goto error;
2272 status = PyList_Append(list, item);
2273 Py_DECREF(item);
2274 if (status < 0)
2275 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002276
2277 state_fini(&state);
2278 return list;
2279
2280error:
2281 Py_DECREF(list);
2282 state_fini(&state);
2283 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002284
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002285}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002286
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002287static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002288pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002289 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002290{
2291 SRE_STATE state;
2292 PyObject* list;
2293 PyObject* item;
2294 PyObject* filter;
2295 PyObject* args;
2296 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002297 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002298 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002299 Py_ssize_t n;
2300 Py_ssize_t i, b, e;
2301 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002302 int filter_is_callable;
2303
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002304 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002305 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002306 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002307 Py_INCREF(filter);
2308 filter_is_callable = 1;
2309 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002310 /* if not callable, check if it's a literal string */
2311 int literal;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002312 ptr = getstring(ptemplate, &n, &bint);
2313 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002314 if (ptr) {
2315 if (b == 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002316 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002317 } else {
2318#if defined(HAVE_UNICODE)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002319 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002320#endif
2321 }
2322 } else {
2323 PyErr_Clear();
2324 literal = 0;
2325 }
2326 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002327 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002328 Py_INCREF(filter);
2329 filter_is_callable = 0;
2330 } else {
2331 /* not a literal; hand it over to the template compiler */
2332 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002333 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002334 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002335 );
2336 if (!filter)
2337 return NULL;
2338 filter_is_callable = PyCallable_Check(filter);
2339 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002340 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002341
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002342 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002343 if (!string) {
2344 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002345 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002346 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002347
2348 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002349 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002350 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002351 state_fini(&state);
2352 return NULL;
2353 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002354
2355 n = i = 0;
2356
2357 while (!count || n < count) {
2358
2359 state_reset(&state);
2360
2361 state.ptr = state.start;
2362
2363 if (state.charsize == 1) {
2364 status = sre_search(&state, PatternObject_GetCode(self));
2365 } else {
2366#if defined(HAVE_UNICODE)
2367 status = sre_usearch(&state, PatternObject_GetCode(self));
2368#endif
2369 }
2370
Thomas Wouters89f507f2006-12-13 04:49:30 +00002371 if (PyErr_Occurred())
2372 goto error;
2373
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002374 if (status <= 0) {
2375 if (status == 0)
2376 break;
2377 pattern_error(status);
2378 goto error;
2379 }
Tim Peters3d563502006-01-21 02:47:53 +00002380
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002381 b = STATE_OFFSET(&state, state.start);
2382 e = STATE_OFFSET(&state, state.ptr);
2383
2384 if (i < b) {
2385 /* get segment before this match */
2386 item = PySequence_GetSlice(string, i, b);
2387 if (!item)
2388 goto error;
2389 status = PyList_Append(list, item);
2390 Py_DECREF(item);
2391 if (status < 0)
2392 goto error;
2393
2394 } else if (i == b && i == e && n > 0)
2395 /* ignore empty match on latest position */
2396 goto next;
2397
2398 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002399 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002400 match = pattern_new_match(self, &state, 1);
2401 if (!match)
2402 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002403 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002404 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002405 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002406 goto error;
2407 }
2408 item = PyObject_CallObject(filter, args);
2409 Py_DECREF(args);
2410 Py_DECREF(match);
2411 if (!item)
2412 goto error;
2413 } else {
2414 /* filter is literal string */
2415 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002416 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002417 }
2418
2419 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002420 if (item != Py_None) {
2421 status = PyList_Append(list, item);
2422 Py_DECREF(item);
2423 if (status < 0)
2424 goto error;
2425 }
Tim Peters3d563502006-01-21 02:47:53 +00002426
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002427 i = e;
2428 n = n + 1;
2429
2430next:
2431 /* move on */
2432 if (state.ptr == state.start)
2433 state.start = (void*) ((char*) state.ptr + state.charsize);
2434 else
2435 state.start = state.ptr;
2436
2437 }
2438
2439 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002440 if (i < state.endpos) {
2441 item = PySequence_GetSlice(string, i, state.endpos);
2442 if (!item)
2443 goto error;
2444 status = PyList_Append(list, item);
2445 Py_DECREF(item);
2446 if (status < 0)
2447 goto error;
2448 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002449
2450 state_fini(&state);
2451
Guido van Rossum4e173842001-12-07 04:25:10 +00002452 Py_DECREF(filter);
2453
Fredrik Lundhdac58492001-10-21 21:48:30 +00002454 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002455 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002456
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002457 if (!item)
2458 return NULL;
2459
2460 if (subn)
2461 return Py_BuildValue("Ni", item, n);
2462
2463 return item;
2464
2465error:
2466 Py_DECREF(list);
2467 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002468 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002469 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002470
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002471}
2472
2473static PyObject*
2474pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2475{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002476 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002477 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002478 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002479 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002480 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002481 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002482 return NULL;
2483
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002484 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002485}
2486
2487static PyObject*
2488pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2489{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002490 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002491 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002492 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002493 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002494 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002495 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002496 return NULL;
2497
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002498 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002499}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002500
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002501static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002502pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002503{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002504#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002505 PatternObject* copy;
2506 int offset;
2507
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002508 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2509 if (!copy)
2510 return NULL;
2511
2512 offset = offsetof(PatternObject, groups);
2513
2514 Py_XINCREF(self->groupindex);
2515 Py_XINCREF(self->indexgroup);
2516 Py_XINCREF(self->pattern);
2517
2518 memcpy((char*) copy + offset, (char*) self + offset,
2519 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002520 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002521
2522 return (PyObject*) copy;
2523#else
2524 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2525 return NULL;
2526#endif
2527}
2528
2529static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002530pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002531{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002532#ifdef USE_BUILTIN_COPY
2533 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002534
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002535 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002536 if (!copy)
2537 return NULL;
2538
2539 if (!deepcopy(&copy->groupindex, memo) ||
2540 !deepcopy(&copy->indexgroup, memo) ||
2541 !deepcopy(&copy->pattern, memo)) {
2542 Py_DECREF(copy);
2543 return NULL;
2544 }
2545
2546#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002547 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2548 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002549#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002550}
2551
Raymond Hettinger94478742004-09-24 04:31:19 +00002552PyDoc_STRVAR(pattern_match_doc,
2553"match(string[, pos[, endpos]]) --> match object or None.\n\
2554 Matches zero or more characters at the beginning of the string");
2555
2556PyDoc_STRVAR(pattern_search_doc,
2557"search(string[, pos[, endpos]]) --> match object or None.\n\
2558 Scan through string looking for a match, and return a corresponding\n\
2559 MatchObject instance. Return None if no position in the string matches.");
2560
2561PyDoc_STRVAR(pattern_split_doc,
2562"split(string[, maxsplit = 0]) --> list.\n\
2563 Split string by the occurrences of pattern.");
2564
2565PyDoc_STRVAR(pattern_findall_doc,
2566"findall(string[, pos[, endpos]]) --> list.\n\
2567 Return a list of all non-overlapping matches of pattern in string.");
2568
2569PyDoc_STRVAR(pattern_finditer_doc,
2570"finditer(string[, pos[, endpos]]) --> iterator.\n\
2571 Return an iterator over all non-overlapping matches for the \n\
2572 RE pattern in string. For each match, the iterator returns a\n\
2573 match object.");
2574
2575PyDoc_STRVAR(pattern_sub_doc,
2576"sub(repl, string[, count = 0]) --> newstring\n\
2577 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002578 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002579
2580PyDoc_STRVAR(pattern_subn_doc,
2581"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2582 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2583 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002584 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002585
2586PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2587
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002588static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002589 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002590 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002591 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002592 pattern_search_doc},
2593 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2594 pattern_sub_doc},
2595 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2596 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002597 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002598 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002599 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002600 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002601#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002602 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2603 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002604#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002605 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002606 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2607 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002608 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002609};
2610
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002611#define PAT_OFF(x) offsetof(PatternObject, x)
2612static PyMemberDef pattern_members[] = {
2613 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2614 {"flags", T_INT, PAT_OFF(flags), READONLY},
2615 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2616 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2617 {NULL} /* Sentinel */
2618};
Guido van Rossumb700df92000-03-31 14:59:30 +00002619
Neal Norwitz57c179c2006-03-22 07:18:02 +00002620static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002621 PyVarObject_HEAD_INIT(NULL, 0)
2622 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002623 sizeof(PatternObject), sizeof(SRE_CODE),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002624 (destructor)pattern_dealloc, /* tp_dealloc */
2625 0, /* tp_print */
2626 0, /* tp_getattr */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002627 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00002628 0, /* tp_reserved */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002629 0, /* tp_repr */
2630 0, /* tp_as_number */
2631 0, /* tp_as_sequence */
2632 0, /* tp_as_mapping */
2633 0, /* tp_hash */
2634 0, /* tp_call */
2635 0, /* tp_str */
2636 0, /* tp_getattro */
2637 0, /* tp_setattro */
2638 0, /* tp_as_buffer */
Guido van Rossum3cf5b1e2006-07-27 21:53:35 +00002639 Py_TPFLAGS_DEFAULT, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002640 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002641 0, /* tp_traverse */
2642 0, /* tp_clear */
2643 0, /* tp_richcompare */
2644 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002645 0, /* tp_iter */
2646 0, /* tp_iternext */
2647 pattern_methods, /* tp_methods */
2648 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002649};
2650
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002651static int _validate(PatternObject *self); /* Forward */
2652
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002653static PyObject *
2654_compile(PyObject* self_, PyObject* args)
2655{
2656 /* "compile" pattern descriptor to pattern object */
2657
2658 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002659 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002660
2661 PyObject* pattern;
2662 int flags = 0;
2663 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002664 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002665 PyObject* groupindex = NULL;
2666 PyObject* indexgroup = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002667 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002668 &PyList_Type, &code, &groups,
2669 &groupindex, &indexgroup))
2670 return NULL;
2671
2672 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002673 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002674 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2675 if (!self)
2676 return NULL;
2677
2678 self->codesize = n;
2679
2680 for (i = 0; i < n; i++) {
2681 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002682 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002683 self->code[i] = (SRE_CODE) value;
2684 if ((unsigned long) self->code[i] != value) {
2685 PyErr_SetString(PyExc_OverflowError,
2686 "regular expression code size limit exceeded");
2687 break;
2688 }
2689 }
2690
2691 if (PyErr_Occurred()) {
2692 PyObject_DEL(self);
2693 return NULL;
2694 }
2695
Antoine Pitroufd036452008-08-19 17:56:33 +00002696 if (pattern == Py_None)
2697 self->charsize = -1;
2698 else {
2699 Py_ssize_t p_length;
2700 if (!getstring(pattern, &p_length, &self->charsize)) {
2701 PyObject_DEL(self);
2702 return NULL;
2703 }
2704 }
2705
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002706 Py_INCREF(pattern);
2707 self->pattern = pattern;
2708
2709 self->flags = flags;
2710
2711 self->groups = groups;
2712
2713 Py_XINCREF(groupindex);
2714 self->groupindex = groupindex;
2715
2716 Py_XINCREF(indexgroup);
2717 self->indexgroup = indexgroup;
2718
2719 self->weakreflist = NULL;
2720
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002721 if (!_validate(self)) {
2722 Py_DECREF(self);
2723 return NULL;
2724 }
2725
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002726 return (PyObject*) self;
2727}
2728
Guido van Rossumb700df92000-03-31 14:59:30 +00002729/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002730/* Code validation */
2731
2732/* To learn more about this code, have a look at the _compile() function in
2733 Lib/sre_compile.py. The validation functions below checks the code array
2734 for conformance with the code patterns generated there.
2735
2736 The nice thing about the generated code is that it is position-independent:
2737 all jumps are relative jumps forward. Also, jumps don't cross each other:
2738 the target of a later jump is always earlier than the target of an earlier
2739 jump. IOW, this is okay:
2740
2741 J---------J-------T--------T
2742 \ \_____/ /
2743 \______________________/
2744
2745 but this is not:
2746
2747 J---------J-------T--------T
2748 \_________\_____/ /
2749 \____________/
2750
2751 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2752 bytes wide (the latter if Python is compiled for "wide" unicode support).
2753*/
2754
2755/* Defining this one enables tracing of the validator */
2756#undef VVERBOSE
2757
2758/* Trace macro for the validator */
2759#if defined(VVERBOSE)
2760#define VTRACE(v) printf v
2761#else
2762#define VTRACE(v)
2763#endif
2764
2765/* Report failure */
2766#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2767
2768/* Extract opcode, argument, or skip count from code array */
2769#define GET_OP \
2770 do { \
2771 VTRACE(("%p: ", code)); \
2772 if (code >= end) FAIL; \
2773 op = *code++; \
2774 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2775 } while (0)
2776#define GET_ARG \
2777 do { \
2778 VTRACE(("%p= ", code)); \
2779 if (code >= end) FAIL; \
2780 arg = *code++; \
2781 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2782 } while (0)
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002783#define GET_SKIP_ADJ(adj) \
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002784 do { \
2785 VTRACE(("%p= ", code)); \
2786 if (code >= end) FAIL; \
2787 skip = *code; \
2788 VTRACE(("%lu (skip to %p)\n", \
2789 (unsigned long)skip, code+skip)); \
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002790 if (code+skip-adj < code || code+skip-adj > end)\
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002791 FAIL; \
2792 code++; \
2793 } while (0)
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002794#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002795
2796static int
2797_validate_charset(SRE_CODE *code, SRE_CODE *end)
2798{
2799 /* Some variables are manipulated by the macros above */
2800 SRE_CODE op;
2801 SRE_CODE arg;
2802 SRE_CODE offset;
2803 int i;
2804
2805 while (code < end) {
2806 GET_OP;
2807 switch (op) {
2808
2809 case SRE_OP_NEGATE:
2810 break;
2811
2812 case SRE_OP_LITERAL:
2813 GET_ARG;
2814 break;
2815
2816 case SRE_OP_RANGE:
2817 GET_ARG;
2818 GET_ARG;
2819 break;
2820
2821 case SRE_OP_CHARSET:
2822 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2823 if (code+offset < code || code+offset > end)
2824 FAIL;
2825 code += offset;
2826 break;
2827
2828 case SRE_OP_BIGCHARSET:
2829 GET_ARG; /* Number of blocks */
2830 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2831 if (code+offset < code || code+offset > end)
2832 FAIL;
2833 /* Make sure that each byte points to a valid block */
2834 for (i = 0; i < 256; i++) {
2835 if (((unsigned char *)code)[i] >= arg)
2836 FAIL;
2837 }
2838 code += offset;
2839 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2840 if (code+offset < code || code+offset > end)
2841 FAIL;
2842 code += offset;
2843 break;
2844
2845 case SRE_OP_CATEGORY:
2846 GET_ARG;
2847 switch (arg) {
2848 case SRE_CATEGORY_DIGIT:
2849 case SRE_CATEGORY_NOT_DIGIT:
2850 case SRE_CATEGORY_SPACE:
2851 case SRE_CATEGORY_NOT_SPACE:
2852 case SRE_CATEGORY_WORD:
2853 case SRE_CATEGORY_NOT_WORD:
2854 case SRE_CATEGORY_LINEBREAK:
2855 case SRE_CATEGORY_NOT_LINEBREAK:
2856 case SRE_CATEGORY_LOC_WORD:
2857 case SRE_CATEGORY_LOC_NOT_WORD:
2858 case SRE_CATEGORY_UNI_DIGIT:
2859 case SRE_CATEGORY_UNI_NOT_DIGIT:
2860 case SRE_CATEGORY_UNI_SPACE:
2861 case SRE_CATEGORY_UNI_NOT_SPACE:
2862 case SRE_CATEGORY_UNI_WORD:
2863 case SRE_CATEGORY_UNI_NOT_WORD:
2864 case SRE_CATEGORY_UNI_LINEBREAK:
2865 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2866 break;
2867 default:
2868 FAIL;
2869 }
2870 break;
2871
2872 default:
2873 FAIL;
2874
2875 }
2876 }
2877
2878 return 1;
2879}
2880
2881static int
2882_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2883{
2884 /* Some variables are manipulated by the macros above */
2885 SRE_CODE op;
2886 SRE_CODE arg;
2887 SRE_CODE skip;
2888
2889 VTRACE(("code=%p, end=%p\n", code, end));
2890
2891 if (code > end)
2892 FAIL;
2893
2894 while (code < end) {
2895 GET_OP;
2896 switch (op) {
2897
2898 case SRE_OP_MARK:
2899 /* We don't check whether marks are properly nested; the
2900 sre_match() code is robust even if they don't, and the worst
2901 you can get is nonsensical match results. */
2902 GET_ARG;
2903 if (arg > 2*groups+1) {
2904 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2905 FAIL;
2906 }
2907 break;
2908
2909 case SRE_OP_LITERAL:
2910 case SRE_OP_NOT_LITERAL:
2911 case SRE_OP_LITERAL_IGNORE:
2912 case SRE_OP_NOT_LITERAL_IGNORE:
2913 GET_ARG;
2914 /* The arg is just a character, nothing to check */
2915 break;
2916
2917 case SRE_OP_SUCCESS:
2918 case SRE_OP_FAILURE:
2919 /* Nothing to check; these normally end the matching process */
2920 break;
2921
2922 case SRE_OP_AT:
2923 GET_ARG;
2924 switch (arg) {
2925 case SRE_AT_BEGINNING:
2926 case SRE_AT_BEGINNING_STRING:
2927 case SRE_AT_BEGINNING_LINE:
2928 case SRE_AT_END:
2929 case SRE_AT_END_LINE:
2930 case SRE_AT_END_STRING:
2931 case SRE_AT_BOUNDARY:
2932 case SRE_AT_NON_BOUNDARY:
2933 case SRE_AT_LOC_BOUNDARY:
2934 case SRE_AT_LOC_NON_BOUNDARY:
2935 case SRE_AT_UNI_BOUNDARY:
2936 case SRE_AT_UNI_NON_BOUNDARY:
2937 break;
2938 default:
2939 FAIL;
2940 }
2941 break;
2942
2943 case SRE_OP_ANY:
2944 case SRE_OP_ANY_ALL:
2945 /* These have no operands */
2946 break;
2947
2948 case SRE_OP_IN:
2949 case SRE_OP_IN_IGNORE:
2950 GET_SKIP;
2951 /* Stop 1 before the end; we check the FAILURE below */
2952 if (!_validate_charset(code, code+skip-2))
2953 FAIL;
2954 if (code[skip-2] != SRE_OP_FAILURE)
2955 FAIL;
2956 code += skip-1;
2957 break;
2958
2959 case SRE_OP_INFO:
2960 {
2961 /* A minimal info field is
2962 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2963 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2964 more follows. */
2965 SRE_CODE flags, min, max, i;
2966 SRE_CODE *newcode;
2967 GET_SKIP;
2968 newcode = code+skip-1;
2969 GET_ARG; flags = arg;
2970 GET_ARG; min = arg;
2971 GET_ARG; max = arg;
2972 /* Check that only valid flags are present */
2973 if ((flags & ~(SRE_INFO_PREFIX |
2974 SRE_INFO_LITERAL |
2975 SRE_INFO_CHARSET)) != 0)
2976 FAIL;
2977 /* PREFIX and CHARSET are mutually exclusive */
2978 if ((flags & SRE_INFO_PREFIX) &&
2979 (flags & SRE_INFO_CHARSET))
2980 FAIL;
2981 /* LITERAL implies PREFIX */
2982 if ((flags & SRE_INFO_LITERAL) &&
2983 !(flags & SRE_INFO_PREFIX))
2984 FAIL;
2985 /* Validate the prefix */
2986 if (flags & SRE_INFO_PREFIX) {
2987 SRE_CODE prefix_len, prefix_skip;
2988 GET_ARG; prefix_len = arg;
2989 GET_ARG; prefix_skip = arg;
2990 /* Here comes the prefix string */
2991 if (code+prefix_len < code || code+prefix_len > newcode)
2992 FAIL;
2993 code += prefix_len;
2994 /* And here comes the overlap table */
2995 if (code+prefix_len < code || code+prefix_len > newcode)
2996 FAIL;
2997 /* Each overlap value should be < prefix_len */
2998 for (i = 0; i < prefix_len; i++) {
2999 if (code[i] >= prefix_len)
3000 FAIL;
3001 }
3002 code += prefix_len;
3003 }
3004 /* Validate the charset */
3005 if (flags & SRE_INFO_CHARSET) {
3006 if (!_validate_charset(code, newcode-1))
3007 FAIL;
3008 if (newcode[-1] != SRE_OP_FAILURE)
3009 FAIL;
3010 code = newcode;
3011 }
3012 else if (code != newcode) {
3013 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3014 FAIL;
3015 }
3016 }
3017 break;
3018
3019 case SRE_OP_BRANCH:
3020 {
3021 SRE_CODE *target = NULL;
3022 for (;;) {
3023 GET_SKIP;
3024 if (skip == 0)
3025 break;
3026 /* Stop 2 before the end; we check the JUMP below */
3027 if (!_validate_inner(code, code+skip-3, groups))
3028 FAIL;
3029 code += skip-3;
3030 /* Check that it ends with a JUMP, and that each JUMP
3031 has the same target */
3032 GET_OP;
3033 if (op != SRE_OP_JUMP)
3034 FAIL;
3035 GET_SKIP;
3036 if (target == NULL)
3037 target = code+skip-1;
3038 else if (code+skip-1 != target)
3039 FAIL;
3040 }
3041 }
3042 break;
3043
3044 case SRE_OP_REPEAT_ONE:
3045 case SRE_OP_MIN_REPEAT_ONE:
3046 {
3047 SRE_CODE min, max;
3048 GET_SKIP;
3049 GET_ARG; min = arg;
3050 GET_ARG; max = arg;
3051 if (min > max)
3052 FAIL;
3053#ifdef Py_UNICODE_WIDE
3054 if (max > 65535)
3055 FAIL;
3056#endif
3057 if (!_validate_inner(code, code+skip-4, groups))
3058 FAIL;
3059 code += skip-4;
3060 GET_OP;
3061 if (op != SRE_OP_SUCCESS)
3062 FAIL;
3063 }
3064 break;
3065
3066 case SRE_OP_REPEAT:
3067 {
3068 SRE_CODE min, max;
3069 GET_SKIP;
3070 GET_ARG; min = arg;
3071 GET_ARG; max = arg;
3072 if (min > max)
3073 FAIL;
3074#ifdef Py_UNICODE_WIDE
3075 if (max > 65535)
3076 FAIL;
3077#endif
3078 if (!_validate_inner(code, code+skip-3, groups))
3079 FAIL;
3080 code += skip-3;
3081 GET_OP;
3082 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3083 FAIL;
3084 }
3085 break;
3086
3087 case SRE_OP_GROUPREF:
3088 case SRE_OP_GROUPREF_IGNORE:
3089 GET_ARG;
3090 if (arg >= groups)
3091 FAIL;
3092 break;
3093
3094 case SRE_OP_GROUPREF_EXISTS:
3095 /* The regex syntax for this is: '(?(group)then|else)', where
3096 'group' is either an integer group number or a group name,
3097 'then' and 'else' are sub-regexes, and 'else' is optional. */
3098 GET_ARG;
3099 if (arg >= groups)
3100 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00003101 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003102 code--; /* The skip is relative to the first arg! */
3103 /* There are two possibilities here: if there is both a 'then'
3104 part and an 'else' part, the generated code looks like:
3105
3106 GROUPREF_EXISTS
3107 <group>
3108 <skipyes>
3109 ...then part...
3110 JUMP
3111 <skipno>
3112 (<skipyes> jumps here)
3113 ...else part...
3114 (<skipno> jumps here)
3115
3116 If there is only a 'then' part, it looks like:
3117
3118 GROUPREF_EXISTS
3119 <group>
3120 <skip>
3121 ...then part...
3122 (<skip> jumps here)
3123
3124 There is no direct way to decide which it is, and we don't want
3125 to allow arbitrary jumps anywhere in the code; so we just look
3126 for a JUMP opcode preceding our skip target.
3127 */
3128 if (skip >= 3 && code+skip-3 >= code &&
3129 code[skip-3] == SRE_OP_JUMP)
3130 {
3131 VTRACE(("both then and else parts present\n"));
3132 if (!_validate_inner(code+1, code+skip-3, groups))
3133 FAIL;
3134 code += skip-2; /* Position after JUMP, at <skipno> */
3135 GET_SKIP;
3136 if (!_validate_inner(code, code+skip-1, groups))
3137 FAIL;
3138 code += skip-1;
3139 }
3140 else {
3141 VTRACE(("only a then part present\n"));
3142 if (!_validate_inner(code+1, code+skip-1, groups))
3143 FAIL;
3144 code += skip-1;
3145 }
3146 break;
3147
3148 case SRE_OP_ASSERT:
3149 case SRE_OP_ASSERT_NOT:
3150 GET_SKIP;
3151 GET_ARG; /* 0 for lookahead, width for lookbehind */
3152 code--; /* Back up over arg to simplify math below */
3153 if (arg & 0x80000000)
3154 FAIL; /* Width too large */
3155 /* Stop 1 before the end; we check the SUCCESS below */
3156 if (!_validate_inner(code+1, code+skip-2, groups))
3157 FAIL;
3158 code += skip-2;
3159 GET_OP;
3160 if (op != SRE_OP_SUCCESS)
3161 FAIL;
3162 break;
3163
3164 default:
3165 FAIL;
3166
3167 }
3168 }
3169
3170 VTRACE(("okay\n"));
3171 return 1;
3172}
3173
3174static int
3175_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3176{
3177 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3178 FAIL;
3179 if (groups == 0) /* fix for simplejson */
3180 groups = 100; /* 100 groups should always be safe */
3181 return _validate_inner(code, end-1, groups);
3182}
3183
3184static int
3185_validate(PatternObject *self)
3186{
3187 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3188 {
3189 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3190 return 0;
3191 }
3192 else
3193 VTRACE(("Success!\n"));
3194 return 1;
3195}
3196
3197/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003198/* match methods */
3199
3200static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003201match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003202{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003203 Py_XDECREF(self->regs);
3204 Py_XDECREF(self->string);
3205 Py_DECREF(self->pattern);
3206 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003207}
3208
3209static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003210match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003211{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003212 if (index < 0 || index >= self->groups) {
3213 /* raise IndexError if we were given a bad group number */
3214 PyErr_SetString(
3215 PyExc_IndexError,
3216 "no such group"
3217 );
3218 return NULL;
3219 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003220
Fredrik Lundh6f013982000-07-03 18:44:21 +00003221 index *= 2;
3222
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003223 if (self->string == Py_None || self->mark[index] < 0) {
3224 /* return default value if the string or group is undefined */
3225 Py_INCREF(def);
3226 return def;
3227 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003228
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003229 return PySequence_GetSlice(
3230 self->string, self->mark[index], self->mark[index+1]
3231 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003232}
3233
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003234static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003235match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003236{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003237 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003238
Guido van Rossumddefaf32007-01-14 03:31:43 +00003239 if (index == NULL)
3240 /* Default value */
3241 return 0;
3242
Christian Heimes217cfd12007-12-02 14:31:20 +00003243 if (PyLong_Check(index))
3244 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003245
Fredrik Lundh6f013982000-07-03 18:44:21 +00003246 i = -1;
3247
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003248 if (self->pattern->groupindex) {
3249 index = PyObject_GetItem(self->pattern->groupindex, index);
3250 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003251 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003252 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003253 Py_DECREF(index);
3254 } else
3255 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003256 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003257
3258 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003259}
3260
3261static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003262match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003263{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003264 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003265}
3266
3267static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003268match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003269{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003270 /* delegate to Python code */
3271 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003272 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003273 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003274 );
3275}
3276
3277static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003278match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003279{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003280 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003281 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003282
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003283 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003284
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003285 switch (size) {
3286 case 0:
3287 result = match_getslice(self, Py_False, Py_None);
3288 break;
3289 case 1:
3290 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3291 break;
3292 default:
3293 /* fetch multiple items */
3294 result = PyTuple_New(size);
3295 if (!result)
3296 return NULL;
3297 for (i = 0; i < size; i++) {
3298 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003299 self, PyTuple_GET_ITEM(args, i), Py_None
3300 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003301 if (!item) {
3302 Py_DECREF(result);
3303 return NULL;
3304 }
3305 PyTuple_SET_ITEM(result, i, item);
3306 }
3307 break;
3308 }
3309 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003310}
3311
3312static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003313match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003314{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003315 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003316 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003317
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003318 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003319 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003320 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003321 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003322
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003323 result = PyTuple_New(self->groups-1);
3324 if (!result)
3325 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003326
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003327 for (index = 1; index < self->groups; index++) {
3328 PyObject* item;
3329 item = match_getslice_by_index(self, index, def);
3330 if (!item) {
3331 Py_DECREF(result);
3332 return NULL;
3333 }
3334 PyTuple_SET_ITEM(result, index-1, item);
3335 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003336
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003337 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003338}
3339
3340static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003341match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003342{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003343 PyObject* result;
3344 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003345 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003346
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003347 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003348 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003349 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003350 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003351
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003352 result = PyDict_New();
3353 if (!result || !self->pattern->groupindex)
3354 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003355
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003356 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003357 if (!keys)
3358 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003359
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003360 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003361 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003362 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003363 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003364 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003365 if (!key)
3366 goto failed;
3367 value = match_getslice(self, key, def);
3368 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003369 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003370 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003371 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003372 status = PyDict_SetItem(result, key, value);
3373 Py_DECREF(value);
3374 if (status < 0)
3375 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003376 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003377
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003378 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003379
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003380 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003381
3382failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003383 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003384 Py_DECREF(result);
3385 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003386}
3387
3388static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003389match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003390{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003391 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003392
Guido van Rossumddefaf32007-01-14 03:31:43 +00003393 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003394 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003395 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003396
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003397 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003398
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003399 if (index < 0 || index >= self->groups) {
3400 PyErr_SetString(
3401 PyExc_IndexError,
3402 "no such group"
3403 );
3404 return NULL;
3405 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003406
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003407 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003408 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003409}
3410
3411static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003412match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003413{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003414 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003415
Guido van Rossumddefaf32007-01-14 03:31:43 +00003416 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003417 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003418 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003419
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003420 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003421
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003422 if (index < 0 || index >= self->groups) {
3423 PyErr_SetString(
3424 PyExc_IndexError,
3425 "no such group"
3426 );
3427 return NULL;
3428 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003429
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003430 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003431 return Py_BuildValue("i", self->mark[index*2+1]);
3432}
3433
3434LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003435_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003436{
3437 PyObject* pair;
3438 PyObject* item;
3439
3440 pair = PyTuple_New(2);
3441 if (!pair)
3442 return NULL;
3443
Christian Heimes217cfd12007-12-02 14:31:20 +00003444 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003445 if (!item)
3446 goto error;
3447 PyTuple_SET_ITEM(pair, 0, item);
3448
Christian Heimes217cfd12007-12-02 14:31:20 +00003449 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003450 if (!item)
3451 goto error;
3452 PyTuple_SET_ITEM(pair, 1, item);
3453
3454 return pair;
3455
3456 error:
3457 Py_DECREF(pair);
3458 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003459}
3460
3461static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003462match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003463{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003464 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003465
Guido van Rossumddefaf32007-01-14 03:31:43 +00003466 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003467 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003468 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003469
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003470 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003471
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003472 if (index < 0 || index >= self->groups) {
3473 PyErr_SetString(
3474 PyExc_IndexError,
3475 "no such group"
3476 );
3477 return NULL;
3478 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003479
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003480 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003481 return _pair(self->mark[index*2], self->mark[index*2+1]);
3482}
3483
3484static PyObject*
3485match_regs(MatchObject* self)
3486{
3487 PyObject* regs;
3488 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003489 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003490
3491 regs = PyTuple_New(self->groups);
3492 if (!regs)
3493 return NULL;
3494
3495 for (index = 0; index < self->groups; index++) {
3496 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3497 if (!item) {
3498 Py_DECREF(regs);
3499 return NULL;
3500 }
3501 PyTuple_SET_ITEM(regs, index, item);
3502 }
3503
3504 Py_INCREF(regs);
3505 self->regs = regs;
3506
3507 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003508}
3509
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003510static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003511match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003512{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003513#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003514 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003515 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003516
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003517 slots = 2 * (self->pattern->groups+1);
3518
3519 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3520 if (!copy)
3521 return NULL;
3522
3523 /* this value a constant, but any compiler should be able to
3524 figure that out all by itself */
3525 offset = offsetof(MatchObject, string);
3526
3527 Py_XINCREF(self->pattern);
3528 Py_XINCREF(self->string);
3529 Py_XINCREF(self->regs);
3530
3531 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003532 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003533
3534 return (PyObject*) copy;
3535#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003536 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003537 return NULL;
3538#endif
3539}
3540
3541static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003542match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003543{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003544#ifdef USE_BUILTIN_COPY
3545 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003546
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003547 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003548 if (!copy)
3549 return NULL;
3550
3551 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3552 !deepcopy(&copy->string, memo) ||
3553 !deepcopy(&copy->regs, memo)) {
3554 Py_DECREF(copy);
3555 return NULL;
3556 }
3557
3558#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003559 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3560 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003561#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003562}
3563
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003564static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003565 {"group", (PyCFunction) match_group, METH_VARARGS},
3566 {"start", (PyCFunction) match_start, METH_VARARGS},
3567 {"end", (PyCFunction) match_end, METH_VARARGS},
3568 {"span", (PyCFunction) match_span, METH_VARARGS},
3569 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3570 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003571 {"expand", (PyCFunction) match_expand, METH_O},
3572 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3573 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003574 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003575};
3576
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003577static PyObject *
3578match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003579{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003580 if (self->lastindex >= 0)
3581 return Py_BuildValue("i", self->lastindex);
3582 Py_INCREF(Py_None);
3583 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003584}
3585
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003586static PyObject *
3587match_lastgroup_get(MatchObject *self)
3588{
3589 if (self->pattern->indexgroup && self->lastindex >= 0) {
3590 PyObject* result = PySequence_GetItem(
3591 self->pattern->indexgroup, self->lastindex
3592 );
3593 if (result)
3594 return result;
3595 PyErr_Clear();
3596 }
3597 Py_INCREF(Py_None);
3598 return Py_None;
3599}
3600
3601static PyObject *
3602match_regs_get(MatchObject *self)
3603{
3604 if (self->regs) {
3605 Py_INCREF(self->regs);
3606 return self->regs;
3607 } else
3608 return match_regs(self);
3609}
3610
3611static PyGetSetDef match_getset[] = {
3612 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3613 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3614 {"regs", (getter)match_regs_get, (setter)NULL},
3615 {NULL}
3616};
3617
3618#define MATCH_OFF(x) offsetof(MatchObject, x)
3619static PyMemberDef match_members[] = {
3620 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3621 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3622 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3623 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3624 {NULL}
3625};
3626
Guido van Rossumb700df92000-03-31 14:59:30 +00003627/* FIXME: implement setattr("string", None) as a special case (to
3628 detach the associated string, if any */
3629
Neal Norwitz57c179c2006-03-22 07:18:02 +00003630static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003631 PyVarObject_HEAD_INIT(NULL,0)
3632 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003633 sizeof(MatchObject), sizeof(Py_ssize_t),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003634 (destructor)match_dealloc, /* tp_dealloc */
3635 0, /* tp_print */
3636 0, /* tp_getattr */
3637 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00003638 0, /* tp_reserved */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003639 0, /* tp_repr */
3640 0, /* tp_as_number */
3641 0, /* tp_as_sequence */
3642 0, /* tp_as_mapping */
3643 0, /* tp_hash */
3644 0, /* tp_call */
3645 0, /* tp_str */
3646 0, /* tp_getattro */
3647 0, /* tp_setattro */
3648 0, /* tp_as_buffer */
3649 Py_TPFLAGS_DEFAULT, /* tp_flags */
3650 0, /* tp_doc */
3651 0, /* tp_traverse */
3652 0, /* tp_clear */
3653 0, /* tp_richcompare */
3654 0, /* tp_weaklistoffset */
3655 0, /* tp_iter */
3656 0, /* tp_iternext */
3657 match_methods, /* tp_methods */
3658 match_members, /* tp_members */
3659 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003660};
3661
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003662static PyObject*
3663pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3664{
3665 /* create match object (from state object) */
3666
3667 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003668 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003669 char* base;
3670 int n;
3671
3672 if (status > 0) {
3673
3674 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003675 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003676 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3677 2*(pattern->groups+1));
3678 if (!match)
3679 return NULL;
3680
3681 Py_INCREF(pattern);
3682 match->pattern = pattern;
3683
3684 Py_INCREF(state->string);
3685 match->string = state->string;
3686
3687 match->regs = NULL;
3688 match->groups = pattern->groups+1;
3689
3690 /* fill in group slices */
3691
3692 base = (char*) state->beginning;
3693 n = state->charsize;
3694
3695 match->mark[0] = ((char*) state->start - base) / n;
3696 match->mark[1] = ((char*) state->ptr - base) / n;
3697
3698 for (i = j = 0; i < pattern->groups; i++, j+=2)
3699 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3700 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3701 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3702 } else
3703 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3704
3705 match->pos = state->pos;
3706 match->endpos = state->endpos;
3707
3708 match->lastindex = state->lastindex;
3709
3710 return (PyObject*) match;
3711
3712 } else if (status == 0) {
3713
3714 /* no match */
3715 Py_INCREF(Py_None);
3716 return Py_None;
3717
3718 }
3719
3720 /* internal error */
3721 pattern_error(status);
3722 return NULL;
3723}
3724
3725
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003726/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003727/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003728
3729static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003730scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003731{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003732 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003733 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003734 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003735}
3736
3737static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003738scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003739{
3740 SRE_STATE* state = &self->state;
3741 PyObject* match;
3742 int status;
3743
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003744 state_reset(state);
3745
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003746 state->ptr = state->start;
3747
3748 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003749 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003750 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003751#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003752 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003753#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003754 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003755 if (PyErr_Occurred())
3756 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003757
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003758 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003759 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003760
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003761 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003762 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003763 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003764 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003765
3766 return match;
3767}
3768
3769
3770static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003771scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003772{
3773 SRE_STATE* state = &self->state;
3774 PyObject* match;
3775 int status;
3776
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003777 state_reset(state);
3778
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003779 state->ptr = state->start;
3780
3781 if (state->charsize == 1) {
3782 status = sre_search(state, PatternObject_GetCode(self->pattern));
3783 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003784#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003785 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003786#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003787 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003788 if (PyErr_Occurred())
3789 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003790
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003791 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003792 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003793
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003794 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003795 state->start = (void*) ((char*) state->ptr + state->charsize);
3796 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003797 state->start = state->ptr;
3798
3799 return match;
3800}
3801
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003802static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003803 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3804 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003805 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003806};
3807
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003808#define SCAN_OFF(x) offsetof(ScannerObject, x)
3809static PyMemberDef scanner_members[] = {
3810 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3811 {NULL} /* Sentinel */
3812};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003813
Neal Norwitz57c179c2006-03-22 07:18:02 +00003814static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003815 PyVarObject_HEAD_INIT(NULL, 0)
3816 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003817 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003818 (destructor)scanner_dealloc,/* tp_dealloc */
3819 0, /* tp_print */
3820 0, /* tp_getattr */
3821 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00003822 0, /* tp_reserved */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003823 0, /* tp_repr */
3824 0, /* tp_as_number */
3825 0, /* tp_as_sequence */
3826 0, /* tp_as_mapping */
3827 0, /* tp_hash */
3828 0, /* tp_call */
3829 0, /* tp_str */
3830 0, /* tp_getattro */
3831 0, /* tp_setattro */
3832 0, /* tp_as_buffer */
3833 Py_TPFLAGS_DEFAULT, /* tp_flags */
3834 0, /* tp_doc */
3835 0, /* tp_traverse */
3836 0, /* tp_clear */
3837 0, /* tp_richcompare */
3838 0, /* tp_weaklistoffset */
3839 0, /* tp_iter */
3840 0, /* tp_iternext */
3841 scanner_methods, /* tp_methods */
3842 scanner_members, /* tp_members */
3843 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003844};
3845
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003846static PyObject*
3847pattern_scanner(PatternObject* pattern, PyObject* args)
3848{
3849 /* create search state object */
3850
3851 ScannerObject* self;
3852
3853 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003854 Py_ssize_t start = 0;
3855 Py_ssize_t end = PY_SSIZE_T_MAX;
3856 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003857 return NULL;
3858
3859 /* create scanner object */
3860 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3861 if (!self)
3862 return NULL;
3863
3864 string = state_init(&self->state, pattern, string, start, end);
3865 if (!string) {
3866 PyObject_DEL(self);
3867 return NULL;
3868 }
3869
3870 Py_INCREF(pattern);
3871 self->pattern = (PyObject*) pattern;
3872
3873 return (PyObject*) self;
3874}
3875
Guido van Rossumb700df92000-03-31 14:59:30 +00003876static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003877 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003878 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003879 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003880 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003881};
3882
Martin v. Löwis1a214512008-06-11 05:26:20 +00003883static struct PyModuleDef sremodule = {
3884 PyModuleDef_HEAD_INIT,
3885 "_" SRE_MODULE,
3886 NULL,
3887 -1,
3888 _functions,
3889 NULL,
3890 NULL,
3891 NULL,
3892 NULL
3893};
3894
3895PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003896{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003897 PyObject* m;
3898 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003899 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003900
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003901 /* Initialize object types */
3902 if (PyType_Ready(&Pattern_Type) < 0)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003903 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003904 if (PyType_Ready(&Match_Type) < 0)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003905 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003906 if (PyType_Ready(&Scanner_Type) < 0)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003907 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003908
Martin v. Löwis1a214512008-06-11 05:26:20 +00003909 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003910 if (m == NULL)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003911 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003912 d = PyModule_GetDict(m);
3913
Christian Heimes217cfd12007-12-02 14:31:20 +00003914 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003915 if (x) {
3916 PyDict_SetItemString(d, "MAGIC", x);
3917 Py_DECREF(x);
3918 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003919
Christian Heimes217cfd12007-12-02 14:31:20 +00003920 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003921 if (x) {
3922 PyDict_SetItemString(d, "CODESIZE", x);
3923 Py_DECREF(x);
3924 }
3925
Neal Norwitzfe537132007-08-26 03:55:15 +00003926 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003927 if (x) {
3928 PyDict_SetItemString(d, "copyright", x);
3929 Py_DECREF(x);
3930 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003931 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003932}
3933
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003934#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003935
3936/* vim:ts=4:sw=4:et
3937*/