blob: 64fc5132c8262a241391e3b8d171623855d363fd [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000071#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000072
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000073/* enables copy/deepcopy handling (work in progress) */
74#undef USE_BUILTIN_COPY
75
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000076#if PY_VERSION_HEX < 0x01060000
77#define PyObject_DEL(op) PyMem_DEL((op))
78#endif
79
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000080/* -------------------------------------------------------------------- */
81
Fredrik Lundh80946112000-06-29 18:03:25 +000082#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000083#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000084#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000085/* fastest possible local call under MSVC */
86#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000087#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000088#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000089#else
90#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000091#endif
92
93/* error codes */
94#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000095#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000096#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000097#define SRE_ERROR_MEMORY -9 /* out of memory */
Christian Heimes2380ac72008-01-09 00:17:24 +000098#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000099
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000100#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000102#else
103#define TRACE(v)
104#endif
105
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000106/* -------------------------------------------------------------------- */
107/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000108
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000109/* default character predicates (run sre_chars.py to regenerate tables) */
110
111#define SRE_DIGIT_MASK 1
112#define SRE_SPACE_MASK 2
113#define SRE_LINEBREAK_MASK 4
114#define SRE_ALNUM_MASK 8
115#define SRE_WORD_MASK 16
116
Fredrik Lundh21009b92001-09-18 18:47:09 +0000117/* FIXME: this assumes ASCII. create tables in init_sre() instead */
118
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000119static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1202, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
12225, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12324, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1240, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12524, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
126
Fredrik Lundhb389df32000-06-29 12:48:37 +0000127static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d582000-06-29 08:58:44 +000012810, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
12927, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
13044, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
13161, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
132108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
133122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
134106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
135120, 121, 122, 123, 124, 125, 126, 127 };
136
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000137#define SRE_IS_DIGIT(ch)\
138 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
139#define SRE_IS_SPACE(ch)\
140 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
141#define SRE_IS_LINEBREAK(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
143#define SRE_IS_ALNUM(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
145#define SRE_IS_WORD(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000147
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000148static unsigned int sre_lower(unsigned int ch)
149{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000150 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000151}
152
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000153/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000154/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
155 * warnings when c's type supports only numbers < N+1 */
156#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
157#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000158#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000159#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000160#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
161
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000162static unsigned int sre_lower_locale(unsigned int ch)
163{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000164 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000165}
166
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000167/* unicode-specific character predicates */
168
169#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000170
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000171#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
172#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
173#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000174#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000175#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000176
177static unsigned int sre_lower_unicode(unsigned int ch)
178{
179 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
180}
181
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000182#endif
183
Guido van Rossumb700df92000-03-31 14:59:30 +0000184LOCAL(int)
185sre_category(SRE_CODE category, unsigned int ch)
186{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000187 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000188
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000189 case SRE_CATEGORY_DIGIT:
190 return SRE_IS_DIGIT(ch);
191 case SRE_CATEGORY_NOT_DIGIT:
192 return !SRE_IS_DIGIT(ch);
193 case SRE_CATEGORY_SPACE:
194 return SRE_IS_SPACE(ch);
195 case SRE_CATEGORY_NOT_SPACE:
196 return !SRE_IS_SPACE(ch);
197 case SRE_CATEGORY_WORD:
198 return SRE_IS_WORD(ch);
199 case SRE_CATEGORY_NOT_WORD:
200 return !SRE_IS_WORD(ch);
201 case SRE_CATEGORY_LINEBREAK:
202 return SRE_IS_LINEBREAK(ch);
203 case SRE_CATEGORY_NOT_LINEBREAK:
204 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000205
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000206 case SRE_CATEGORY_LOC_WORD:
207 return SRE_LOC_IS_WORD(ch);
208 case SRE_CATEGORY_LOC_NOT_WORD:
209 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000210
211#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000212 case SRE_CATEGORY_UNI_DIGIT:
213 return SRE_UNI_IS_DIGIT(ch);
214 case SRE_CATEGORY_UNI_NOT_DIGIT:
215 return !SRE_UNI_IS_DIGIT(ch);
216 case SRE_CATEGORY_UNI_SPACE:
217 return SRE_UNI_IS_SPACE(ch);
218 case SRE_CATEGORY_UNI_NOT_SPACE:
219 return !SRE_UNI_IS_SPACE(ch);
220 case SRE_CATEGORY_UNI_WORD:
221 return SRE_UNI_IS_WORD(ch);
222 case SRE_CATEGORY_UNI_NOT_WORD:
223 return !SRE_UNI_IS_WORD(ch);
224 case SRE_CATEGORY_UNI_LINEBREAK:
225 return SRE_UNI_IS_LINEBREAK(ch);
226 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
227 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000228#else
229 case SRE_CATEGORY_UNI_DIGIT:
230 return SRE_IS_DIGIT(ch);
231 case SRE_CATEGORY_UNI_NOT_DIGIT:
232 return !SRE_IS_DIGIT(ch);
233 case SRE_CATEGORY_UNI_SPACE:
234 return SRE_IS_SPACE(ch);
235 case SRE_CATEGORY_UNI_NOT_SPACE:
236 return !SRE_IS_SPACE(ch);
237 case SRE_CATEGORY_UNI_WORD:
238 return SRE_LOC_IS_WORD(ch);
239 case SRE_CATEGORY_UNI_NOT_WORD:
240 return !SRE_LOC_IS_WORD(ch);
241 case SRE_CATEGORY_UNI_LINEBREAK:
242 return SRE_IS_LINEBREAK(ch);
243 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
244 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000245#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000246 }
247 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000248}
249
250/* helpers */
251
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000252static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000253data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000254{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000255 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000256 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000257 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000258 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000259 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000260}
261
262static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000263data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000264{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000265 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000266 minsize = state->data_stack_base+size;
267 cursize = state->data_stack_size;
268 if (cursize < minsize) {
269 void* stack;
270 cursize = minsize+minsize/4+1024;
271 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000273 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000274 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000275 return SRE_ERROR_MEMORY;
276 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000278 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000279 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000280 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000281}
282
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000283/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000284
285#define SRE_CHAR unsigned char
286#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000287#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000288#define SRE_CHARSET sre_charset
289#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000290#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000291#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000292#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000293#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000294
295#if defined(HAVE_UNICODE)
296
Guido van Rossumb700df92000-03-31 14:59:30 +0000297#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000298#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000299#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000300
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000301#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000302#undef SRE_SEARCH
303#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000304#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000305#undef SRE_INFO
306#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000307#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000308#undef SRE_AT
309#undef SRE_CHAR
310
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000311/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000312
313#define SRE_CHAR Py_UNICODE
314#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000315#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000316#define SRE_CHARSET sre_ucharset
317#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000318#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000319#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000320#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000321#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000322#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000323
324#endif /* SRE_RECURSIVE */
325
326/* -------------------------------------------------------------------- */
327/* String matching engine */
328
329/* the following section is compiled twice, with different character
330 settings */
331
332LOCAL(int)
333SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
334{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000335 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000336
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000337 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000338
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000339 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000340
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000341 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000342 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000343 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000345 case SRE_AT_BEGINNING_LINE:
346 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000347 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000349 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000350 return (((void*) (ptr+1) == state->end &&
351 SRE_IS_LINEBREAK((int) ptr[0])) ||
352 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000353
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000354 case SRE_AT_END_LINE:
355 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000356 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000357
Fredrik Lundh770617b2001-01-14 15:06:11 +0000358 case SRE_AT_END_STRING:
359 return ((void*) ptr == state->end);
360
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000361 case SRE_AT_BOUNDARY:
362 if (state->beginning == state->end)
363 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000364 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000365 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000367 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000368 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000369
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000370 case SRE_AT_NON_BOUNDARY:
371 if (state->beginning == state->end)
372 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000373 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000374 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000376 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000378
379 case SRE_AT_LOC_BOUNDARY:
380 if (state->beginning == state->end)
381 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000382 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000383 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000384 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000385 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000386 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000387
388 case SRE_AT_LOC_NON_BOUNDARY:
389 if (state->beginning == state->end)
390 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000391 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000392 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000393 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000394 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000395 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000396
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000397#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000398 case SRE_AT_UNI_BOUNDARY:
399 if (state->beginning == state->end)
400 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000401 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000402 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000403 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000404 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000405 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000406
407 case SRE_AT_UNI_NON_BOUNDARY:
408 if (state->beginning == state->end)
409 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000410 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000411 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000412 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000413 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000414 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000415#endif
416
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000417 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000418
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000419 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000420}
421
422LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000423SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000424{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000425 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000426
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000427 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000428
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000429 for (;;) {
430 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000431
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000432 case SRE_OP_FAILURE:
433 return !ok;
434
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000435 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000436 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000437 if (ch == set[0])
438 return ok;
439 set++;
440 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000441
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000442 case SRE_OP_CATEGORY:
443 /* <CATEGORY> <code> */
444 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000445 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000446 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000447 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000448
Fredrik Lundh3562f112000-07-02 12:00:07 +0000449 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000450 if (sizeof(SRE_CODE) == 2) {
451 /* <CHARSET> <bitmap> (16 bits per code word) */
452 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
453 return ok;
454 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000455 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000456 else {
457 /* <CHARSET> <bitmap> (32 bits per code word) */
458 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
459 return ok;
460 set += 8;
461 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000462 break;
463
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000464 case SRE_OP_RANGE:
465 /* <RANGE> <lower> <upper> */
466 if (set[0] <= ch && ch <= set[1])
467 return ok;
468 set += 2;
469 break;
470
471 case SRE_OP_NEGATE:
472 ok = !ok;
473 break;
474
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000475 case SRE_OP_BIGCHARSET:
476 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
477 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000478 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000479 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000480
481 if (sizeof(SRE_CODE) == 2) {
482 block = ((unsigned char*)set)[ch >> 8];
483 set += 128;
484 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
485 return ok;
486 set += count*16;
487 }
488 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000489 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
490 * warnings when c's type supports only numbers < N+1 */
491 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000492 block = ((unsigned char*)set)[ch >> 8];
493 else
494 block = -1;
495 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000496 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000497 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
498 return ok;
499 set += count*8;
500 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000501 break;
502 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000503
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000504 default:
505 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000506 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000507 return 0;
508 }
509 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000510}
511
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000512LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000514LOCAL(Py_ssize_t)
515SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000516{
517 SRE_CODE chr;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000518 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
519 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000520 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000521
522 /* adjust end */
523 if (maxcount < end - ptr && maxcount != 65535)
524 end = ptr + maxcount;
525
526 switch (pattern[0]) {
527
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000528 case SRE_OP_IN:
529 /* repeated set */
530 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
531 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
532 ptr++;
533 break;
534
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000535 case SRE_OP_ANY:
536 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000537 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000538 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
539 ptr++;
540 break;
541
542 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000543 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000544 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000545 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000546 ptr = end;
547 break;
548
549 case SRE_OP_LITERAL:
550 /* repeated literal */
551 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000552 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000553 while (ptr < end && (SRE_CODE) *ptr == chr)
554 ptr++;
555 break;
556
557 case SRE_OP_LITERAL_IGNORE:
558 /* repeated literal */
559 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000560 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000561 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
562 ptr++;
563 break;
564
565 case SRE_OP_NOT_LITERAL:
566 /* repeated non-literal */
567 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000568 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000569 while (ptr < end && (SRE_CODE) *ptr != chr)
570 ptr++;
571 break;
Tim Peters3d563502006-01-21 02:47:53 +0000572
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000573 case SRE_OP_NOT_LITERAL_IGNORE:
574 /* repeated non-literal */
575 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000576 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000577 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
578 ptr++;
579 break;
580
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581 default:
582 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000583 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000584 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000585 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000586 if (i < 0)
587 return i;
588 if (!i)
589 break;
590 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000591 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
592 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000593 return (SRE_CHAR*) state->ptr - ptr;
594 }
595
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000596 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000597 return ptr - (SRE_CHAR*) state->ptr;
598}
599
Fredrik Lundh33accc12000-08-27 20:59:47 +0000600#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000601LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000602SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
603{
604 /* check if an SRE_OP_INFO block matches at the current position.
605 returns the number of SRE_CODE objects to skip if successful, 0
606 if no match */
607
608 SRE_CHAR* end = state->end;
609 SRE_CHAR* ptr = state->ptr;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000610 Py_ssize_t i;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000611
612 /* check minimal length */
613 if (pattern[3] && (end - ptr) < pattern[3])
614 return 0;
615
616 /* check known prefix */
617 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
618 /* <length> <skip> <prefix data> <overlap data> */
619 for (i = 0; i < pattern[5]; i++)
620 if ((SRE_CODE) ptr[i] != pattern[7 + i])
621 return 0;
622 return pattern[0] + 2 * pattern[6];
623 }
624 return pattern[0];
625}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000626#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000627
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000628/* The macros below should be used to protect recursive SRE_MATCH()
629 * calls that *failed* and do *not* return immediately (IOW, those
630 * that will backtrack). Explaining:
631 *
632 * - Recursive SRE_MATCH() returned true: that's usually a success
633 * (besides atypical cases like ASSERT_NOT), therefore there's no
634 * reason to restore lastmark;
635 *
636 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
637 * is returning to the caller: If the current SRE_MATCH() is the
638 * top function of the recursion, returning false will be a matching
639 * failure, and it doesn't matter where lastmark is pointing to.
640 * If it's *not* the top function, it will be a recursive SRE_MATCH()
641 * failure by itself, and the calling SRE_MATCH() will have to deal
642 * with the failure by the same rules explained here (it will restore
643 * lastmark by itself if necessary);
644 *
645 * - Recursive SRE_MATCH() returned false, and will continue the
646 * outside 'for' loop: must be protected when breaking, since the next
647 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000648 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000649 * - Recursive SRE_MATCH() returned false, and will be called again
650 * inside a local for/while loop: must be protected between each
651 * loop iteration, since the recursive SRE_MATCH() could do anything,
652 * and could potentially depend on lastmark.
653 *
654 * For more information, check the discussion at SF patch #712900.
655 */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000656#define LASTMARK_SAVE() \
657 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000658 ctx->lastmark = state->lastmark; \
659 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000660 } while (0)
661#define LASTMARK_RESTORE() \
662 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000663 state->lastmark = ctx->lastmark; \
664 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000665 } while (0)
666
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000667#define RETURN_ERROR(i) do { return i; } while(0)
668#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
669#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
670
671#define RETURN_ON_ERROR(i) \
672 do { if (i < 0) RETURN_ERROR(i); } while (0)
673#define RETURN_ON_SUCCESS(i) \
674 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
675#define RETURN_ON_FAILURE(i) \
676 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
677
678#define SFY(x) #x
679
680#define DATA_STACK_ALLOC(state, type, ptr) \
681do { \
682 alloc_pos = state->data_stack_base; \
683 TRACE(("allocating %s in %d (%d)\n", \
684 SFY(type), alloc_pos, sizeof(type))); \
685 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
686 int j = data_stack_grow(state, sizeof(type)); \
687 if (j < 0) return j; \
688 if (ctx_pos != -1) \
689 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
690 } \
691 ptr = (type*)(state->data_stack+alloc_pos); \
692 state->data_stack_base += sizeof(type); \
693} while (0)
694
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000695#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
696do { \
697 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
698 ptr = (type*)(state->data_stack+pos); \
699} while (0)
700
701#define DATA_STACK_PUSH(state, data, size) \
702do { \
703 TRACE(("copy data in %p to %d (%d)\n", \
704 data, state->data_stack_base, size)); \
705 if (state->data_stack_size < state->data_stack_base+size) { \
706 int j = data_stack_grow(state, size); \
707 if (j < 0) return j; \
708 if (ctx_pos != -1) \
709 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
710 } \
711 memcpy(state->data_stack+state->data_stack_base, data, size); \
712 state->data_stack_base += size; \
713} while (0)
714
715#define DATA_STACK_POP(state, data, size, discard) \
716do { \
717 TRACE(("copy data to %p from %d (%d)\n", \
718 data, state->data_stack_base-size, size)); \
719 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
720 if (discard) \
721 state->data_stack_base -= size; \
722} while (0)
723
724#define DATA_STACK_POP_DISCARD(state, size) \
725do { \
726 TRACE(("discard data from %d (%d)\n", \
727 state->data_stack_base-size, size)); \
728 state->data_stack_base -= size; \
729} while(0)
730
731#define DATA_PUSH(x) \
732 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
733#define DATA_POP(x) \
734 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000735#define DATA_POP_DISCARD(x) \
736 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
737#define DATA_ALLOC(t,p) \
738 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000739#define DATA_LOOKUP_AT(t,p,pos) \
740 DATA_STACK_LOOKUP_AT(state,t,p,pos)
741
742#define MARK_PUSH(lastmark) \
743 do if (lastmark > 0) { \
744 i = lastmark; /* ctx->lastmark may change if reallocated */ \
745 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
746 } while (0)
747#define MARK_POP(lastmark) \
748 do if (lastmark > 0) { \
749 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
750 } while (0)
751#define MARK_POP_KEEP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
754 } while (0)
755#define MARK_POP_DISCARD(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
758 } while (0)
759
760#define JUMP_NONE 0
761#define JUMP_MAX_UNTIL_1 1
762#define JUMP_MAX_UNTIL_2 2
763#define JUMP_MAX_UNTIL_3 3
764#define JUMP_MIN_UNTIL_1 4
765#define JUMP_MIN_UNTIL_2 5
766#define JUMP_MIN_UNTIL_3 6
767#define JUMP_REPEAT 7
768#define JUMP_REPEAT_ONE_1 8
769#define JUMP_REPEAT_ONE_2 9
770#define JUMP_MIN_REPEAT_ONE 10
771#define JUMP_BRANCH 11
772#define JUMP_ASSERT 12
773#define JUMP_ASSERT_NOT 13
774
775#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
776 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
777 nextctx->last_ctx_pos = ctx_pos; \
778 nextctx->jump = jumpvalue; \
779 nextctx->pattern = nextpattern; \
780 ctx_pos = alloc_pos; \
781 ctx = nextctx; \
782 goto entrance; \
783 jumplabel: \
784 while (0) /* gcc doesn't like labels at end of scopes */ \
785
786typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000787 Py_ssize_t last_ctx_pos;
788 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000789 SRE_CHAR* ptr;
790 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000791 Py_ssize_t count;
792 Py_ssize_t lastmark;
793 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000794 union {
795 SRE_CODE chr;
796 SRE_REPEAT* rep;
797 } u;
798} SRE_MATCH_CONTEXT;
799
800/* check if string matches the given pattern. returns <0 for
801 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000802LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000803SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000804{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000805 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000806 Py_ssize_t alloc_pos, ctx_pos = -1;
807 Py_ssize_t i, ret = 0;
808 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000809 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000810
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000811 SRE_MATCH_CONTEXT* ctx;
812 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000813
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000814 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000815
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000816 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
817 ctx->last_ctx_pos = -1;
818 ctx->jump = JUMP_NONE;
819 ctx->pattern = pattern;
820 ctx_pos = alloc_pos;
821
822entrance:
823
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000824 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000825
826 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000827 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000828 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000830 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 (end - ctx->ptr), ctx->pattern[3]));
832 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000833 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000834 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000835 }
836
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000837 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000838 ++sigcount;
839 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
840 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000841
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000842 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000843
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000844 case SRE_OP_MARK:
845 /* set mark */
846 /* <MARK> <gid> */
847 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
848 ctx->ptr, ctx->pattern[0]));
849 i = ctx->pattern[0];
850 if (i & 1)
851 state->lastindex = i/2 + 1;
852 if (i > state->lastmark) {
853 /* state->lastmark is the highest valid index in the
854 state->mark array. If it is increased by more than 1,
855 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000856 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000857 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000858 while (j < i)
859 state->mark[j++] = NULL;
860 state->lastmark = i;
861 }
862 state->mark[i] = ctx->ptr;
863 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000864 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000865
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000866 case SRE_OP_LITERAL:
867 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000868 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000869 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
870 ctx->ptr, *ctx->pattern));
871 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
872 RETURN_FAILURE;
873 ctx->pattern++;
874 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000875 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000876
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000877 case SRE_OP_NOT_LITERAL:
878 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000879 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000880 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
881 ctx->ptr, *ctx->pattern));
882 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
883 RETURN_FAILURE;
884 ctx->pattern++;
885 ctx->ptr++;
886 break;
887
888 case SRE_OP_SUCCESS:
889 /* end of pattern */
890 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
891 state->ptr = ctx->ptr;
892 RETURN_SUCCESS;
893
894 case SRE_OP_AT:
895 /* match at given position */
896 /* <AT> <code> */
897 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
898 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
899 RETURN_FAILURE;
900 ctx->pattern++;
901 break;
902
903 case SRE_OP_CATEGORY:
904 /* match at given category */
905 /* <CATEGORY> <code> */
906 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
907 ctx->ptr, *ctx->pattern));
908 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
909 RETURN_FAILURE;
910 ctx->pattern++;
911 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000912 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000913
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000914 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000915 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000916 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000917 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
918 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
919 RETURN_FAILURE;
920 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000921 break;
922
923 case SRE_OP_ANY_ALL:
924 /* match anything */
925 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000926 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
927 if (ctx->ptr >= end)
928 RETURN_FAILURE;
929 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000930 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000931
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000932 case SRE_OP_IN:
933 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000934 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000935 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
936 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
937 RETURN_FAILURE;
938 ctx->pattern += ctx->pattern[0];
939 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000940 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000941
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000942 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000943 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
944 ctx->pattern, ctx->ptr, ctx->pattern[0]));
945 if (ctx->ptr >= end ||
946 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
947 RETURN_FAILURE;
948 ctx->pattern++;
949 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000950 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000951
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000952 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000953 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
954 ctx->pattern, ctx->ptr, *ctx->pattern));
955 if (ctx->ptr >= end ||
956 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
957 RETURN_FAILURE;
958 ctx->pattern++;
959 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000960 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000961
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000962 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000963 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
964 if (ctx->ptr >= end
965 || !SRE_CHARSET(ctx->pattern+1,
966 (SRE_CODE)state->lower(*ctx->ptr)))
967 RETURN_FAILURE;
968 ctx->pattern += ctx->pattern[0];
969 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000970 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000971
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000972 case SRE_OP_JUMP:
973 case SRE_OP_INFO:
974 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000975 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000976 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
977 ctx->ptr, ctx->pattern[0]));
978 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000979 break;
980
981 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000982 /* alternation */
983 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000984 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000985 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000986 ctx->u.rep = state->repeat;
987 if (ctx->u.rep)
988 MARK_PUSH(ctx->lastmark);
989 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
990 if (ctx->pattern[1] == SRE_OP_LITERAL &&
991 (ctx->ptr >= end ||
992 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000993 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000994 if (ctx->pattern[1] == SRE_OP_IN &&
995 (ctx->ptr >= end ||
996 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000997 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000998 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000999 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001000 if (ret) {
1001 if (ctx->u.rep)
1002 MARK_POP_DISCARD(ctx->lastmark);
1003 RETURN_ON_ERROR(ret);
1004 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001005 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001006 if (ctx->u.rep)
1007 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001008 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001009 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001010 if (ctx->u.rep)
1011 MARK_POP_DISCARD(ctx->lastmark);
1012 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001013
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001014 case SRE_OP_REPEAT_ONE:
1015 /* match repeated sequence (maximizing regexp) */
1016
1017 /* this operator only works if the repeated item is
1018 exactly one character wide, and we're not already
1019 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001020 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001021
1022 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1023
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1025 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 if (ctx->ptr + ctx->pattern[1] > end)
1028 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001029
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001030 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001032 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1033 RETURN_ON_ERROR(ret);
1034 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1035 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037
1038 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001039 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040 string. check if the rest of the pattern matches,
1041 and backtrack if not. */
1042
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001043 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001044 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001045
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001046 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001047 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001048 state->ptr = ctx->ptr;
1049 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001050 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001051
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001052 LASTMARK_SAVE();
1053
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055 /* tail starts with a literal. skip positions where
1056 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001057 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001058 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001059 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001060 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1061 ctx->ptr--;
1062 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001063 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001064 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001065 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001066 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001067 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1068 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001069 if (ret) {
1070 RETURN_ON_ERROR(ret);
1071 RETURN_SUCCESS;
1072 }
Tim Peters3d563502006-01-21 02:47:53 +00001073
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001074 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001075
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001076 ctx->ptr--;
1077 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001078 }
1079
1080 } else {
1081 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001082 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001083 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001084 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1085 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001086 if (ret) {
1087 RETURN_ON_ERROR(ret);
1088 RETURN_SUCCESS;
1089 }
1090 ctx->ptr--;
1091 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001092 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001093 }
1094 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001095 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001096
Guido van Rossum41c99e72003-04-14 17:59:34 +00001097 case SRE_OP_MIN_REPEAT_ONE:
1098 /* match repeated sequence (minimizing regexp) */
1099
1100 /* this operator only works if the repeated item is
1101 exactly one character wide, and we're not already
1102 collecting backtracking points. for other cases,
1103 use the MIN_REPEAT operator */
1104
1105 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1106
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001107 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1108 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 if (ctx->ptr + ctx->pattern[1] > end)
1111 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001112
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001113 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001114
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001115 if (ctx->pattern[1] == 0)
1116 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001117 else {
1118 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001119 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1120 RETURN_ON_ERROR(ret);
1121 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001122 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001123 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 RETURN_FAILURE;
1125 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001126 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 }
1129
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001130 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001132 state->ptr = ctx->ptr;
1133 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001134
1135 } else {
1136 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001137 LASTMARK_SAVE();
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001138 while ((Py_ssize_t)ctx->pattern[2] == 65535
1139 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001141 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1142 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001143 if (ret) {
1144 RETURN_ON_ERROR(ret);
1145 RETURN_SUCCESS;
1146 }
1147 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001148 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001149 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001150 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001151 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001152 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001153 assert(ret == 1);
1154 ctx->ptr++;
1155 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001156 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001157 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001158 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001159 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001160
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001161 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001162 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001163 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001165 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1166 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001167
1168 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001169 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001170 if (!ctx->u.rep) {
1171 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001172 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001173 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001174 ctx->u.rep->count = -1;
1175 ctx->u.rep->pattern = ctx->pattern;
1176 ctx->u.rep->prev = state->repeat;
1177 ctx->u.rep->last_ptr = NULL;
1178 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001179
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001180 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001181 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001182 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001183 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001184
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001185 if (ret) {
1186 RETURN_ON_ERROR(ret);
1187 RETURN_SUCCESS;
1188 }
1189 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001190
1191 case SRE_OP_MAX_UNTIL:
1192 /* maximizing repeat */
1193 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1194
1195 /* FIXME: we probably need to deal with zero-width
1196 matches in here... */
1197
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001198 ctx->u.rep = state->repeat;
1199 if (!ctx->u.rep)
1200 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001201
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1207 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001208
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001210 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001212 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1213 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001214 if (ret) {
1215 RETURN_ON_ERROR(ret);
1216 RETURN_SUCCESS;
1217 }
1218 ctx->u.rep->count = ctx->count-1;
1219 state->ptr = ctx->ptr;
1220 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001221 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001222
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001223 if ((ctx->count < ctx->u.rep->pattern[2] ||
1224 ctx->u.rep->pattern[2] == 65535) &&
1225 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001226 /* we may have enough matches, but if we can
1227 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001228 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001229 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001230 MARK_PUSH(ctx->lastmark);
1231 /* zero-width match protection */
1232 DATA_PUSH(&ctx->u.rep->last_ptr);
1233 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001234 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1235 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001236 DATA_POP(&ctx->u.rep->last_ptr);
1237 if (ret) {
1238 MARK_POP_DISCARD(ctx->lastmark);
1239 RETURN_ON_ERROR(ret);
1240 RETURN_SUCCESS;
1241 }
1242 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001243 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001244 ctx->u.rep->count = ctx->count-1;
1245 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001246 }
1247
1248 /* cannot match more repeated items here. make sure the
1249 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001250 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001251 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001252 RETURN_ON_SUCCESS(ret);
1253 state->repeat = ctx->u.rep;
1254 state->ptr = ctx->ptr;
1255 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001256
1257 case SRE_OP_MIN_UNTIL:
1258 /* minimizing repeat */
1259 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1260
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001261 ctx->u.rep = state->repeat;
1262 if (!ctx->u.rep)
1263 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001264
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001266
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001267 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1270 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001271
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001272 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001273 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001274 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001275 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1276 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001277 if (ret) {
1278 RETURN_ON_ERROR(ret);
1279 RETURN_SUCCESS;
1280 }
1281 ctx->u.rep->count = ctx->count-1;
1282 state->ptr = ctx->ptr;
1283 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001284 }
1285
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001286 LASTMARK_SAVE();
1287
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001288 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001289 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001290 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001291 if (ret) {
1292 RETURN_ON_ERROR(ret);
1293 RETURN_SUCCESS;
1294 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001295
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001296 state->repeat = ctx->u.rep;
1297 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001298
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001299 LASTMARK_RESTORE();
1300
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001301 if (ctx->count >= ctx->u.rep->pattern[2]
1302 && ctx->u.rep->pattern[2] != 65535)
1303 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001304
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001305 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001306 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1307 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001308 if (ret) {
1309 RETURN_ON_ERROR(ret);
1310 RETURN_SUCCESS;
1311 }
1312 ctx->u.rep->count = ctx->count-1;
1313 state->ptr = ctx->ptr;
1314 RETURN_FAILURE;
1315
1316 case SRE_OP_GROUPREF:
1317 /* match backreference */
1318 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1319 ctx->ptr, ctx->pattern[0]));
1320 i = ctx->pattern[0];
1321 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001322 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001323 if (groupref >= state->lastmark) {
1324 RETURN_FAILURE;
1325 } else {
1326 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1327 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1328 if (!p || !e || e < p)
1329 RETURN_FAILURE;
1330 while (p < e) {
1331 if (ctx->ptr >= end || *ctx->ptr != *p)
1332 RETURN_FAILURE;
1333 p++; ctx->ptr++;
1334 }
1335 }
1336 }
1337 ctx->pattern++;
1338 break;
1339
1340 case SRE_OP_GROUPREF_IGNORE:
1341 /* match backreference */
1342 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1343 ctx->ptr, ctx->pattern[0]));
1344 i = ctx->pattern[0];
1345 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001346 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001347 if (groupref >= state->lastmark) {
1348 RETURN_FAILURE;
1349 } else {
1350 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1351 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1352 if (!p || !e || e < p)
1353 RETURN_FAILURE;
1354 while (p < e) {
1355 if (ctx->ptr >= end ||
1356 state->lower(*ctx->ptr) != state->lower(*p))
1357 RETURN_FAILURE;
1358 p++; ctx->ptr++;
1359 }
1360 }
1361 }
1362 ctx->pattern++;
1363 break;
1364
1365 case SRE_OP_GROUPREF_EXISTS:
1366 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1367 ctx->ptr, ctx->pattern[0]));
1368 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1369 i = ctx->pattern[0];
1370 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001371 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001372 if (groupref >= state->lastmark) {
1373 ctx->pattern += ctx->pattern[1];
1374 break;
1375 } else {
1376 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1377 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1378 if (!p || !e || e < p) {
1379 ctx->pattern += ctx->pattern[1];
1380 break;
1381 }
1382 }
1383 }
1384 ctx->pattern += 2;
1385 break;
1386
1387 case SRE_OP_ASSERT:
1388 /* assert subpattern */
1389 /* <ASSERT> <skip> <back> <pattern> */
1390 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1391 ctx->ptr, ctx->pattern[1]));
1392 state->ptr = ctx->ptr - ctx->pattern[1];
1393 if (state->ptr < state->beginning)
1394 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001395 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001396 RETURN_ON_FAILURE(ret);
1397 ctx->pattern += ctx->pattern[0];
1398 break;
1399
1400 case SRE_OP_ASSERT_NOT:
1401 /* assert not subpattern */
1402 /* <ASSERT_NOT> <skip> <back> <pattern> */
1403 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1404 ctx->ptr, ctx->pattern[1]));
1405 state->ptr = ctx->ptr - ctx->pattern[1];
1406 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001407 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001408 if (ret) {
1409 RETURN_ON_ERROR(ret);
1410 RETURN_FAILURE;
1411 }
1412 }
1413 ctx->pattern += ctx->pattern[0];
1414 break;
1415
1416 case SRE_OP_FAILURE:
1417 /* immediate failure */
1418 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1419 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001420
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001421 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001422 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1423 ctx->pattern[-1]));
1424 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001425 }
1426 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001427
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001428exit:
1429 ctx_pos = ctx->last_ctx_pos;
1430 jump = ctx->jump;
1431 DATA_POP_DISCARD(ctx);
1432 if (ctx_pos == -1)
1433 return ret;
1434 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001436 switch (jump) {
1437 case JUMP_MAX_UNTIL_2:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1439 goto jump_max_until_2;
1440 case JUMP_MAX_UNTIL_3:
1441 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1442 goto jump_max_until_3;
1443 case JUMP_MIN_UNTIL_2:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1445 goto jump_min_until_2;
1446 case JUMP_MIN_UNTIL_3:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1448 goto jump_min_until_3;
1449 case JUMP_BRANCH:
1450 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1451 goto jump_branch;
1452 case JUMP_MAX_UNTIL_1:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_max_until_1;
1455 case JUMP_MIN_UNTIL_1:
1456 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1457 goto jump_min_until_1;
1458 case JUMP_REPEAT:
1459 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1460 goto jump_repeat;
1461 case JUMP_REPEAT_ONE_1:
1462 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1463 goto jump_repeat_one_1;
1464 case JUMP_REPEAT_ONE_2:
1465 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1466 goto jump_repeat_one_2;
1467 case JUMP_MIN_REPEAT_ONE:
1468 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1469 goto jump_min_repeat_one;
1470 case JUMP_ASSERT:
1471 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1472 goto jump_assert;
1473 case JUMP_ASSERT_NOT:
1474 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1475 goto jump_assert_not;
1476 case JUMP_NONE:
1477 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1478 break;
1479 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001480
1481 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001482}
1483
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001484LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001485SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1486{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001487 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1488 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001489 Py_ssize_t status = 0;
1490 Py_ssize_t prefix_len = 0;
1491 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001492 SRE_CODE* prefix = NULL;
1493 SRE_CODE* charset = NULL;
1494 SRE_CODE* overlap = NULL;
1495 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001496
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001497 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001498 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001499 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001500
1501 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001502
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001503 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001504 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001505 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001506 end -= pattern[3]-1;
1507 if (end <= ptr)
1508 end = ptr+1;
1509 }
1510
Fredrik Lundh3562f112000-07-02 12:00:07 +00001511 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001512 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001513 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001514 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001515 prefix_skip = pattern[6];
1516 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001517 overlap = prefix + prefix_len - 1;
1518 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001519 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001520 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001521 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001522
1523 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001524 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001525
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001526 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1527 TRACE(("charset = %p\n", charset));
1528
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001529#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001530 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001531 /* pattern starts with a known prefix. use the overlap
1532 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001533 Py_ssize_t i = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001534 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001535 while (ptr < end) {
1536 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001537 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001538 if (!i)
1539 break;
1540 else
1541 i = overlap[i];
1542 } else {
1543 if (++i == prefix_len) {
1544 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001545 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1546 state->start = ptr + 1 - prefix_len;
1547 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001548 if (flags & SRE_INFO_LITERAL)
1549 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001550 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001551 if (status != 0)
1552 return status;
1553 /* close but no cigar -- try again */
1554 i = overlap[i];
1555 }
1556 break;
1557 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001558 }
1559 ptr++;
1560 }
1561 return 0;
1562 }
1563#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001564
Fredrik Lundh3562f112000-07-02 12:00:07 +00001565 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001566 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001567 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001568 SRE_CODE chr = pattern[1];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001569 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001570 for (;;) {
1571 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1572 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001573 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001574 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001575 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 state->start = ptr;
1577 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001578 if (flags & SRE_INFO_LITERAL)
1579 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001580 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001581 if (status != 0)
1582 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001583 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 } else if (charset) {
1585 /* pattern starts with a character from a known set */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001586 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001587 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001588 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001589 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001590 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001592 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 state->start = ptr;
1594 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001595 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 if (status != 0)
1597 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001598 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 }
1600 } else
1601 /* general case */
1602 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001603 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001604 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001605 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001606 if (status != 0)
1607 break;
1608 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001609
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001610 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001611}
Tim Peters3d563502006-01-21 02:47:53 +00001612
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001613LOCAL(int)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001614SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001615{
1616 /* check if given string is a literal template (i.e. no escapes) */
1617 while (len-- > 0)
1618 if (*ptr++ == '\\')
1619 return 0;
1620 return 1;
1621}
Guido van Rossumb700df92000-03-31 14:59:30 +00001622
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001623#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001624
1625/* -------------------------------------------------------------------- */
1626/* factories and destructors */
1627
1628/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001629static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1630static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001631
1632static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001633sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001634{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001635 return Py_BuildValue("l", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001636}
1637
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001638static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001639sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001640{
1641 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001642 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001643 return NULL;
1644 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001645 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001646 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001647#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001648 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001649#else
1650 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001651#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001652 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001653}
1654
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655LOCAL(void)
1656state_reset(SRE_STATE* state)
1657{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001658 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001659 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001660
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001661 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001662 state->lastindex = -1;
1663
1664 state->repeat = NULL;
1665
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001666 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001667}
1668
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001669static void*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001670getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001671{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001672 /* given a python object, return a data pointer, a length (in
1673 characters), and a character size. return NULL if the object
1674 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001675
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001677 Py_ssize_t size, bytes;
1678 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001679 void* ptr;
Travis E. Oliphant8ae62b62007-09-23 02:00:13 +00001680 Py_buffer view;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001681
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001682 /* Unicode objects do not support the buffer API. So, get the data
1683 directly instead. */
1684 if (PyUnicode_Check(string)) {
1685 ptr = (void *)PyUnicode_AS_DATA(string);
1686 *p_length = PyUnicode_GET_SIZE(string);
1687 *p_charsize = sizeof(Py_UNICODE);
1688 return ptr;
1689 }
1690
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001691 /* get pointer to string buffer */
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001692 view.len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001693 buffer = Py_TYPE(string)->tp_as_buffer;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001694 if (!buffer || !buffer->bf_getbuffer ||
1695 (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
1696 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1697 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001698 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001699
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001700 /* determine buffer size */
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001701 bytes = view.len;
1702 ptr = view.buf;
1703
1704 /* Release the buffer immediately --- possibly dangerous
1705 but doing something else would require some re-factoring
1706 */
Martin v. Löwis423be952008-08-13 15:53:07 +00001707 PyBuffer_Release(&view);
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001708
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001709 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001710 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1711 return NULL;
1712 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001713
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001714 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001715 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001716
Christian Heimes72b710a2008-05-26 13:28:38 +00001717 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001718 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001719#if defined(HAVE_UNICODE)
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001720 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001721 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001722#endif
1723 else {
1724 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1725 return NULL;
1726 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001727
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001728 *p_length = size;
1729 *p_charsize = charsize;
1730
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001731 if (ptr == NULL) {
1732 PyErr_SetString(PyExc_ValueError,
1733 "Buffer is NULL");
1734 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001735 return ptr;
1736}
1737
1738LOCAL(PyObject*)
1739state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001740 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001741{
1742 /* prepare state object */
1743
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001744 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001745 int charsize;
1746 void* ptr;
1747
1748 memset(state, 0, sizeof(SRE_STATE));
1749
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001750 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001751 state->lastindex = -1;
1752
1753 ptr = getstring(string, &length, &charsize);
1754 if (!ptr)
1755 return NULL;
1756
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001757 /* adjust boundaries */
1758 if (start < 0)
1759 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001760 else if (start > length)
1761 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001762
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001763 if (end < 0)
1764 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001765 else if (end > length)
1766 end = length;
1767
1768 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001769
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001770 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001771
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001772 state->start = (void*) ((char*) ptr + start * state->charsize);
1773 state->end = (void*) ((char*) ptr + end * state->charsize);
1774
1775 Py_INCREF(string);
1776 state->string = string;
1777 state->pos = start;
1778 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001779
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001780 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001781 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001782 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001783#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001784 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001785#else
1786 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001787#endif
1788 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001789 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001790
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001791 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001792}
1793
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001794LOCAL(void)
1795state_fini(SRE_STATE* state)
1796{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001797 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001798 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001799}
1800
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001801/* calculate offset from start of string */
1802#define STATE_OFFSET(state, member)\
1803 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1804
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001805LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001806state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001807{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001808 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001809
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001810 index = (index - 1) * 2;
1811
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001812 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001813 if (empty)
1814 /* want empty string */
1815 i = j = 0;
1816 else {
1817 Py_INCREF(Py_None);
1818 return Py_None;
1819 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001820 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001821 i = STATE_OFFSET(state, state->mark[index]);
1822 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001823 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001824
Fredrik Lundh58100642000-08-09 09:14:35 +00001825 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001826}
1827
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001828static void
1829pattern_error(int status)
1830{
1831 switch (status) {
1832 case SRE_ERROR_RECURSION_LIMIT:
1833 PyErr_SetString(
1834 PyExc_RuntimeError,
1835 "maximum recursion limit exceeded"
1836 );
1837 break;
1838 case SRE_ERROR_MEMORY:
1839 PyErr_NoMemory();
1840 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001841 case SRE_ERROR_INTERRUPTED:
1842 /* An exception has already been raised, so let it fly */
1843 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001844 default:
1845 /* other error codes indicate compiler/engine bugs */
1846 PyErr_SetString(
1847 PyExc_RuntimeError,
1848 "internal error in regular expression engine"
1849 );
1850 }
1851}
1852
Guido van Rossumb700df92000-03-31 14:59:30 +00001853static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001854pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001855{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001856 if (self->weakreflist != NULL)
1857 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001858 Py_XDECREF(self->pattern);
1859 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001860 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001861 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001862}
1863
1864static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001865pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001866{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001867 SRE_STATE state;
1868 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001869
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001870 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001871 Py_ssize_t start = 0;
1872 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001873 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001874 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001875 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001876 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001877
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001878 string = state_init(&state, self, string, start, end);
1879 if (!string)
1880 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001881
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001882 state.ptr = state.start;
1883
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001884 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1885
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001886 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001887 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001888 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001889#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001890 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001891#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001892 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001893
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001894 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001895 if (PyErr_Occurred())
1896 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001897
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001898 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001899
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001900 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001901}
1902
1903static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001904pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001905{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001906 SRE_STATE state;
1907 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001909 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001910 Py_ssize_t start = 0;
1911 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001912 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001913 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001914 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001915 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001916
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001917 string = state_init(&state, self, string, start, end);
1918 if (!string)
1919 return NULL;
1920
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001921 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1922
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001923 if (state.charsize == 1) {
1924 status = sre_search(&state, PatternObject_GetCode(self));
1925 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001926#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001927 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001928#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001929 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001930
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001931 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1932
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001933 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001934
Thomas Wouters89f507f2006-12-13 04:49:30 +00001935 if (PyErr_Occurred())
1936 return NULL;
1937
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001938 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001939}
1940
1941static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001942call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001943{
1944 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001945 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001946 PyObject* func;
1947 PyObject* result;
1948
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001949 if (!args)
1950 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001951 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001952 if (!name)
1953 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001954 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001955 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001956 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001958 func = PyObject_GetAttrString(mod, function);
1959 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001960 if (!func)
1961 return NULL;
1962 result = PyObject_CallObject(func, args);
1963 Py_DECREF(func);
1964 Py_DECREF(args);
1965 return result;
1966}
1967
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001968#ifdef USE_BUILTIN_COPY
1969static int
1970deepcopy(PyObject** object, PyObject* memo)
1971{
1972 PyObject* copy;
1973
1974 copy = call(
1975 "copy", "deepcopy",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001976 PyTuple_Pack(2, *object, memo)
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001977 );
1978 if (!copy)
1979 return 0;
1980
1981 Py_DECREF(*object);
1982 *object = copy;
1983
1984 return 1; /* success */
1985}
1986#endif
1987
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001988static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00001989join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001990{
1991 /* join list elements */
1992
1993 PyObject* joiner;
1994#if PY_VERSION_HEX >= 0x01060000
1995 PyObject* function;
1996 PyObject* args;
1997#endif
1998 PyObject* result;
1999
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002000 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002001 if (!joiner)
2002 return NULL;
2003
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002004 if (PyList_GET_SIZE(list) == 0) {
2005 Py_DECREF(list);
2006 return joiner;
2007 }
2008
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002009#if PY_VERSION_HEX >= 0x01060000
2010 function = PyObject_GetAttrString(joiner, "join");
2011 if (!function) {
2012 Py_DECREF(joiner);
2013 return NULL;
2014 }
2015 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002016 if (!args) {
2017 Py_DECREF(function);
2018 Py_DECREF(joiner);
2019 return NULL;
2020 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002021 PyTuple_SET_ITEM(args, 0, list);
2022 result = PyObject_CallObject(function, args);
2023 Py_DECREF(args); /* also removes list */
2024 Py_DECREF(function);
2025#else
2026 result = call(
2027 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002028 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002029 );
2030#endif
2031 Py_DECREF(joiner);
2032
2033 return result;
2034}
2035
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002036static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002037pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002038{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002039 SRE_STATE state;
2040 PyObject* list;
2041 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002042 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002043
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002044 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002045 Py_ssize_t start = 0;
2046 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002047 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002048 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002049 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002051
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002052 string = state_init(&state, self, string, start, end);
2053 if (!string)
2054 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002055
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002056 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002057 if (!list) {
2058 state_fini(&state);
2059 return NULL;
2060 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002061
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002062 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002064 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002065
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002066 state_reset(&state);
2067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 state.ptr = state.start;
2069
2070 if (state.charsize == 1) {
2071 status = sre_search(&state, PatternObject_GetCode(self));
2072 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002073#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002074 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002075#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002076 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002077
Thomas Wouters89f507f2006-12-13 04:49:30 +00002078 if (PyErr_Occurred())
2079 goto error;
2080
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002081 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002082 if (status == 0)
2083 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002084 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002085 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002086 }
Tim Peters3d563502006-01-21 02:47:53 +00002087
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002088 /* don't bother to build a match object */
2089 switch (self->groups) {
2090 case 0:
2091 b = STATE_OFFSET(&state, state.start);
2092 e = STATE_OFFSET(&state, state.ptr);
2093 item = PySequence_GetSlice(string, b, e);
2094 if (!item)
2095 goto error;
2096 break;
2097 case 1:
2098 item = state_getslice(&state, 1, string, 1);
2099 if (!item)
2100 goto error;
2101 break;
2102 default:
2103 item = PyTuple_New(self->groups);
2104 if (!item)
2105 goto error;
2106 for (i = 0; i < self->groups; i++) {
2107 PyObject* o = state_getslice(&state, i+1, string, 1);
2108 if (!o) {
2109 Py_DECREF(item);
2110 goto error;
2111 }
2112 PyTuple_SET_ITEM(item, i, o);
2113 }
2114 break;
2115 }
2116
2117 status = PyList_Append(list, item);
2118 Py_DECREF(item);
2119 if (status < 0)
2120 goto error;
2121
2122 if (state.ptr == state.start)
2123 state.start = (void*) ((char*) state.ptr + state.charsize);
2124 else
2125 state.start = state.ptr;
2126
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002127 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002128
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002129 state_fini(&state);
2130 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002131
2132error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002133 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002134 state_fini(&state);
2135 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002136
Guido van Rossumb700df92000-03-31 14:59:30 +00002137}
2138
Fredrik Lundh703ce812001-10-24 22:16:30 +00002139#if PY_VERSION_HEX >= 0x02020000
2140static PyObject*
2141pattern_finditer(PatternObject* pattern, PyObject* args)
2142{
2143 PyObject* scanner;
2144 PyObject* search;
2145 PyObject* iterator;
2146
2147 scanner = pattern_scanner(pattern, args);
2148 if (!scanner)
2149 return NULL;
2150
2151 search = PyObject_GetAttrString(scanner, "search");
2152 Py_DECREF(scanner);
2153 if (!search)
2154 return NULL;
2155
2156 iterator = PyCallIter_New(search, Py_None);
2157 Py_DECREF(search);
2158
2159 return iterator;
2160}
2161#endif
2162
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002163static PyObject*
2164pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2165{
2166 SRE_STATE state;
2167 PyObject* list;
2168 PyObject* item;
2169 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002170 Py_ssize_t n;
2171 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002172 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002173
2174 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002175 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002176 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002177 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002178 &string, &maxsplit))
2179 return NULL;
2180
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002181 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002182 if (!string)
2183 return NULL;
2184
2185 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002186 if (!list) {
2187 state_fini(&state);
2188 return NULL;
2189 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002190
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002191 n = 0;
2192 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002193
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002194 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002195
2196 state_reset(&state);
2197
2198 state.ptr = state.start;
2199
2200 if (state.charsize == 1) {
2201 status = sre_search(&state, PatternObject_GetCode(self));
2202 } else {
2203#if defined(HAVE_UNICODE)
2204 status = sre_usearch(&state, PatternObject_GetCode(self));
2205#endif
2206 }
2207
Thomas Wouters89f507f2006-12-13 04:49:30 +00002208 if (PyErr_Occurred())
2209 goto error;
2210
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002211 if (status <= 0) {
2212 if (status == 0)
2213 break;
2214 pattern_error(status);
2215 goto error;
2216 }
Tim Peters3d563502006-01-21 02:47:53 +00002217
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002218 if (state.start == state.ptr) {
2219 if (last == state.end)
2220 break;
2221 /* skip one character */
2222 state.start = (void*) ((char*) state.ptr + state.charsize);
2223 continue;
2224 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002225
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002226 /* get segment before this match */
2227 item = PySequence_GetSlice(
2228 string, STATE_OFFSET(&state, last),
2229 STATE_OFFSET(&state, state.start)
2230 );
2231 if (!item)
2232 goto error;
2233 status = PyList_Append(list, item);
2234 Py_DECREF(item);
2235 if (status < 0)
2236 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002237
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002238 /* add groups (if any) */
2239 for (i = 0; i < self->groups; i++) {
2240 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002241 if (!item)
2242 goto error;
2243 status = PyList_Append(list, item);
2244 Py_DECREF(item);
2245 if (status < 0)
2246 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002247 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002248
2249 n = n + 1;
2250
2251 last = state.start = state.ptr;
2252
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002253 }
2254
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002255 /* get segment following last match (even if empty) */
2256 item = PySequence_GetSlice(
2257 string, STATE_OFFSET(&state, last), state.endpos
2258 );
2259 if (!item)
2260 goto error;
2261 status = PyList_Append(list, item);
2262 Py_DECREF(item);
2263 if (status < 0)
2264 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002265
2266 state_fini(&state);
2267 return list;
2268
2269error:
2270 Py_DECREF(list);
2271 state_fini(&state);
2272 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002273
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002274}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002275
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002276static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002277pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002278 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002279{
2280 SRE_STATE state;
2281 PyObject* list;
2282 PyObject* item;
2283 PyObject* filter;
2284 PyObject* args;
2285 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002286 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002287 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002288 Py_ssize_t n;
2289 Py_ssize_t i, b, e;
2290 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002291 int filter_is_callable;
2292
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002293 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002294 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002295 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002296 Py_INCREF(filter);
2297 filter_is_callable = 1;
2298 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002299 /* if not callable, check if it's a literal string */
2300 int literal;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002301 ptr = getstring(ptemplate, &n, &bint);
2302 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002303 if (ptr) {
2304 if (b == 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002305 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002306 } else {
2307#if defined(HAVE_UNICODE)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002308 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002309#endif
2310 }
2311 } else {
2312 PyErr_Clear();
2313 literal = 0;
2314 }
2315 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002316 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002317 Py_INCREF(filter);
2318 filter_is_callable = 0;
2319 } else {
2320 /* not a literal; hand it over to the template compiler */
2321 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002322 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002323 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002324 );
2325 if (!filter)
2326 return NULL;
2327 filter_is_callable = PyCallable_Check(filter);
2328 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002329 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002330
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002331 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002332 if (!string) {
2333 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002334 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002335 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002336
2337 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002338 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002339 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002340 state_fini(&state);
2341 return NULL;
2342 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002343
2344 n = i = 0;
2345
2346 while (!count || n < count) {
2347
2348 state_reset(&state);
2349
2350 state.ptr = state.start;
2351
2352 if (state.charsize == 1) {
2353 status = sre_search(&state, PatternObject_GetCode(self));
2354 } else {
2355#if defined(HAVE_UNICODE)
2356 status = sre_usearch(&state, PatternObject_GetCode(self));
2357#endif
2358 }
2359
Thomas Wouters89f507f2006-12-13 04:49:30 +00002360 if (PyErr_Occurred())
2361 goto error;
2362
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002363 if (status <= 0) {
2364 if (status == 0)
2365 break;
2366 pattern_error(status);
2367 goto error;
2368 }
Tim Peters3d563502006-01-21 02:47:53 +00002369
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002370 b = STATE_OFFSET(&state, state.start);
2371 e = STATE_OFFSET(&state, state.ptr);
2372
2373 if (i < b) {
2374 /* get segment before this match */
2375 item = PySequence_GetSlice(string, i, b);
2376 if (!item)
2377 goto error;
2378 status = PyList_Append(list, item);
2379 Py_DECREF(item);
2380 if (status < 0)
2381 goto error;
2382
2383 } else if (i == b && i == e && n > 0)
2384 /* ignore empty match on latest position */
2385 goto next;
2386
2387 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002388 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002389 match = pattern_new_match(self, &state, 1);
2390 if (!match)
2391 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002392 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002393 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002394 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002395 goto error;
2396 }
2397 item = PyObject_CallObject(filter, args);
2398 Py_DECREF(args);
2399 Py_DECREF(match);
2400 if (!item)
2401 goto error;
2402 } else {
2403 /* filter is literal string */
2404 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002405 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002406 }
2407
2408 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002409 if (item != Py_None) {
2410 status = PyList_Append(list, item);
2411 Py_DECREF(item);
2412 if (status < 0)
2413 goto error;
2414 }
Tim Peters3d563502006-01-21 02:47:53 +00002415
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002416 i = e;
2417 n = n + 1;
2418
2419next:
2420 /* move on */
2421 if (state.ptr == state.start)
2422 state.start = (void*) ((char*) state.ptr + state.charsize);
2423 else
2424 state.start = state.ptr;
2425
2426 }
2427
2428 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002429 if (i < state.endpos) {
2430 item = PySequence_GetSlice(string, i, state.endpos);
2431 if (!item)
2432 goto error;
2433 status = PyList_Append(list, item);
2434 Py_DECREF(item);
2435 if (status < 0)
2436 goto error;
2437 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002438
2439 state_fini(&state);
2440
Guido van Rossum4e173842001-12-07 04:25:10 +00002441 Py_DECREF(filter);
2442
Fredrik Lundhdac58492001-10-21 21:48:30 +00002443 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002444 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002445
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002446 if (!item)
2447 return NULL;
2448
2449 if (subn)
2450 return Py_BuildValue("Ni", item, n);
2451
2452 return item;
2453
2454error:
2455 Py_DECREF(list);
2456 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002457 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002458 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002459
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002460}
2461
2462static PyObject*
2463pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2464{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002465 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002467 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002468 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002469 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002470 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002471 return NULL;
2472
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002473 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002474}
2475
2476static PyObject*
2477pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2478{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002479 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002480 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002481 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002482 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002483 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002484 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002485 return NULL;
2486
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002487 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002488}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002489
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002490static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002491pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002492{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002493#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002494 PatternObject* copy;
2495 int offset;
2496
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002497 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2498 if (!copy)
2499 return NULL;
2500
2501 offset = offsetof(PatternObject, groups);
2502
2503 Py_XINCREF(self->groupindex);
2504 Py_XINCREF(self->indexgroup);
2505 Py_XINCREF(self->pattern);
2506
2507 memcpy((char*) copy + offset, (char*) self + offset,
2508 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002509 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002510
2511 return (PyObject*) copy;
2512#else
2513 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2514 return NULL;
2515#endif
2516}
2517
2518static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002519pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002520{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002521#ifdef USE_BUILTIN_COPY
2522 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002523
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002524 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002525 if (!copy)
2526 return NULL;
2527
2528 if (!deepcopy(&copy->groupindex, memo) ||
2529 !deepcopy(&copy->indexgroup, memo) ||
2530 !deepcopy(&copy->pattern, memo)) {
2531 Py_DECREF(copy);
2532 return NULL;
2533 }
2534
2535#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002536 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2537 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002538#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002539}
2540
Raymond Hettinger94478742004-09-24 04:31:19 +00002541PyDoc_STRVAR(pattern_match_doc,
2542"match(string[, pos[, endpos]]) --> match object or None.\n\
2543 Matches zero or more characters at the beginning of the string");
2544
2545PyDoc_STRVAR(pattern_search_doc,
2546"search(string[, pos[, endpos]]) --> match object or None.\n\
2547 Scan through string looking for a match, and return a corresponding\n\
2548 MatchObject instance. Return None if no position in the string matches.");
2549
2550PyDoc_STRVAR(pattern_split_doc,
2551"split(string[, maxsplit = 0]) --> list.\n\
2552 Split string by the occurrences of pattern.");
2553
2554PyDoc_STRVAR(pattern_findall_doc,
2555"findall(string[, pos[, endpos]]) --> list.\n\
2556 Return a list of all non-overlapping matches of pattern in string.");
2557
2558PyDoc_STRVAR(pattern_finditer_doc,
2559"finditer(string[, pos[, endpos]]) --> iterator.\n\
2560 Return an iterator over all non-overlapping matches for the \n\
2561 RE pattern in string. For each match, the iterator returns a\n\
2562 match object.");
2563
2564PyDoc_STRVAR(pattern_sub_doc,
2565"sub(repl, string[, count = 0]) --> newstring\n\
2566 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002567 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002568
2569PyDoc_STRVAR(pattern_subn_doc,
2570"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2571 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2572 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002573 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002574
2575PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2576
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002577static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002578 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002579 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002580 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002581 pattern_search_doc},
2582 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2583 pattern_sub_doc},
2584 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2585 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002586 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002587 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002588 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002589 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002590#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002591 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2592 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002593#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002594 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002595 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2596 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002597 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002598};
2599
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002600#define PAT_OFF(x) offsetof(PatternObject, x)
2601static PyMemberDef pattern_members[] = {
2602 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2603 {"flags", T_INT, PAT_OFF(flags), READONLY},
2604 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2605 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2606 {NULL} /* Sentinel */
2607};
Guido van Rossumb700df92000-03-31 14:59:30 +00002608
Neal Norwitz57c179c2006-03-22 07:18:02 +00002609static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002610 PyVarObject_HEAD_INIT(NULL, 0)
2611 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002612 sizeof(PatternObject), sizeof(SRE_CODE),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002613 (destructor)pattern_dealloc, /* tp_dealloc */
2614 0, /* tp_print */
2615 0, /* tp_getattr */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002616 0, /* tp_setattr */
2617 0, /* tp_compare */
2618 0, /* tp_repr */
2619 0, /* tp_as_number */
2620 0, /* tp_as_sequence */
2621 0, /* tp_as_mapping */
2622 0, /* tp_hash */
2623 0, /* tp_call */
2624 0, /* tp_str */
2625 0, /* tp_getattro */
2626 0, /* tp_setattro */
2627 0, /* tp_as_buffer */
Guido van Rossum3cf5b1e2006-07-27 21:53:35 +00002628 Py_TPFLAGS_DEFAULT, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002629 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002630 0, /* tp_traverse */
2631 0, /* tp_clear */
2632 0, /* tp_richcompare */
2633 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002634 0, /* tp_iter */
2635 0, /* tp_iternext */
2636 pattern_methods, /* tp_methods */
2637 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002638};
2639
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002640static int _validate(PatternObject *self); /* Forward */
2641
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002642static PyObject *
2643_compile(PyObject* self_, PyObject* args)
2644{
2645 /* "compile" pattern descriptor to pattern object */
2646
2647 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002648 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002649
2650 PyObject* pattern;
2651 int flags = 0;
2652 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002653 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002654 PyObject* groupindex = NULL;
2655 PyObject* indexgroup = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002656 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002657 &PyList_Type, &code, &groups,
2658 &groupindex, &indexgroup))
2659 return NULL;
2660
2661 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002662 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002663 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2664 if (!self)
2665 return NULL;
2666
2667 self->codesize = n;
2668
2669 for (i = 0; i < n; i++) {
2670 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002671 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002672 self->code[i] = (SRE_CODE) value;
2673 if ((unsigned long) self->code[i] != value) {
2674 PyErr_SetString(PyExc_OverflowError,
2675 "regular expression code size limit exceeded");
2676 break;
2677 }
2678 }
2679
2680 if (PyErr_Occurred()) {
2681 PyObject_DEL(self);
2682 return NULL;
2683 }
2684
2685 Py_INCREF(pattern);
2686 self->pattern = pattern;
2687
2688 self->flags = flags;
2689
2690 self->groups = groups;
2691
2692 Py_XINCREF(groupindex);
2693 self->groupindex = groupindex;
2694
2695 Py_XINCREF(indexgroup);
2696 self->indexgroup = indexgroup;
2697
2698 self->weakreflist = NULL;
2699
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002700 if (!_validate(self)) {
2701 Py_DECREF(self);
2702 return NULL;
2703 }
2704
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002705 return (PyObject*) self;
2706}
2707
Guido van Rossumb700df92000-03-31 14:59:30 +00002708/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002709/* Code validation */
2710
2711/* To learn more about this code, have a look at the _compile() function in
2712 Lib/sre_compile.py. The validation functions below checks the code array
2713 for conformance with the code patterns generated there.
2714
2715 The nice thing about the generated code is that it is position-independent:
2716 all jumps are relative jumps forward. Also, jumps don't cross each other:
2717 the target of a later jump is always earlier than the target of an earlier
2718 jump. IOW, this is okay:
2719
2720 J---------J-------T--------T
2721 \ \_____/ /
2722 \______________________/
2723
2724 but this is not:
2725
2726 J---------J-------T--------T
2727 \_________\_____/ /
2728 \____________/
2729
2730 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2731 bytes wide (the latter if Python is compiled for "wide" unicode support).
2732*/
2733
2734/* Defining this one enables tracing of the validator */
2735#undef VVERBOSE
2736
2737/* Trace macro for the validator */
2738#if defined(VVERBOSE)
2739#define VTRACE(v) printf v
2740#else
2741#define VTRACE(v)
2742#endif
2743
2744/* Report failure */
2745#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2746
2747/* Extract opcode, argument, or skip count from code array */
2748#define GET_OP \
2749 do { \
2750 VTRACE(("%p: ", code)); \
2751 if (code >= end) FAIL; \
2752 op = *code++; \
2753 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2754 } while (0)
2755#define GET_ARG \
2756 do { \
2757 VTRACE(("%p= ", code)); \
2758 if (code >= end) FAIL; \
2759 arg = *code++; \
2760 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2761 } while (0)
2762#define GET_SKIP \
2763 do { \
2764 VTRACE(("%p= ", code)); \
2765 if (code >= end) FAIL; \
2766 skip = *code; \
2767 VTRACE(("%lu (skip to %p)\n", \
2768 (unsigned long)skip, code+skip)); \
2769 if (code+skip < code || code+skip > end) \
2770 FAIL; \
2771 code++; \
2772 } while (0)
2773
2774static int
2775_validate_charset(SRE_CODE *code, SRE_CODE *end)
2776{
2777 /* Some variables are manipulated by the macros above */
2778 SRE_CODE op;
2779 SRE_CODE arg;
2780 SRE_CODE offset;
2781 int i;
2782
2783 while (code < end) {
2784 GET_OP;
2785 switch (op) {
2786
2787 case SRE_OP_NEGATE:
2788 break;
2789
2790 case SRE_OP_LITERAL:
2791 GET_ARG;
2792 break;
2793
2794 case SRE_OP_RANGE:
2795 GET_ARG;
2796 GET_ARG;
2797 break;
2798
2799 case SRE_OP_CHARSET:
2800 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2801 if (code+offset < code || code+offset > end)
2802 FAIL;
2803 code += offset;
2804 break;
2805
2806 case SRE_OP_BIGCHARSET:
2807 GET_ARG; /* Number of blocks */
2808 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2809 if (code+offset < code || code+offset > end)
2810 FAIL;
2811 /* Make sure that each byte points to a valid block */
2812 for (i = 0; i < 256; i++) {
2813 if (((unsigned char *)code)[i] >= arg)
2814 FAIL;
2815 }
2816 code += offset;
2817 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2818 if (code+offset < code || code+offset > end)
2819 FAIL;
2820 code += offset;
2821 break;
2822
2823 case SRE_OP_CATEGORY:
2824 GET_ARG;
2825 switch (arg) {
2826 case SRE_CATEGORY_DIGIT:
2827 case SRE_CATEGORY_NOT_DIGIT:
2828 case SRE_CATEGORY_SPACE:
2829 case SRE_CATEGORY_NOT_SPACE:
2830 case SRE_CATEGORY_WORD:
2831 case SRE_CATEGORY_NOT_WORD:
2832 case SRE_CATEGORY_LINEBREAK:
2833 case SRE_CATEGORY_NOT_LINEBREAK:
2834 case SRE_CATEGORY_LOC_WORD:
2835 case SRE_CATEGORY_LOC_NOT_WORD:
2836 case SRE_CATEGORY_UNI_DIGIT:
2837 case SRE_CATEGORY_UNI_NOT_DIGIT:
2838 case SRE_CATEGORY_UNI_SPACE:
2839 case SRE_CATEGORY_UNI_NOT_SPACE:
2840 case SRE_CATEGORY_UNI_WORD:
2841 case SRE_CATEGORY_UNI_NOT_WORD:
2842 case SRE_CATEGORY_UNI_LINEBREAK:
2843 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2844 break;
2845 default:
2846 FAIL;
2847 }
2848 break;
2849
2850 default:
2851 FAIL;
2852
2853 }
2854 }
2855
2856 return 1;
2857}
2858
2859static int
2860_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2861{
2862 /* Some variables are manipulated by the macros above */
2863 SRE_CODE op;
2864 SRE_CODE arg;
2865 SRE_CODE skip;
2866
2867 VTRACE(("code=%p, end=%p\n", code, end));
2868
2869 if (code > end)
2870 FAIL;
2871
2872 while (code < end) {
2873 GET_OP;
2874 switch (op) {
2875
2876 case SRE_OP_MARK:
2877 /* We don't check whether marks are properly nested; the
2878 sre_match() code is robust even if they don't, and the worst
2879 you can get is nonsensical match results. */
2880 GET_ARG;
2881 if (arg > 2*groups+1) {
2882 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2883 FAIL;
2884 }
2885 break;
2886
2887 case SRE_OP_LITERAL:
2888 case SRE_OP_NOT_LITERAL:
2889 case SRE_OP_LITERAL_IGNORE:
2890 case SRE_OP_NOT_LITERAL_IGNORE:
2891 GET_ARG;
2892 /* The arg is just a character, nothing to check */
2893 break;
2894
2895 case SRE_OP_SUCCESS:
2896 case SRE_OP_FAILURE:
2897 /* Nothing to check; these normally end the matching process */
2898 break;
2899
2900 case SRE_OP_AT:
2901 GET_ARG;
2902 switch (arg) {
2903 case SRE_AT_BEGINNING:
2904 case SRE_AT_BEGINNING_STRING:
2905 case SRE_AT_BEGINNING_LINE:
2906 case SRE_AT_END:
2907 case SRE_AT_END_LINE:
2908 case SRE_AT_END_STRING:
2909 case SRE_AT_BOUNDARY:
2910 case SRE_AT_NON_BOUNDARY:
2911 case SRE_AT_LOC_BOUNDARY:
2912 case SRE_AT_LOC_NON_BOUNDARY:
2913 case SRE_AT_UNI_BOUNDARY:
2914 case SRE_AT_UNI_NON_BOUNDARY:
2915 break;
2916 default:
2917 FAIL;
2918 }
2919 break;
2920
2921 case SRE_OP_ANY:
2922 case SRE_OP_ANY_ALL:
2923 /* These have no operands */
2924 break;
2925
2926 case SRE_OP_IN:
2927 case SRE_OP_IN_IGNORE:
2928 GET_SKIP;
2929 /* Stop 1 before the end; we check the FAILURE below */
2930 if (!_validate_charset(code, code+skip-2))
2931 FAIL;
2932 if (code[skip-2] != SRE_OP_FAILURE)
2933 FAIL;
2934 code += skip-1;
2935 break;
2936
2937 case SRE_OP_INFO:
2938 {
2939 /* A minimal info field is
2940 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2941 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2942 more follows. */
2943 SRE_CODE flags, min, max, i;
2944 SRE_CODE *newcode;
2945 GET_SKIP;
2946 newcode = code+skip-1;
2947 GET_ARG; flags = arg;
2948 GET_ARG; min = arg;
2949 GET_ARG; max = arg;
2950 /* Check that only valid flags are present */
2951 if ((flags & ~(SRE_INFO_PREFIX |
2952 SRE_INFO_LITERAL |
2953 SRE_INFO_CHARSET)) != 0)
2954 FAIL;
2955 /* PREFIX and CHARSET are mutually exclusive */
2956 if ((flags & SRE_INFO_PREFIX) &&
2957 (flags & SRE_INFO_CHARSET))
2958 FAIL;
2959 /* LITERAL implies PREFIX */
2960 if ((flags & SRE_INFO_LITERAL) &&
2961 !(flags & SRE_INFO_PREFIX))
2962 FAIL;
2963 /* Validate the prefix */
2964 if (flags & SRE_INFO_PREFIX) {
2965 SRE_CODE prefix_len, prefix_skip;
2966 GET_ARG; prefix_len = arg;
2967 GET_ARG; prefix_skip = arg;
2968 /* Here comes the prefix string */
2969 if (code+prefix_len < code || code+prefix_len > newcode)
2970 FAIL;
2971 code += prefix_len;
2972 /* And here comes the overlap table */
2973 if (code+prefix_len < code || code+prefix_len > newcode)
2974 FAIL;
2975 /* Each overlap value should be < prefix_len */
2976 for (i = 0; i < prefix_len; i++) {
2977 if (code[i] >= prefix_len)
2978 FAIL;
2979 }
2980 code += prefix_len;
2981 }
2982 /* Validate the charset */
2983 if (flags & SRE_INFO_CHARSET) {
2984 if (!_validate_charset(code, newcode-1))
2985 FAIL;
2986 if (newcode[-1] != SRE_OP_FAILURE)
2987 FAIL;
2988 code = newcode;
2989 }
2990 else if (code != newcode) {
2991 VTRACE(("code=%p, newcode=%p\n", code, newcode));
2992 FAIL;
2993 }
2994 }
2995 break;
2996
2997 case SRE_OP_BRANCH:
2998 {
2999 SRE_CODE *target = NULL;
3000 for (;;) {
3001 GET_SKIP;
3002 if (skip == 0)
3003 break;
3004 /* Stop 2 before the end; we check the JUMP below */
3005 if (!_validate_inner(code, code+skip-3, groups))
3006 FAIL;
3007 code += skip-3;
3008 /* Check that it ends with a JUMP, and that each JUMP
3009 has the same target */
3010 GET_OP;
3011 if (op != SRE_OP_JUMP)
3012 FAIL;
3013 GET_SKIP;
3014 if (target == NULL)
3015 target = code+skip-1;
3016 else if (code+skip-1 != target)
3017 FAIL;
3018 }
3019 }
3020 break;
3021
3022 case SRE_OP_REPEAT_ONE:
3023 case SRE_OP_MIN_REPEAT_ONE:
3024 {
3025 SRE_CODE min, max;
3026 GET_SKIP;
3027 GET_ARG; min = arg;
3028 GET_ARG; max = arg;
3029 if (min > max)
3030 FAIL;
3031#ifdef Py_UNICODE_WIDE
3032 if (max > 65535)
3033 FAIL;
3034#endif
3035 if (!_validate_inner(code, code+skip-4, groups))
3036 FAIL;
3037 code += skip-4;
3038 GET_OP;
3039 if (op != SRE_OP_SUCCESS)
3040 FAIL;
3041 }
3042 break;
3043
3044 case SRE_OP_REPEAT:
3045 {
3046 SRE_CODE min, max;
3047 GET_SKIP;
3048 GET_ARG; min = arg;
3049 GET_ARG; max = arg;
3050 if (min > max)
3051 FAIL;
3052#ifdef Py_UNICODE_WIDE
3053 if (max > 65535)
3054 FAIL;
3055#endif
3056 if (!_validate_inner(code, code+skip-3, groups))
3057 FAIL;
3058 code += skip-3;
3059 GET_OP;
3060 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3061 FAIL;
3062 }
3063 break;
3064
3065 case SRE_OP_GROUPREF:
3066 case SRE_OP_GROUPREF_IGNORE:
3067 GET_ARG;
3068 if (arg >= groups)
3069 FAIL;
3070 break;
3071
3072 case SRE_OP_GROUPREF_EXISTS:
3073 /* The regex syntax for this is: '(?(group)then|else)', where
3074 'group' is either an integer group number or a group name,
3075 'then' and 'else' are sub-regexes, and 'else' is optional. */
3076 GET_ARG;
3077 if (arg >= groups)
3078 FAIL;
3079 GET_SKIP;
3080 code--; /* The skip is relative to the first arg! */
3081 /* There are two possibilities here: if there is both a 'then'
3082 part and an 'else' part, the generated code looks like:
3083
3084 GROUPREF_EXISTS
3085 <group>
3086 <skipyes>
3087 ...then part...
3088 JUMP
3089 <skipno>
3090 (<skipyes> jumps here)
3091 ...else part...
3092 (<skipno> jumps here)
3093
3094 If there is only a 'then' part, it looks like:
3095
3096 GROUPREF_EXISTS
3097 <group>
3098 <skip>
3099 ...then part...
3100 (<skip> jumps here)
3101
3102 There is no direct way to decide which it is, and we don't want
3103 to allow arbitrary jumps anywhere in the code; so we just look
3104 for a JUMP opcode preceding our skip target.
3105 */
3106 if (skip >= 3 && code+skip-3 >= code &&
3107 code[skip-3] == SRE_OP_JUMP)
3108 {
3109 VTRACE(("both then and else parts present\n"));
3110 if (!_validate_inner(code+1, code+skip-3, groups))
3111 FAIL;
3112 code += skip-2; /* Position after JUMP, at <skipno> */
3113 GET_SKIP;
3114 if (!_validate_inner(code, code+skip-1, groups))
3115 FAIL;
3116 code += skip-1;
3117 }
3118 else {
3119 VTRACE(("only a then part present\n"));
3120 if (!_validate_inner(code+1, code+skip-1, groups))
3121 FAIL;
3122 code += skip-1;
3123 }
3124 break;
3125
3126 case SRE_OP_ASSERT:
3127 case SRE_OP_ASSERT_NOT:
3128 GET_SKIP;
3129 GET_ARG; /* 0 for lookahead, width for lookbehind */
3130 code--; /* Back up over arg to simplify math below */
3131 if (arg & 0x80000000)
3132 FAIL; /* Width too large */
3133 /* Stop 1 before the end; we check the SUCCESS below */
3134 if (!_validate_inner(code+1, code+skip-2, groups))
3135 FAIL;
3136 code += skip-2;
3137 GET_OP;
3138 if (op != SRE_OP_SUCCESS)
3139 FAIL;
3140 break;
3141
3142 default:
3143 FAIL;
3144
3145 }
3146 }
3147
3148 VTRACE(("okay\n"));
3149 return 1;
3150}
3151
3152static int
3153_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3154{
3155 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3156 FAIL;
3157 if (groups == 0) /* fix for simplejson */
3158 groups = 100; /* 100 groups should always be safe */
3159 return _validate_inner(code, end-1, groups);
3160}
3161
3162static int
3163_validate(PatternObject *self)
3164{
3165 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3166 {
3167 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3168 return 0;
3169 }
3170 else
3171 VTRACE(("Success!\n"));
3172 return 1;
3173}
3174
3175/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003176/* match methods */
3177
3178static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003179match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003180{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003181 Py_XDECREF(self->regs);
3182 Py_XDECREF(self->string);
3183 Py_DECREF(self->pattern);
3184 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003185}
3186
3187static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003188match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003189{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003190 if (index < 0 || index >= self->groups) {
3191 /* raise IndexError if we were given a bad group number */
3192 PyErr_SetString(
3193 PyExc_IndexError,
3194 "no such group"
3195 );
3196 return NULL;
3197 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003198
Fredrik Lundh6f013982000-07-03 18:44:21 +00003199 index *= 2;
3200
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003201 if (self->string == Py_None || self->mark[index] < 0) {
3202 /* return default value if the string or group is undefined */
3203 Py_INCREF(def);
3204 return def;
3205 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003206
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003207 return PySequence_GetSlice(
3208 self->string, self->mark[index], self->mark[index+1]
3209 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003210}
3211
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003212static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003213match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003214{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003215 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003216
Guido van Rossumddefaf32007-01-14 03:31:43 +00003217 if (index == NULL)
3218 /* Default value */
3219 return 0;
3220
Christian Heimes217cfd12007-12-02 14:31:20 +00003221 if (PyLong_Check(index))
3222 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003223
Fredrik Lundh6f013982000-07-03 18:44:21 +00003224 i = -1;
3225
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003226 if (self->pattern->groupindex) {
3227 index = PyObject_GetItem(self->pattern->groupindex, index);
3228 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003229 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003230 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003231 Py_DECREF(index);
3232 } else
3233 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003234 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003235
3236 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003237}
3238
3239static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003240match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003241{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003242 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003243}
3244
3245static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003246match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003247{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003248 /* delegate to Python code */
3249 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003250 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003251 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003252 );
3253}
3254
3255static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003256match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003257{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003258 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003259 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003260
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003261 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003262
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003263 switch (size) {
3264 case 0:
3265 result = match_getslice(self, Py_False, Py_None);
3266 break;
3267 case 1:
3268 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3269 break;
3270 default:
3271 /* fetch multiple items */
3272 result = PyTuple_New(size);
3273 if (!result)
3274 return NULL;
3275 for (i = 0; i < size; i++) {
3276 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003277 self, PyTuple_GET_ITEM(args, i), Py_None
3278 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003279 if (!item) {
3280 Py_DECREF(result);
3281 return NULL;
3282 }
3283 PyTuple_SET_ITEM(result, i, item);
3284 }
3285 break;
3286 }
3287 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003288}
3289
3290static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003291match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003292{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003293 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003294 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003295
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003296 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003297 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003298 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003299 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003300
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003301 result = PyTuple_New(self->groups-1);
3302 if (!result)
3303 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003304
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003305 for (index = 1; index < self->groups; index++) {
3306 PyObject* item;
3307 item = match_getslice_by_index(self, index, def);
3308 if (!item) {
3309 Py_DECREF(result);
3310 return NULL;
3311 }
3312 PyTuple_SET_ITEM(result, index-1, item);
3313 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003315 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003316}
3317
3318static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003319match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003320{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003321 PyObject* result;
3322 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003323 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003324
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003325 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003326 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003327 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003328 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003329
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003330 result = PyDict_New();
3331 if (!result || !self->pattern->groupindex)
3332 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003333
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003334 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003335 if (!keys)
3336 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003338 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003339 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003340 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003341 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003342 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003343 if (!key)
3344 goto failed;
3345 value = match_getslice(self, key, def);
3346 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003347 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003348 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003349 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003350 status = PyDict_SetItem(result, key, value);
3351 Py_DECREF(value);
3352 if (status < 0)
3353 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003354 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003355
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003356 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003357
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003358 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003359
3360failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003361 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003362 Py_DECREF(result);
3363 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003364}
3365
3366static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003367match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003368{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003369 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003370
Guido van Rossumddefaf32007-01-14 03:31:43 +00003371 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003372 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003373 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003374
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003375 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003376
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003377 if (index < 0 || index >= self->groups) {
3378 PyErr_SetString(
3379 PyExc_IndexError,
3380 "no such group"
3381 );
3382 return NULL;
3383 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003384
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003385 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003386 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003387}
3388
3389static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003390match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003391{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003392 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003393
Guido van Rossumddefaf32007-01-14 03:31:43 +00003394 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003395 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003396 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003397
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003398 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003399
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003400 if (index < 0 || index >= self->groups) {
3401 PyErr_SetString(
3402 PyExc_IndexError,
3403 "no such group"
3404 );
3405 return NULL;
3406 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003407
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003408 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003409 return Py_BuildValue("i", self->mark[index*2+1]);
3410}
3411
3412LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003413_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003414{
3415 PyObject* pair;
3416 PyObject* item;
3417
3418 pair = PyTuple_New(2);
3419 if (!pair)
3420 return NULL;
3421
Christian Heimes217cfd12007-12-02 14:31:20 +00003422 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003423 if (!item)
3424 goto error;
3425 PyTuple_SET_ITEM(pair, 0, item);
3426
Christian Heimes217cfd12007-12-02 14:31:20 +00003427 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003428 if (!item)
3429 goto error;
3430 PyTuple_SET_ITEM(pair, 1, item);
3431
3432 return pair;
3433
3434 error:
3435 Py_DECREF(pair);
3436 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003437}
3438
3439static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003440match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003441{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003442 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003443
Guido van Rossumddefaf32007-01-14 03:31:43 +00003444 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003445 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003446 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003447
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003448 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003449
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003450 if (index < 0 || index >= self->groups) {
3451 PyErr_SetString(
3452 PyExc_IndexError,
3453 "no such group"
3454 );
3455 return NULL;
3456 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003457
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003458 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003459 return _pair(self->mark[index*2], self->mark[index*2+1]);
3460}
3461
3462static PyObject*
3463match_regs(MatchObject* self)
3464{
3465 PyObject* regs;
3466 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003467 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003468
3469 regs = PyTuple_New(self->groups);
3470 if (!regs)
3471 return NULL;
3472
3473 for (index = 0; index < self->groups; index++) {
3474 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3475 if (!item) {
3476 Py_DECREF(regs);
3477 return NULL;
3478 }
3479 PyTuple_SET_ITEM(regs, index, item);
3480 }
3481
3482 Py_INCREF(regs);
3483 self->regs = regs;
3484
3485 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003486}
3487
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003488static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003489match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003490{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003491#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003492 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003493 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003494
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003495 slots = 2 * (self->pattern->groups+1);
3496
3497 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3498 if (!copy)
3499 return NULL;
3500
3501 /* this value a constant, but any compiler should be able to
3502 figure that out all by itself */
3503 offset = offsetof(MatchObject, string);
3504
3505 Py_XINCREF(self->pattern);
3506 Py_XINCREF(self->string);
3507 Py_XINCREF(self->regs);
3508
3509 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003510 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003511
3512 return (PyObject*) copy;
3513#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003514 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003515 return NULL;
3516#endif
3517}
3518
3519static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003520match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003521{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003522#ifdef USE_BUILTIN_COPY
3523 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003524
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003525 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003526 if (!copy)
3527 return NULL;
3528
3529 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3530 !deepcopy(&copy->string, memo) ||
3531 !deepcopy(&copy->regs, memo)) {
3532 Py_DECREF(copy);
3533 return NULL;
3534 }
3535
3536#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003537 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3538 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003539#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003540}
3541
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003542static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003543 {"group", (PyCFunction) match_group, METH_VARARGS},
3544 {"start", (PyCFunction) match_start, METH_VARARGS},
3545 {"end", (PyCFunction) match_end, METH_VARARGS},
3546 {"span", (PyCFunction) match_span, METH_VARARGS},
3547 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3548 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003549 {"expand", (PyCFunction) match_expand, METH_O},
3550 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3551 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003552 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003553};
3554
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003555static PyObject *
3556match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003557{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003558 if (self->lastindex >= 0)
3559 return Py_BuildValue("i", self->lastindex);
3560 Py_INCREF(Py_None);
3561 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003562}
3563
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003564static PyObject *
3565match_lastgroup_get(MatchObject *self)
3566{
3567 if (self->pattern->indexgroup && self->lastindex >= 0) {
3568 PyObject* result = PySequence_GetItem(
3569 self->pattern->indexgroup, self->lastindex
3570 );
3571 if (result)
3572 return result;
3573 PyErr_Clear();
3574 }
3575 Py_INCREF(Py_None);
3576 return Py_None;
3577}
3578
3579static PyObject *
3580match_regs_get(MatchObject *self)
3581{
3582 if (self->regs) {
3583 Py_INCREF(self->regs);
3584 return self->regs;
3585 } else
3586 return match_regs(self);
3587}
3588
3589static PyGetSetDef match_getset[] = {
3590 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3591 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3592 {"regs", (getter)match_regs_get, (setter)NULL},
3593 {NULL}
3594};
3595
3596#define MATCH_OFF(x) offsetof(MatchObject, x)
3597static PyMemberDef match_members[] = {
3598 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3599 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3600 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3601 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3602 {NULL}
3603};
3604
Guido van Rossumb700df92000-03-31 14:59:30 +00003605/* FIXME: implement setattr("string", None) as a special case (to
3606 detach the associated string, if any */
3607
Neal Norwitz57c179c2006-03-22 07:18:02 +00003608static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003609 PyVarObject_HEAD_INIT(NULL,0)
3610 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003611 sizeof(MatchObject), sizeof(Py_ssize_t),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003612 (destructor)match_dealloc, /* tp_dealloc */
3613 0, /* tp_print */
3614 0, /* tp_getattr */
3615 0, /* tp_setattr */
3616 0, /* tp_compare */
3617 0, /* tp_repr */
3618 0, /* tp_as_number */
3619 0, /* tp_as_sequence */
3620 0, /* tp_as_mapping */
3621 0, /* tp_hash */
3622 0, /* tp_call */
3623 0, /* tp_str */
3624 0, /* tp_getattro */
3625 0, /* tp_setattro */
3626 0, /* tp_as_buffer */
3627 Py_TPFLAGS_DEFAULT, /* tp_flags */
3628 0, /* tp_doc */
3629 0, /* tp_traverse */
3630 0, /* tp_clear */
3631 0, /* tp_richcompare */
3632 0, /* tp_weaklistoffset */
3633 0, /* tp_iter */
3634 0, /* tp_iternext */
3635 match_methods, /* tp_methods */
3636 match_members, /* tp_members */
3637 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003638};
3639
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003640static PyObject*
3641pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3642{
3643 /* create match object (from state object) */
3644
3645 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003646 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003647 char* base;
3648 int n;
3649
3650 if (status > 0) {
3651
3652 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003653 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003654 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3655 2*(pattern->groups+1));
3656 if (!match)
3657 return NULL;
3658
3659 Py_INCREF(pattern);
3660 match->pattern = pattern;
3661
3662 Py_INCREF(state->string);
3663 match->string = state->string;
3664
3665 match->regs = NULL;
3666 match->groups = pattern->groups+1;
3667
3668 /* fill in group slices */
3669
3670 base = (char*) state->beginning;
3671 n = state->charsize;
3672
3673 match->mark[0] = ((char*) state->start - base) / n;
3674 match->mark[1] = ((char*) state->ptr - base) / n;
3675
3676 for (i = j = 0; i < pattern->groups; i++, j+=2)
3677 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3678 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3679 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3680 } else
3681 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3682
3683 match->pos = state->pos;
3684 match->endpos = state->endpos;
3685
3686 match->lastindex = state->lastindex;
3687
3688 return (PyObject*) match;
3689
3690 } else if (status == 0) {
3691
3692 /* no match */
3693 Py_INCREF(Py_None);
3694 return Py_None;
3695
3696 }
3697
3698 /* internal error */
3699 pattern_error(status);
3700 return NULL;
3701}
3702
3703
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003704/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003705/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003706
3707static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003708scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003709{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003710 state_fini(&self->state);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003711 Py_DECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003712 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003713}
3714
3715static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003716scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003717{
3718 SRE_STATE* state = &self->state;
3719 PyObject* match;
3720 int status;
3721
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003722 state_reset(state);
3723
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003724 state->ptr = state->start;
3725
3726 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003727 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003728 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003729#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003730 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003731#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003732 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003733 if (PyErr_Occurred())
3734 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003735
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003736 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003737 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003738
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003739 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003740 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003741 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003742 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003743
3744 return match;
3745}
3746
3747
3748static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003749scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003750{
3751 SRE_STATE* state = &self->state;
3752 PyObject* match;
3753 int status;
3754
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003755 state_reset(state);
3756
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003757 state->ptr = state->start;
3758
3759 if (state->charsize == 1) {
3760 status = sre_search(state, PatternObject_GetCode(self->pattern));
3761 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003762#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003763 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003764#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003765 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003766 if (PyErr_Occurred())
3767 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003768
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003769 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003770 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003771
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003772 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003773 state->start = (void*) ((char*) state->ptr + state->charsize);
3774 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003775 state->start = state->ptr;
3776
3777 return match;
3778}
3779
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003780static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003781 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3782 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003783 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003784};
3785
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003786#define SCAN_OFF(x) offsetof(ScannerObject, x)
3787static PyMemberDef scanner_members[] = {
3788 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3789 {NULL} /* Sentinel */
3790};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003791
Neal Norwitz57c179c2006-03-22 07:18:02 +00003792static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003793 PyVarObject_HEAD_INIT(NULL, 0)
3794 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003795 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003796 (destructor)scanner_dealloc,/* tp_dealloc */
3797 0, /* tp_print */
3798 0, /* tp_getattr */
3799 0, /* tp_setattr */
3800 0, /* tp_compare */
3801 0, /* tp_repr */
3802 0, /* tp_as_number */
3803 0, /* tp_as_sequence */
3804 0, /* tp_as_mapping */
3805 0, /* tp_hash */
3806 0, /* tp_call */
3807 0, /* tp_str */
3808 0, /* tp_getattro */
3809 0, /* tp_setattro */
3810 0, /* tp_as_buffer */
3811 Py_TPFLAGS_DEFAULT, /* tp_flags */
3812 0, /* tp_doc */
3813 0, /* tp_traverse */
3814 0, /* tp_clear */
3815 0, /* tp_richcompare */
3816 0, /* tp_weaklistoffset */
3817 0, /* tp_iter */
3818 0, /* tp_iternext */
3819 scanner_methods, /* tp_methods */
3820 scanner_members, /* tp_members */
3821 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003822};
3823
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003824static PyObject*
3825pattern_scanner(PatternObject* pattern, PyObject* args)
3826{
3827 /* create search state object */
3828
3829 ScannerObject* self;
3830
3831 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003832 Py_ssize_t start = 0;
3833 Py_ssize_t end = PY_SSIZE_T_MAX;
3834 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003835 return NULL;
3836
3837 /* create scanner object */
3838 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3839 if (!self)
3840 return NULL;
3841
3842 string = state_init(&self->state, pattern, string, start, end);
3843 if (!string) {
3844 PyObject_DEL(self);
3845 return NULL;
3846 }
3847
3848 Py_INCREF(pattern);
3849 self->pattern = (PyObject*) pattern;
3850
3851 return (PyObject*) self;
3852}
3853
Guido van Rossumb700df92000-03-31 14:59:30 +00003854static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003855 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003856 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003857 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003858 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003859};
3860
Martin v. Löwis1a214512008-06-11 05:26:20 +00003861static struct PyModuleDef sremodule = {
3862 PyModuleDef_HEAD_INIT,
3863 "_" SRE_MODULE,
3864 NULL,
3865 -1,
3866 _functions,
3867 NULL,
3868 NULL,
3869 NULL,
3870 NULL
3871};
3872
3873PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003874{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003875 PyObject* m;
3876 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003877 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003878
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003879 /* Initialize object types */
3880 if (PyType_Ready(&Pattern_Type) < 0)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003881 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003882 if (PyType_Ready(&Match_Type) < 0)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003883 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00003884 if (PyType_Ready(&Scanner_Type) < 0)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003885 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003886
Martin v. Löwis1a214512008-06-11 05:26:20 +00003887 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003888 if (m == NULL)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003889 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003890 d = PyModule_GetDict(m);
3891
Christian Heimes217cfd12007-12-02 14:31:20 +00003892 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003893 if (x) {
3894 PyDict_SetItemString(d, "MAGIC", x);
3895 Py_DECREF(x);
3896 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003897
Christian Heimes217cfd12007-12-02 14:31:20 +00003898 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003899 if (x) {
3900 PyDict_SetItemString(d, "CODESIZE", x);
3901 Py_DECREF(x);
3902 }
3903
Neal Norwitzfe537132007-08-26 03:55:15 +00003904 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003905 if (x) {
3906 PyDict_SetItemString(d, "copyright", x);
3907 Py_DECREF(x);
3908 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003909 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003910}
3911
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003912#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003913
3914/* vim:ts=4:sw=4:et
3915*/