blob: 9315f102c77fc1d72ed447a92301104e71c2dac7 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Neal Norwitza6d80fa2006-06-12 03:05:40 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d52000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000055
Neal Norwitz94a9c092006-03-16 06:30:02 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh971e78b2001-10-20 17:48:46 +000061#if PY_VERSION_HEX >= 0x01060000
62#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000063/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000064#define HAVE_UNICODE
65#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000066#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000068/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000069/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070
71/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000072#define USE_FAST_SEARCH
73
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000074/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000075#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000076
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000077/* enables copy/deepcopy handling (work in progress) */
78#undef USE_BUILTIN_COPY
79
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000080#if PY_VERSION_HEX < 0x01060000
81#define PyObject_DEL(op) PyMem_DEL((op))
82#endif
83
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000084/* -------------------------------------------------------------------- */
85
Fredrik Lundh80946112000-06-29 18:03:25 +000086#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000087#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000088#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000089/* fastest possible local call under MSVC */
90#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000091#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000092#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000093#else
94#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000095#endif
96
97/* error codes */
98#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000099#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000100#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#define SRE_ERROR_MEMORY -9 /* out of memory */
Facundo Batista4473d222008-01-08 21:10:12 +0000102#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +0000103
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000104#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000105#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000106#else
107#define TRACE(v)
108#endif
109
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000110/* -------------------------------------------------------------------- */
111/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000112
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000113/* default character predicates (run sre_chars.py to regenerate tables) */
114
115#define SRE_DIGIT_MASK 1
116#define SRE_SPACE_MASK 2
117#define SRE_LINEBREAK_MASK 4
118#define SRE_ALNUM_MASK 8
119#define SRE_WORD_MASK 16
120
Fredrik Lundh21009b92001-09-18 18:47:09 +0000121/* FIXME: this assumes ASCII. create tables in init_sre() instead */
122
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000123static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1242, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
12625, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12724, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1280, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12924, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130
Fredrik Lundhb389df32000-06-29 12:48:37 +0000131static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d52000-06-29 08:58:44 +000013210, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
13327, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
13444, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
13561, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139120, 121, 122, 123, 124, 125, 126, 127 };
140
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000141#define SRE_IS_DIGIT(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143#define SRE_IS_SPACE(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145#define SRE_IS_LINEBREAK(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147#define SRE_IS_ALNUM(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149#define SRE_IS_WORD(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000151
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000152static unsigned int sre_lower(unsigned int ch)
153{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000154 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000155}
156
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000157/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000158/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159 * warnings when c's type supports only numbers < N+1 */
160#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000162#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000163#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000164#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000166static unsigned int sre_lower_locale(unsigned int ch)
167{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000168 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000169}
170
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000171/* unicode-specific character predicates */
172
173#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000174
Mark Dickinsonfe67bd92009-07-28 20:35:03 +0000175#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000176#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000178#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000179#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000180
181static unsigned int sre_lower_unicode(unsigned int ch)
182{
183 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
184}
185
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000186#endif
187
Guido van Rossumb700df92000-03-31 14:59:30 +0000188LOCAL(int)
189sre_category(SRE_CODE category, unsigned int ch)
190{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000191 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000192
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000193 case SRE_CATEGORY_DIGIT:
194 return SRE_IS_DIGIT(ch);
195 case SRE_CATEGORY_NOT_DIGIT:
196 return !SRE_IS_DIGIT(ch);
197 case SRE_CATEGORY_SPACE:
198 return SRE_IS_SPACE(ch);
199 case SRE_CATEGORY_NOT_SPACE:
200 return !SRE_IS_SPACE(ch);
201 case SRE_CATEGORY_WORD:
202 return SRE_IS_WORD(ch);
203 case SRE_CATEGORY_NOT_WORD:
204 return !SRE_IS_WORD(ch);
205 case SRE_CATEGORY_LINEBREAK:
206 return SRE_IS_LINEBREAK(ch);
207 case SRE_CATEGORY_NOT_LINEBREAK:
208 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000209
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000210 case SRE_CATEGORY_LOC_WORD:
211 return SRE_LOC_IS_WORD(ch);
212 case SRE_CATEGORY_LOC_NOT_WORD:
213 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000214
215#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000216 case SRE_CATEGORY_UNI_DIGIT:
217 return SRE_UNI_IS_DIGIT(ch);
218 case SRE_CATEGORY_UNI_NOT_DIGIT:
219 return !SRE_UNI_IS_DIGIT(ch);
220 case SRE_CATEGORY_UNI_SPACE:
221 return SRE_UNI_IS_SPACE(ch);
222 case SRE_CATEGORY_UNI_NOT_SPACE:
223 return !SRE_UNI_IS_SPACE(ch);
224 case SRE_CATEGORY_UNI_WORD:
225 return SRE_UNI_IS_WORD(ch);
226 case SRE_CATEGORY_UNI_NOT_WORD:
227 return !SRE_UNI_IS_WORD(ch);
228 case SRE_CATEGORY_UNI_LINEBREAK:
229 return SRE_UNI_IS_LINEBREAK(ch);
230 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000232#else
233 case SRE_CATEGORY_UNI_DIGIT:
234 return SRE_IS_DIGIT(ch);
235 case SRE_CATEGORY_UNI_NOT_DIGIT:
236 return !SRE_IS_DIGIT(ch);
237 case SRE_CATEGORY_UNI_SPACE:
238 return SRE_IS_SPACE(ch);
239 case SRE_CATEGORY_UNI_NOT_SPACE:
240 return !SRE_IS_SPACE(ch);
241 case SRE_CATEGORY_UNI_WORD:
242 return SRE_LOC_IS_WORD(ch);
243 case SRE_CATEGORY_UNI_NOT_WORD:
244 return !SRE_LOC_IS_WORD(ch);
245 case SRE_CATEGORY_UNI_LINEBREAK:
246 return SRE_IS_LINEBREAK(ch);
247 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000249#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000250 }
251 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000252}
253
254/* helpers */
255
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000256static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000257data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000258{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000259 if (state->data_stack) {
Jack Diederich2d400772006-05-27 15:44:34 +0000260 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000261 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000262 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000263 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000264}
265
266static int
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000267data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000268{
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000269 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000270 minsize = state->data_stack_base+size;
271 cursize = state->data_stack_size;
272 if (cursize < minsize) {
273 void* stack;
274 cursize = minsize+minsize/4+1024;
275 TRACE(("allocate/grow stack %d\n", cursize));
Jack Diederich2d400772006-05-27 15:44:34 +0000276 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000277 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000278 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000279 return SRE_ERROR_MEMORY;
280 }
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000281 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000282 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000283 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000284 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000285}
286
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000287/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000288
289#define SRE_CHAR unsigned char
290#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000291#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000292#define SRE_CHARSET sre_charset
293#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000294#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000295#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000297#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000298
299#if defined(HAVE_UNICODE)
300
Guido van Rossumb700df92000-03-31 14:59:30 +0000301#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000302#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000303#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000304
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000305#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000306#undef SRE_SEARCH
307#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000308#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000309#undef SRE_INFO
310#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000311#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000312#undef SRE_AT
313#undef SRE_CHAR
314
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000315/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000316
317#define SRE_CHAR Py_UNICODE
318#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000319#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000320#define SRE_CHARSET sre_ucharset
321#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000322#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000323#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000324#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000325#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000326#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000327
328#endif /* SRE_RECURSIVE */
329
330/* -------------------------------------------------------------------- */
331/* String matching engine */
332
333/* the following section is compiled twice, with different character
334 settings */
335
336LOCAL(int)
337SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000339 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000340
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000341 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000342
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000343 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000345 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000346 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000347 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000349 case SRE_AT_BEGINNING_LINE:
350 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000351 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000353 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000354 return (((void*) (ptr+1) == state->end &&
355 SRE_IS_LINEBREAK((int) ptr[0])) ||
356 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000357
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000358 case SRE_AT_END_LINE:
359 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000360 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000361
Fredrik Lundh770617b2001-01-14 15:06:11 +0000362 case SRE_AT_END_STRING:
363 return ((void*) ptr == state->end);
364
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000365 case SRE_AT_BOUNDARY:
366 if (state->beginning == state->end)
367 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000368 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000369 SRE_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000370 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000371 SRE_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000372 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000373
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000374 case SRE_AT_NON_BOUNDARY:
375 if (state->beginning == state->end)
376 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000377 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000378 SRE_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000379 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000380 SRE_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000381 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000382
383 case SRE_AT_LOC_BOUNDARY:
384 if (state->beginning == state->end)
385 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000386 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000387 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000388 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000389 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000390 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000391
392 case SRE_AT_LOC_NON_BOUNDARY:
393 if (state->beginning == state->end)
394 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000395 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000396 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000397 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000398 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000399 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000400
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000401#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000402 case SRE_AT_UNI_BOUNDARY:
403 if (state->beginning == state->end)
404 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000405 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000406 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000407 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000408 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000409 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000410
411 case SRE_AT_UNI_NON_BOUNDARY:
412 if (state->beginning == state->end)
413 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000414 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000415 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000416 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000417 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000418 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000419#endif
420
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000421 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000422
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000423 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000424}
425
426LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000427SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000428{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000429 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000430
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000431 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000432
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000433 for (;;) {
434 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000436 case SRE_OP_FAILURE:
437 return !ok;
438
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000439 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000440 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000441 if (ch == set[0])
442 return ok;
443 set++;
444 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000445
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000446 case SRE_OP_CATEGORY:
447 /* <CATEGORY> <code> */
448 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000449 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000450 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000451 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000452
Fredrik Lundh3562f112000-07-02 12:00:07 +0000453 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000454 if (sizeof(SRE_CODE) == 2) {
455 /* <CHARSET> <bitmap> (16 bits per code word) */
456 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457 return ok;
458 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000459 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000460 else {
461 /* <CHARSET> <bitmap> (32 bits per code word) */
462 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
463 return ok;
464 set += 8;
465 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000466 break;
467
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000468 case SRE_OP_RANGE:
469 /* <RANGE> <lower> <upper> */
470 if (set[0] <= ch && ch <= set[1])
471 return ok;
472 set += 2;
473 break;
474
475 case SRE_OP_NEGATE:
476 ok = !ok;
477 break;
478
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000479 case SRE_OP_BIGCHARSET:
480 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000482 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000483 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000484
485 if (sizeof(SRE_CODE) == 2) {
486 block = ((unsigned char*)set)[ch >> 8];
487 set += 128;
488 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489 return ok;
490 set += count*16;
491 }
492 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000493 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494 * warnings when c's type supports only numbers < N+1 */
495 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000496 block = ((unsigned char*)set)[ch >> 8];
497 else
498 block = -1;
499 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000500 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000501 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
502 return ok;
503 set += count*8;
504 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000505 break;
506 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000507
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000508 default:
509 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000510 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000511 return 0;
512 }
513 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000514}
515
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000516LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000517
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000518LOCAL(Py_ssize_t)
519SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000520{
521 SRE_CODE chr;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000522 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000524 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000525
526 /* adjust end */
527 if (maxcount < end - ptr && maxcount != 65535)
528 end = ptr + maxcount;
529
530 switch (pattern[0]) {
531
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000532 case SRE_OP_IN:
533 /* repeated set */
534 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536 ptr++;
537 break;
538
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000539 case SRE_OP_ANY:
540 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000541 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000542 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543 ptr++;
544 break;
545
546 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000547 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000548 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000550 ptr = end;
551 break;
552
553 case SRE_OP_LITERAL:
554 /* repeated literal */
555 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000556 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000557 while (ptr < end && (SRE_CODE) *ptr == chr)
558 ptr++;
559 break;
560
561 case SRE_OP_LITERAL_IGNORE:
562 /* repeated literal */
563 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000564 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000565 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
566 ptr++;
567 break;
568
569 case SRE_OP_NOT_LITERAL:
570 /* repeated non-literal */
571 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000572 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000573 while (ptr < end && (SRE_CODE) *ptr != chr)
574 ptr++;
575 break;
Tim Peters3d563502006-01-21 02:47:53 +0000576
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000577 case SRE_OP_NOT_LITERAL_IGNORE:
578 /* repeated non-literal */
579 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000580 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582 ptr++;
583 break;
584
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 default:
586 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000587 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000588 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000589 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000590 if (i < 0)
591 return i;
592 if (!i)
593 break;
594 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000595 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
596 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000597 return (SRE_CHAR*) state->ptr - ptr;
598 }
599
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000600 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000601 return ptr - (SRE_CHAR*) state->ptr;
602}
603
Fredrik Lundh33accc12000-08-27 20:59:47 +0000604#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000605LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000606SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
607{
608 /* check if an SRE_OP_INFO block matches at the current position.
609 returns the number of SRE_CODE objects to skip if successful, 0
610 if no match */
611
612 SRE_CHAR* end = state->end;
613 SRE_CHAR* ptr = state->ptr;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000614 Py_ssize_t i;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000615
616 /* check minimal length */
617 if (pattern[3] && (end - ptr) < pattern[3])
618 return 0;
619
620 /* check known prefix */
621 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
622 /* <length> <skip> <prefix data> <overlap data> */
623 for (i = 0; i < pattern[5]; i++)
624 if ((SRE_CODE) ptr[i] != pattern[7 + i])
625 return 0;
626 return pattern[0] + 2 * pattern[6];
627 }
628 return pattern[0];
629}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000630#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000631
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000632/* The macros below should be used to protect recursive SRE_MATCH()
633 * calls that *failed* and do *not* return immediately (IOW, those
634 * that will backtrack). Explaining:
635 *
636 * - Recursive SRE_MATCH() returned true: that's usually a success
637 * (besides atypical cases like ASSERT_NOT), therefore there's no
638 * reason to restore lastmark;
639 *
640 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
641 * is returning to the caller: If the current SRE_MATCH() is the
642 * top function of the recursion, returning false will be a matching
643 * failure, and it doesn't matter where lastmark is pointing to.
644 * If it's *not* the top function, it will be a recursive SRE_MATCH()
645 * failure by itself, and the calling SRE_MATCH() will have to deal
646 * with the failure by the same rules explained here (it will restore
647 * lastmark by itself if necessary);
648 *
649 * - Recursive SRE_MATCH() returned false, and will continue the
650 * outside 'for' loop: must be protected when breaking, since the next
651 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000652 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000653 * - Recursive SRE_MATCH() returned false, and will be called again
654 * inside a local for/while loop: must be protected between each
655 * loop iteration, since the recursive SRE_MATCH() could do anything,
656 * and could potentially depend on lastmark.
657 *
658 * For more information, check the discussion at SF patch #712900.
659 */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000660#define LASTMARK_SAVE() \
661 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000662 ctx->lastmark = state->lastmark; \
663 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000664 } while (0)
665#define LASTMARK_RESTORE() \
666 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000667 state->lastmark = ctx->lastmark; \
668 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000669 } while (0)
670
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000671#define RETURN_ERROR(i) do { return i; } while(0)
672#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
673#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
674
675#define RETURN_ON_ERROR(i) \
676 do { if (i < 0) RETURN_ERROR(i); } while (0)
677#define RETURN_ON_SUCCESS(i) \
678 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
679#define RETURN_ON_FAILURE(i) \
680 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
681
682#define SFY(x) #x
683
684#define DATA_STACK_ALLOC(state, type, ptr) \
685do { \
686 alloc_pos = state->data_stack_base; \
687 TRACE(("allocating %s in %d (%d)\n", \
688 SFY(type), alloc_pos, sizeof(type))); \
689 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
690 int j = data_stack_grow(state, sizeof(type)); \
691 if (j < 0) return j; \
692 if (ctx_pos != -1) \
693 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
694 } \
695 ptr = (type*)(state->data_stack+alloc_pos); \
696 state->data_stack_base += sizeof(type); \
697} while (0)
698
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000699#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
700do { \
701 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
702 ptr = (type*)(state->data_stack+pos); \
703} while (0)
704
705#define DATA_STACK_PUSH(state, data, size) \
706do { \
707 TRACE(("copy data in %p to %d (%d)\n", \
708 data, state->data_stack_base, size)); \
709 if (state->data_stack_size < state->data_stack_base+size) { \
710 int j = data_stack_grow(state, size); \
711 if (j < 0) return j; \
712 if (ctx_pos != -1) \
713 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
714 } \
715 memcpy(state->data_stack+state->data_stack_base, data, size); \
716 state->data_stack_base += size; \
717} while (0)
718
719#define DATA_STACK_POP(state, data, size, discard) \
720do { \
721 TRACE(("copy data to %p from %d (%d)\n", \
722 data, state->data_stack_base-size, size)); \
723 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
724 if (discard) \
725 state->data_stack_base -= size; \
726} while (0)
727
728#define DATA_STACK_POP_DISCARD(state, size) \
729do { \
730 TRACE(("discard data from %d (%d)\n", \
731 state->data_stack_base-size, size)); \
732 state->data_stack_base -= size; \
733} while(0)
734
735#define DATA_PUSH(x) \
736 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
737#define DATA_POP(x) \
738 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000739#define DATA_POP_DISCARD(x) \
740 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
741#define DATA_ALLOC(t,p) \
742 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000743#define DATA_LOOKUP_AT(t,p,pos) \
744 DATA_STACK_LOOKUP_AT(state,t,p,pos)
745
746#define MARK_PUSH(lastmark) \
747 do if (lastmark > 0) { \
748 i = lastmark; /* ctx->lastmark may change if reallocated */ \
749 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
750 } while (0)
751#define MARK_POP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
754 } while (0)
755#define MARK_POP_KEEP(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
758 } while (0)
759#define MARK_POP_DISCARD(lastmark) \
760 do if (lastmark > 0) { \
761 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
762 } while (0)
763
764#define JUMP_NONE 0
765#define JUMP_MAX_UNTIL_1 1
766#define JUMP_MAX_UNTIL_2 2
767#define JUMP_MAX_UNTIL_3 3
768#define JUMP_MIN_UNTIL_1 4
769#define JUMP_MIN_UNTIL_2 5
770#define JUMP_MIN_UNTIL_3 6
771#define JUMP_REPEAT 7
772#define JUMP_REPEAT_ONE_1 8
773#define JUMP_REPEAT_ONE_2 9
774#define JUMP_MIN_REPEAT_ONE 10
775#define JUMP_BRANCH 11
776#define JUMP_ASSERT 12
777#define JUMP_ASSERT_NOT 13
778
779#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
780 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
781 nextctx->last_ctx_pos = ctx_pos; \
782 nextctx->jump = jumpvalue; \
783 nextctx->pattern = nextpattern; \
784 ctx_pos = alloc_pos; \
785 ctx = nextctx; \
786 goto entrance; \
787 jumplabel: \
788 while (0) /* gcc doesn't like labels at end of scopes */ \
789
790typedef struct {
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000791 Py_ssize_t last_ctx_pos;
792 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000793 SRE_CHAR* ptr;
794 SRE_CODE* pattern;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000795 Py_ssize_t count;
796 Py_ssize_t lastmark;
797 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000798 union {
799 SRE_CODE chr;
800 SRE_REPEAT* rep;
801 } u;
802} SRE_MATCH_CONTEXT;
803
804/* check if string matches the given pattern. returns <0 for
805 error, 0 for failure, and 1 for success */
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000806LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000807SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000808{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000809 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000810 Py_ssize_t alloc_pos, ctx_pos = -1;
811 Py_ssize_t i, ret = 0;
812 Py_ssize_t jump;
Facundo Batista4473d222008-01-08 21:10:12 +0000813 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000814
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000815 SRE_MATCH_CONTEXT* ctx;
816 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000817
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000818 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000819
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000820 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
821 ctx->last_ctx_pos = -1;
822 ctx->jump = JUMP_NONE;
823 ctx->pattern = pattern;
824 ctx_pos = alloc_pos;
825
826entrance:
827
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000828 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829
830 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000831 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000832 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000833 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000834 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000835 (end - ctx->ptr), ctx->pattern[3]));
836 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000837 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000838 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000839 }
840
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000841 for (;;) {
Facundo Batista4473d222008-01-08 21:10:12 +0000842 ++sigcount;
843 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
844 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000845
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000846 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000847
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000848 case SRE_OP_MARK:
849 /* set mark */
850 /* <MARK> <gid> */
851 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
852 ctx->ptr, ctx->pattern[0]));
853 i = ctx->pattern[0];
854 if (i & 1)
855 state->lastindex = i/2 + 1;
856 if (i > state->lastmark) {
857 /* state->lastmark is the highest valid index in the
858 state->mark array. If it is increased by more than 1,
859 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000860 that these marks have not been encountered. */
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000861 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000862 while (j < i)
863 state->mark[j++] = NULL;
864 state->lastmark = i;
865 }
866 state->mark[i] = ctx->ptr;
867 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000868 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000869
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 case SRE_OP_LITERAL:
871 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000872 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000873 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
874 ctx->ptr, *ctx->pattern));
875 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
876 RETURN_FAILURE;
877 ctx->pattern++;
878 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000879 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000880
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000881 case SRE_OP_NOT_LITERAL:
882 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000883 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000884 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
885 ctx->ptr, *ctx->pattern));
886 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
887 RETURN_FAILURE;
888 ctx->pattern++;
889 ctx->ptr++;
890 break;
891
892 case SRE_OP_SUCCESS:
893 /* end of pattern */
894 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
895 state->ptr = ctx->ptr;
896 RETURN_SUCCESS;
897
898 case SRE_OP_AT:
899 /* match at given position */
900 /* <AT> <code> */
901 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
902 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
903 RETURN_FAILURE;
904 ctx->pattern++;
905 break;
906
907 case SRE_OP_CATEGORY:
908 /* match at given category */
909 /* <CATEGORY> <code> */
910 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
911 ctx->ptr, *ctx->pattern));
912 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
913 RETURN_FAILURE;
914 ctx->pattern++;
915 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000916 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000918 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000919 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000920 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000921 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
922 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
923 RETURN_FAILURE;
924 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000925 break;
926
927 case SRE_OP_ANY_ALL:
928 /* match anything */
929 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000930 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
931 if (ctx->ptr >= end)
932 RETURN_FAILURE;
933 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000934 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000935
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000936 case SRE_OP_IN:
937 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000938 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000939 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
940 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
941 RETURN_FAILURE;
942 ctx->pattern += ctx->pattern[0];
943 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000944 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000945
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000946 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000947 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
948 ctx->pattern, ctx->ptr, ctx->pattern[0]));
949 if (ctx->ptr >= end ||
950 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
951 RETURN_FAILURE;
952 ctx->pattern++;
953 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000954 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000955
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000956 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000957 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
958 ctx->pattern, ctx->ptr, *ctx->pattern));
959 if (ctx->ptr >= end ||
960 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
961 RETURN_FAILURE;
962 ctx->pattern++;
963 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000964 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000965
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000966 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000967 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
968 if (ctx->ptr >= end
969 || !SRE_CHARSET(ctx->pattern+1,
970 (SRE_CODE)state->lower(*ctx->ptr)))
971 RETURN_FAILURE;
972 ctx->pattern += ctx->pattern[0];
973 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000974 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000975
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000976 case SRE_OP_JUMP:
977 case SRE_OP_INFO:
978 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000979 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000980 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
981 ctx->ptr, ctx->pattern[0]));
982 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000983 break;
984
985 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000986 /* alternation */
987 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000988 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000989 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000990 ctx->u.rep = state->repeat;
991 if (ctx->u.rep)
992 MARK_PUSH(ctx->lastmark);
993 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
994 if (ctx->pattern[1] == SRE_OP_LITERAL &&
995 (ctx->ptr >= end ||
996 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000997 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000998 if (ctx->pattern[1] == SRE_OP_IN &&
999 (ctx->ptr >= end ||
1000 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001001 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001002 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001003 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001004 if (ret) {
1005 if (ctx->u.rep)
1006 MARK_POP_DISCARD(ctx->lastmark);
1007 RETURN_ON_ERROR(ret);
1008 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001009 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001010 if (ctx->u.rep)
1011 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001012 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001013 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001014 if (ctx->u.rep)
1015 MARK_POP_DISCARD(ctx->lastmark);
1016 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001017
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018 case SRE_OP_REPEAT_ONE:
1019 /* match repeated sequence (maximizing regexp) */
1020
1021 /* this operator only works if the repeated item is
1022 exactly one character wide, and we're not already
1023 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001024 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001025
1026 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1027
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001028 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1029 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001030
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001031 if (ctx->ptr + ctx->pattern[1] > end)
1032 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001033
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001036 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1037 RETURN_ON_ERROR(ret);
1038 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1039 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001040 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001041
1042 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 string. check if the rest of the pattern matches,
1045 and backtrack if not. */
1046
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001047 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001048 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001049
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001050 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001051 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001052 state->ptr = ctx->ptr;
1053 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001054 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001056 LASTMARK_SAVE();
1057
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001058 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001059 /* tail starts with a literal. skip positions where
1060 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001062 for (;;) {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001063 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1065 ctx->ptr--;
1066 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001067 }
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001068 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001069 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001070 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001071 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1072 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073 if (ret) {
1074 RETURN_ON_ERROR(ret);
1075 RETURN_SUCCESS;
1076 }
Tim Peters3d563502006-01-21 02:47:53 +00001077
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001078 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001079
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001080 ctx->ptr--;
1081 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001082 }
1083
1084 } else {
1085 /* general case */
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001086 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001087 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001088 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1089 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001090 if (ret) {
1091 RETURN_ON_ERROR(ret);
1092 RETURN_SUCCESS;
1093 }
1094 ctx->ptr--;
1095 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001096 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001097 }
1098 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001099 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001100
Guido van Rossum41c99e72003-04-14 17:59:34 +00001101 case SRE_OP_MIN_REPEAT_ONE:
1102 /* match repeated sequence (minimizing regexp) */
1103
1104 /* this operator only works if the repeated item is
1105 exactly one character wide, and we're not already
1106 collecting backtracking points. for other cases,
1107 use the MIN_REPEAT operator */
1108
1109 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1110
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001111 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1112 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001113
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001114 if (ctx->ptr + ctx->pattern[1] > end)
1115 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001116
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001117 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001118
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001119 if (ctx->pattern[1] == 0)
1120 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001121 else {
1122 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001123 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1124 RETURN_ON_ERROR(ret);
1125 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001126 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001127 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001128 RETURN_FAILURE;
1129 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001130 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001131 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001132 }
1133
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001134 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001135 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 state->ptr = ctx->ptr;
1137 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001138
1139 } else {
1140 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001141 LASTMARK_SAVE();
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001142 while ((Py_ssize_t)ctx->pattern[2] == 65535
1143 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001144 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001145 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1146 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001147 if (ret) {
1148 RETURN_ON_ERROR(ret);
1149 RETURN_SUCCESS;
1150 }
1151 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001152 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001153 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001154 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001155 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001156 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001157 assert(ret == 1);
1158 ctx->ptr++;
1159 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001160 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001161 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001162 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001163 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001164
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001165 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001166 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001167 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001168 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001169 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1170 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001171
1172 /* install new repeat context */
Jack Diederich2d400772006-05-27 15:44:34 +00001173 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001174 if (!ctx->u.rep) {
1175 PyErr_NoMemory();
Neal Norwitzef0de022006-08-12 01:53:28 +00001176 RETURN_FAILURE;
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001177 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 ctx->u.rep->count = -1;
1179 ctx->u.rep->pattern = ctx->pattern;
1180 ctx->u.rep->prev = state->repeat;
1181 ctx->u.rep->last_ptr = NULL;
1182 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001183
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001184 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001185 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001186 state->repeat = ctx->u.rep->prev;
Jack Diederich2d400772006-05-27 15:44:34 +00001187 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001188
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001189 if (ret) {
1190 RETURN_ON_ERROR(ret);
1191 RETURN_SUCCESS;
1192 }
1193 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001194
1195 case SRE_OP_MAX_UNTIL:
1196 /* maximizing repeat */
1197 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1198
1199 /* FIXME: we probably need to deal with zero-width
1200 matches in here... */
1201
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 ctx->u.rep = state->repeat;
1203 if (!ctx->u.rep)
1204 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001207
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001208 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001209
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001210 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1211 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001212
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001213 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001214 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001215 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001216 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1217 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001218 if (ret) {
1219 RETURN_ON_ERROR(ret);
1220 RETURN_SUCCESS;
1221 }
1222 ctx->u.rep->count = ctx->count-1;
1223 state->ptr = ctx->ptr;
1224 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001225 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001226
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 if ((ctx->count < ctx->u.rep->pattern[2] ||
1228 ctx->u.rep->pattern[2] == 65535) &&
1229 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001230 /* we may have enough matches, but if we can
1231 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001232 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001233 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001234 MARK_PUSH(ctx->lastmark);
1235 /* zero-width match protection */
1236 DATA_PUSH(&ctx->u.rep->last_ptr);
1237 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001238 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1239 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001240 DATA_POP(&ctx->u.rep->last_ptr);
1241 if (ret) {
1242 MARK_POP_DISCARD(ctx->lastmark);
1243 RETURN_ON_ERROR(ret);
1244 RETURN_SUCCESS;
1245 }
1246 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001247 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 ctx->u.rep->count = ctx->count-1;
1249 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001250 }
1251
1252 /* cannot match more repeated items here. make sure the
1253 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001254 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001255 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001256 RETURN_ON_SUCCESS(ret);
1257 state->repeat = ctx->u.rep;
1258 state->ptr = ctx->ptr;
1259 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001260
1261 case SRE_OP_MIN_UNTIL:
1262 /* minimizing repeat */
1263 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1264
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 ctx->u.rep = state->repeat;
1266 if (!ctx->u.rep)
1267 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001270
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001271 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001272
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001273 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1274 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001275
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001276 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001277 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001278 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001279 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1280 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001281 if (ret) {
1282 RETURN_ON_ERROR(ret);
1283 RETURN_SUCCESS;
1284 }
1285 ctx->u.rep->count = ctx->count-1;
1286 state->ptr = ctx->ptr;
1287 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001288 }
1289
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001290 LASTMARK_SAVE();
1291
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001292 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001293 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001294 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001295 if (ret) {
1296 RETURN_ON_ERROR(ret);
1297 RETURN_SUCCESS;
1298 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001299
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001300 state->repeat = ctx->u.rep;
1301 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001302
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001303 LASTMARK_RESTORE();
1304
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001305 if (ctx->count >= ctx->u.rep->pattern[2]
1306 && ctx->u.rep->pattern[2] != 65535)
1307 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001308
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001309 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001310 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1311 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001312 if (ret) {
1313 RETURN_ON_ERROR(ret);
1314 RETURN_SUCCESS;
1315 }
1316 ctx->u.rep->count = ctx->count-1;
1317 state->ptr = ctx->ptr;
1318 RETURN_FAILURE;
1319
1320 case SRE_OP_GROUPREF:
1321 /* match backreference */
1322 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1323 ctx->ptr, ctx->pattern[0]));
1324 i = ctx->pattern[0];
1325 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001326 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001327 if (groupref >= state->lastmark) {
1328 RETURN_FAILURE;
1329 } else {
1330 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1331 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1332 if (!p || !e || e < p)
1333 RETURN_FAILURE;
1334 while (p < e) {
1335 if (ctx->ptr >= end || *ctx->ptr != *p)
1336 RETURN_FAILURE;
1337 p++; ctx->ptr++;
1338 }
1339 }
1340 }
1341 ctx->pattern++;
1342 break;
1343
1344 case SRE_OP_GROUPREF_IGNORE:
1345 /* match backreference */
1346 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1347 ctx->ptr, ctx->pattern[0]));
1348 i = ctx->pattern[0];
1349 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001350 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001351 if (groupref >= state->lastmark) {
1352 RETURN_FAILURE;
1353 } else {
1354 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1355 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1356 if (!p || !e || e < p)
1357 RETURN_FAILURE;
1358 while (p < e) {
1359 if (ctx->ptr >= end ||
1360 state->lower(*ctx->ptr) != state->lower(*p))
1361 RETURN_FAILURE;
1362 p++; ctx->ptr++;
1363 }
1364 }
1365 }
1366 ctx->pattern++;
1367 break;
1368
1369 case SRE_OP_GROUPREF_EXISTS:
1370 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1371 ctx->ptr, ctx->pattern[0]));
1372 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1373 i = ctx->pattern[0];
1374 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001375 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001376 if (groupref >= state->lastmark) {
1377 ctx->pattern += ctx->pattern[1];
1378 break;
1379 } else {
1380 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1381 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1382 if (!p || !e || e < p) {
1383 ctx->pattern += ctx->pattern[1];
1384 break;
1385 }
1386 }
1387 }
1388 ctx->pattern += 2;
1389 break;
1390
1391 case SRE_OP_ASSERT:
1392 /* assert subpattern */
1393 /* <ASSERT> <skip> <back> <pattern> */
1394 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1395 ctx->ptr, ctx->pattern[1]));
1396 state->ptr = ctx->ptr - ctx->pattern[1];
1397 if (state->ptr < state->beginning)
1398 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001399 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001400 RETURN_ON_FAILURE(ret);
1401 ctx->pattern += ctx->pattern[0];
1402 break;
1403
1404 case SRE_OP_ASSERT_NOT:
1405 /* assert not subpattern */
1406 /* <ASSERT_NOT> <skip> <back> <pattern> */
1407 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1408 ctx->ptr, ctx->pattern[1]));
1409 state->ptr = ctx->ptr - ctx->pattern[1];
1410 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001411 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001412 if (ret) {
1413 RETURN_ON_ERROR(ret);
1414 RETURN_FAILURE;
1415 }
1416 }
1417 ctx->pattern += ctx->pattern[0];
1418 break;
1419
1420 case SRE_OP_FAILURE:
1421 /* immediate failure */
1422 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1423 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001424
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001425 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001426 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1427 ctx->pattern[-1]));
1428 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001429 }
1430 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001431
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001432exit:
1433 ctx_pos = ctx->last_ctx_pos;
1434 jump = ctx->jump;
1435 DATA_POP_DISCARD(ctx);
1436 if (ctx_pos == -1)
1437 return ret;
1438 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1439
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001440 switch (jump) {
1441 case JUMP_MAX_UNTIL_2:
1442 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1443 goto jump_max_until_2;
1444 case JUMP_MAX_UNTIL_3:
1445 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1446 goto jump_max_until_3;
1447 case JUMP_MIN_UNTIL_2:
1448 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1449 goto jump_min_until_2;
1450 case JUMP_MIN_UNTIL_3:
1451 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1452 goto jump_min_until_3;
1453 case JUMP_BRANCH:
1454 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1455 goto jump_branch;
1456 case JUMP_MAX_UNTIL_1:
1457 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1458 goto jump_max_until_1;
1459 case JUMP_MIN_UNTIL_1:
1460 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1461 goto jump_min_until_1;
1462 case JUMP_REPEAT:
1463 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1464 goto jump_repeat;
1465 case JUMP_REPEAT_ONE_1:
1466 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1467 goto jump_repeat_one_1;
1468 case JUMP_REPEAT_ONE_2:
1469 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1470 goto jump_repeat_one_2;
1471 case JUMP_MIN_REPEAT_ONE:
1472 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1473 goto jump_min_repeat_one;
1474 case JUMP_ASSERT:
1475 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1476 goto jump_assert;
1477 case JUMP_ASSERT_NOT:
1478 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1479 goto jump_assert_not;
1480 case JUMP_NONE:
1481 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1482 break;
1483 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001484
1485 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001486}
1487
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001488LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001489SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1490{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001491 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1492 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001493 Py_ssize_t status = 0;
1494 Py_ssize_t prefix_len = 0;
1495 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001496 SRE_CODE* prefix = NULL;
1497 SRE_CODE* charset = NULL;
1498 SRE_CODE* overlap = NULL;
1499 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001500
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001501 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001502 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001503 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001504
1505 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001506
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001507 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001508 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001509 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001510 end -= pattern[3]-1;
1511 if (end <= ptr)
1512 end = ptr+1;
1513 }
1514
Fredrik Lundh3562f112000-07-02 12:00:07 +00001515 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001516 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001517 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001518 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001519 prefix_skip = pattern[6];
1520 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001521 overlap = prefix + prefix_len - 1;
1522 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001523 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001524 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001525 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001526
1527 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001528 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001529
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001530 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1531 TRACE(("charset = %p\n", charset));
1532
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001533#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001534 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001535 /* pattern starts with a known prefix. use the overlap
1536 table to skip forward as fast as we possibly can */
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001537 Py_ssize_t i = 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001538 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001539 while (ptr < end) {
1540 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001541 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001542 if (!i)
1543 break;
1544 else
1545 i = overlap[i];
1546 } else {
1547 if (++i == prefix_len) {
1548 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001549 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1550 state->start = ptr + 1 - prefix_len;
1551 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001552 if (flags & SRE_INFO_LITERAL)
1553 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001554 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001555 if (status != 0)
1556 return status;
1557 /* close but no cigar -- try again */
1558 i = overlap[i];
1559 }
1560 break;
1561 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001562 }
1563 ptr++;
1564 }
1565 return 0;
1566 }
1567#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001568
Fredrik Lundh3562f112000-07-02 12:00:07 +00001569 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001570 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001571 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001572 SRE_CODE chr = pattern[1];
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001573 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001574 for (;;) {
1575 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1576 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001577 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001578 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001579 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 state->start = ptr;
1581 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001582 if (flags & SRE_INFO_LITERAL)
1583 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001584 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001585 if (status != 0)
1586 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001587 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 } else if (charset) {
1589 /* pattern starts with a character from a known set */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001590 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001592 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001594 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001595 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001596 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001597 state->start = ptr;
1598 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001599 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001600 if (status != 0)
1601 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001602 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001603 }
1604 } else
1605 /* general case */
1606 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001607 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001608 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001609 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001610 if (status != 0)
1611 break;
1612 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001613
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001614 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001615}
Tim Peters3d563502006-01-21 02:47:53 +00001616
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001617LOCAL(int)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001618SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001619{
1620 /* check if given string is a literal template (i.e. no escapes) */
1621 while (len-- > 0)
1622 if (*ptr++ == '\\')
1623 return 0;
1624 return 1;
1625}
Guido van Rossumb700df92000-03-31 14:59:30 +00001626
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001627#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001628
1629/* -------------------------------------------------------------------- */
1630/* factories and destructors */
1631
1632/* see sre.h for object declarations */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001633static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1634static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001635
1636static PyObject *
Georg Brandl964f5972006-05-28 22:38:57 +00001637sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001638{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001639 return Py_BuildValue("l", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001640}
1641
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001642static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001643sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001644{
1645 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001646 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001647 return NULL;
1648 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001649 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001650 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001651#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001652 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001653#else
1654 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001655#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001656 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001657}
1658
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001659LOCAL(void)
1660state_reset(SRE_STATE* state)
1661{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001662 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001663 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001664
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001665 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001666 state->lastindex = -1;
1667
1668 state->repeat = NULL;
1669
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001670 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001671}
1672
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001673static void*
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001674getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001675{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001676 /* given a python object, return a data pointer, a length (in
1677 characters), and a character size. return NULL if the object
1678 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001679
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001680 PyBufferProcs *buffer;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001681 Py_ssize_t size, bytes;
1682 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001683 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001684
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001685#if defined(HAVE_UNICODE)
1686 if (PyUnicode_Check(string)) {
1687 /* unicode strings doesn't always support the buffer interface */
1688 ptr = (void*) PyUnicode_AS_DATA(string);
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00001689 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001690 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001691 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001692
1693 } else {
1694#endif
1695
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001696 /* get pointer to string buffer */
Christian Heimese93237d2007-12-19 02:37:44 +00001697 buffer = Py_TYPE(string)->tp_as_buffer;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001698 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1699 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001700 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001701 return NULL;
1702 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001703
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001704 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001705 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1706 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001707 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1708 return NULL;
1709 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001710
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001711 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001712#if PY_VERSION_HEX >= 0x01060000
1713 size = PyObject_Size(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001714#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001715 size = PyObject_Length(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001716#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001717
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001718 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001719 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001720#if defined(HAVE_UNICODE)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001721 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001722 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001723#endif
1724 else {
1725 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1726 return NULL;
1727 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001728
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001729#if defined(HAVE_UNICODE)
1730 }
1731#endif
1732
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001733 *p_length = size;
1734 *p_charsize = charsize;
1735
1736 return ptr;
1737}
1738
1739LOCAL(PyObject*)
1740state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001741 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001742{
1743 /* prepare state object */
1744
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001745 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001746 int charsize;
1747 void* ptr;
1748
1749 memset(state, 0, sizeof(SRE_STATE));
1750
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001751 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001752 state->lastindex = -1;
1753
1754 ptr = getstring(string, &length, &charsize);
1755 if (!ptr)
1756 return NULL;
1757
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001758 /* adjust boundaries */
1759 if (start < 0)
1760 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001761 else if (start > length)
1762 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001763
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001764 if (end < 0)
1765 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001766 else if (end > length)
1767 end = length;
1768
1769 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001770
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001771 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001772
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001773 state->start = (void*) ((char*) ptr + start * state->charsize);
1774 state->end = (void*) ((char*) ptr + end * state->charsize);
1775
1776 Py_INCREF(string);
1777 state->string = string;
1778 state->pos = start;
1779 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001780
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001781 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001782 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001783 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001784#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001785 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001786#else
1787 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001788#endif
1789 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001790 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001791
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001792 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001793}
1794
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001795LOCAL(void)
1796state_fini(SRE_STATE* state)
1797{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001798 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001799 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001800}
1801
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001802/* calculate offset from start of string */
1803#define STATE_OFFSET(state, member)\
1804 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1805
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001806LOCAL(PyObject*)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001807state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001808{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001809 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001810
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001811 index = (index - 1) * 2;
1812
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001813 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001814 if (empty)
1815 /* want empty string */
1816 i = j = 0;
1817 else {
1818 Py_INCREF(Py_None);
1819 return Py_None;
1820 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001821 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001822 i = STATE_OFFSET(state, state->mark[index]);
1823 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001824 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001825
Fredrik Lundh58100642000-08-09 09:14:35 +00001826 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001827}
1828
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001829static void
1830pattern_error(int status)
1831{
1832 switch (status) {
1833 case SRE_ERROR_RECURSION_LIMIT:
1834 PyErr_SetString(
1835 PyExc_RuntimeError,
1836 "maximum recursion limit exceeded"
1837 );
1838 break;
1839 case SRE_ERROR_MEMORY:
1840 PyErr_NoMemory();
1841 break;
Facundo Batista4473d222008-01-08 21:10:12 +00001842 case SRE_ERROR_INTERRUPTED:
1843 /* An exception has already been raised, so let it fly */
1844 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001845 default:
1846 /* other error codes indicate compiler/engine bugs */
1847 PyErr_SetString(
1848 PyExc_RuntimeError,
1849 "internal error in regular expression engine"
1850 );
1851 }
1852}
1853
Guido van Rossumb700df92000-03-31 14:59:30 +00001854static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001855pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001856{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001857 if (self->weakreflist != NULL)
1858 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001859 Py_XDECREF(self->pattern);
1860 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001861 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001862 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001863}
1864
1865static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001866pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001867{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001868 SRE_STATE state;
1869 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001870
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001871 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001872 Py_ssize_t start = 0;
1873 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001874 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001875 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001876 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001877 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001878
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001879 string = state_init(&state, self, string, start, end);
1880 if (!string)
1881 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001882
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001883 state.ptr = state.start;
1884
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001885 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1886
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001887 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001888 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001889 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001890#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001891 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001892#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001893 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001894
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001895 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001896 if (PyErr_Occurred())
1897 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001898
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001899 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001900
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001901 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001902}
1903
1904static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001905pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001906{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001907 SRE_STATE state;
1908 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001909
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001910 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001911 Py_ssize_t start = 0;
1912 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001913 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001914 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001915 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001918 string = state_init(&state, self, string, start, end);
1919 if (!string)
1920 return NULL;
1921
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001922 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1923
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001924 if (state.charsize == 1) {
1925 status = sre_search(&state, PatternObject_GetCode(self));
1926 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001927#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001928 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001929#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001930 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001931
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001932 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1933
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001934 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001935
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001936 if (PyErr_Occurred())
1937 return NULL;
1938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001939 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001940}
1941
1942static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001943call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001944{
1945 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001946 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001947 PyObject* func;
1948 PyObject* result;
1949
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001950 if (!args)
1951 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001952 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001953 if (!name)
1954 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001955 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001956 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001957 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001958 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001959 func = PyObject_GetAttrString(mod, function);
1960 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001961 if (!func)
1962 return NULL;
1963 result = PyObject_CallObject(func, args);
1964 Py_DECREF(func);
1965 Py_DECREF(args);
1966 return result;
1967}
1968
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001969#ifdef USE_BUILTIN_COPY
1970static int
1971deepcopy(PyObject** object, PyObject* memo)
1972{
1973 PyObject* copy;
1974
1975 copy = call(
1976 "copy", "deepcopy",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001977 PyTuple_Pack(2, *object, memo)
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001978 );
1979 if (!copy)
1980 return 0;
1981
1982 Py_DECREF(*object);
1983 *object = copy;
1984
1985 return 1; /* success */
1986}
1987#endif
1988
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001989static PyObject*
Guido van Rossum1ff91d92007-09-10 22:02:25 +00001990join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001991{
1992 /* join list elements */
1993
1994 PyObject* joiner;
1995#if PY_VERSION_HEX >= 0x01060000
1996 PyObject* function;
1997 PyObject* args;
1998#endif
1999 PyObject* result;
2000
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002001 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002002 if (!joiner)
2003 return NULL;
2004
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002005 if (PyList_GET_SIZE(list) == 0) {
2006 Py_DECREF(list);
2007 return joiner;
2008 }
2009
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002010#if PY_VERSION_HEX >= 0x01060000
2011 function = PyObject_GetAttrString(joiner, "join");
2012 if (!function) {
2013 Py_DECREF(joiner);
2014 return NULL;
2015 }
2016 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002017 if (!args) {
2018 Py_DECREF(function);
2019 Py_DECREF(joiner);
2020 return NULL;
2021 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002022 PyTuple_SET_ITEM(args, 0, list);
2023 result = PyObject_CallObject(function, args);
2024 Py_DECREF(args); /* also removes list */
2025 Py_DECREF(function);
2026#else
2027 result = call(
2028 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002029 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002030 );
2031#endif
2032 Py_DECREF(joiner);
2033
2034 return result;
2035}
2036
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002037static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002038pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002039{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002040 SRE_STATE state;
2041 PyObject* list;
2042 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002043 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002044
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002045 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002046 Py_ssize_t start = 0;
2047 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002048 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002049 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002050 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002051 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002052
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002053 string = state_init(&state, self, string, start, end);
2054 if (!string)
2055 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002056
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002057 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002058 if (!list) {
2059 state_fini(&state);
2060 return NULL;
2061 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002062
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002064
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002065 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002066
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002067 state_reset(&state);
2068
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002069 state.ptr = state.start;
2070
2071 if (state.charsize == 1) {
2072 status = sre_search(&state, PatternObject_GetCode(self));
2073 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002074#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002075 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002076#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002077 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002078
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002079 if (PyErr_Occurred())
2080 goto error;
2081
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002082 if (status <= 0) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002083 if (status == 0)
2084 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002085 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002086 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002087 }
Tim Peters3d563502006-01-21 02:47:53 +00002088
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002089 /* don't bother to build a match object */
2090 switch (self->groups) {
2091 case 0:
2092 b = STATE_OFFSET(&state, state.start);
2093 e = STATE_OFFSET(&state, state.ptr);
2094 item = PySequence_GetSlice(string, b, e);
2095 if (!item)
2096 goto error;
2097 break;
2098 case 1:
2099 item = state_getslice(&state, 1, string, 1);
2100 if (!item)
2101 goto error;
2102 break;
2103 default:
2104 item = PyTuple_New(self->groups);
2105 if (!item)
2106 goto error;
2107 for (i = 0; i < self->groups; i++) {
2108 PyObject* o = state_getslice(&state, i+1, string, 1);
2109 if (!o) {
2110 Py_DECREF(item);
2111 goto error;
2112 }
2113 PyTuple_SET_ITEM(item, i, o);
2114 }
2115 break;
2116 }
2117
2118 status = PyList_Append(list, item);
2119 Py_DECREF(item);
2120 if (status < 0)
2121 goto error;
2122
2123 if (state.ptr == state.start)
2124 state.start = (void*) ((char*) state.ptr + state.charsize);
2125 else
2126 state.start = state.ptr;
2127
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002128 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002129
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002130 state_fini(&state);
2131 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002132
2133error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002134 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002135 state_fini(&state);
2136 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002137
Guido van Rossumb700df92000-03-31 14:59:30 +00002138}
2139
Fredrik Lundh703ce812001-10-24 22:16:30 +00002140#if PY_VERSION_HEX >= 0x02020000
2141static PyObject*
2142pattern_finditer(PatternObject* pattern, PyObject* args)
2143{
2144 PyObject* scanner;
2145 PyObject* search;
2146 PyObject* iterator;
2147
2148 scanner = pattern_scanner(pattern, args);
2149 if (!scanner)
2150 return NULL;
2151
2152 search = PyObject_GetAttrString(scanner, "search");
2153 Py_DECREF(scanner);
2154 if (!search)
2155 return NULL;
2156
2157 iterator = PyCallIter_New(search, Py_None);
2158 Py_DECREF(search);
2159
2160 return iterator;
2161}
2162#endif
2163
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002164static PyObject*
2165pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2166{
2167 SRE_STATE state;
2168 PyObject* list;
2169 PyObject* item;
2170 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002171 Py_ssize_t n;
2172 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002173 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002174
2175 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002176 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002177 static char* kwlist[] = { "source", "maxsplit", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002178 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002179 &string, &maxsplit))
2180 return NULL;
2181
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002182 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002183 if (!string)
2184 return NULL;
2185
2186 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002187 if (!list) {
2188 state_fini(&state);
2189 return NULL;
2190 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002191
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002192 n = 0;
2193 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002194
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002195 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002196
2197 state_reset(&state);
2198
2199 state.ptr = state.start;
2200
2201 if (state.charsize == 1) {
2202 status = sre_search(&state, PatternObject_GetCode(self));
2203 } else {
2204#if defined(HAVE_UNICODE)
2205 status = sre_usearch(&state, PatternObject_GetCode(self));
2206#endif
2207 }
2208
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002209 if (PyErr_Occurred())
2210 goto error;
2211
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002212 if (status <= 0) {
2213 if (status == 0)
2214 break;
2215 pattern_error(status);
2216 goto error;
2217 }
Tim Peters3d563502006-01-21 02:47:53 +00002218
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002219 if (state.start == state.ptr) {
2220 if (last == state.end)
2221 break;
2222 /* skip one character */
2223 state.start = (void*) ((char*) state.ptr + state.charsize);
2224 continue;
2225 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002226
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002227 /* get segment before this match */
2228 item = PySequence_GetSlice(
2229 string, STATE_OFFSET(&state, last),
2230 STATE_OFFSET(&state, state.start)
2231 );
2232 if (!item)
2233 goto error;
2234 status = PyList_Append(list, item);
2235 Py_DECREF(item);
2236 if (status < 0)
2237 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002238
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002239 /* add groups (if any) */
2240 for (i = 0; i < self->groups; i++) {
2241 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002242 if (!item)
2243 goto error;
2244 status = PyList_Append(list, item);
2245 Py_DECREF(item);
2246 if (status < 0)
2247 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002248 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002249
2250 n = n + 1;
2251
2252 last = state.start = state.ptr;
2253
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002254 }
2255
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002256 /* get segment following last match (even if empty) */
2257 item = PySequence_GetSlice(
2258 string, STATE_OFFSET(&state, last), state.endpos
2259 );
2260 if (!item)
2261 goto error;
2262 status = PyList_Append(list, item);
2263 Py_DECREF(item);
2264 if (status < 0)
2265 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002266
2267 state_fini(&state);
2268 return list;
2269
2270error:
2271 Py_DECREF(list);
2272 state_fini(&state);
2273 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002274
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002275}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002276
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002277static PyObject*
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002278pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002279 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002280{
2281 SRE_STATE state;
2282 PyObject* list;
2283 PyObject* item;
2284 PyObject* filter;
2285 PyObject* args;
2286 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002287 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002288 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002289 Py_ssize_t n;
2290 Py_ssize_t i, b, e;
2291 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002292 int filter_is_callable;
2293
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002294 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002295 /* sub/subn takes either a function or a template */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002296 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002297 Py_INCREF(filter);
2298 filter_is_callable = 1;
2299 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002300 /* if not callable, check if it's a literal string */
2301 int literal;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002302 ptr = getstring(ptemplate, &n, &bint);
2303 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002304 if (ptr) {
2305 if (b == 1) {
Skip Montanaro816a1622006-04-18 11:53:09 +00002306 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002307 } else {
2308#if defined(HAVE_UNICODE)
Skip Montanaro816a1622006-04-18 11:53:09 +00002309 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002310#endif
2311 }
2312 } else {
2313 PyErr_Clear();
2314 literal = 0;
2315 }
2316 if (literal) {
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002317 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002318 Py_INCREF(filter);
2319 filter_is_callable = 0;
2320 } else {
2321 /* not a literal; hand it over to the template compiler */
2322 filter = call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00002323 SRE_PY_MODULE, "_subx",
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002324 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002325 );
2326 if (!filter)
2327 return NULL;
2328 filter_is_callable = PyCallable_Check(filter);
2329 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002330 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002331
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002332 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002333 if (!string) {
2334 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002335 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002336 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002337
2338 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002339 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002340 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002341 state_fini(&state);
2342 return NULL;
2343 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002344
2345 n = i = 0;
2346
2347 while (!count || n < count) {
2348
2349 state_reset(&state);
2350
2351 state.ptr = state.start;
2352
2353 if (state.charsize == 1) {
2354 status = sre_search(&state, PatternObject_GetCode(self));
2355 } else {
2356#if defined(HAVE_UNICODE)
2357 status = sre_usearch(&state, PatternObject_GetCode(self));
2358#endif
2359 }
2360
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002361 if (PyErr_Occurred())
2362 goto error;
2363
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002364 if (status <= 0) {
2365 if (status == 0)
2366 break;
2367 pattern_error(status);
2368 goto error;
2369 }
Tim Peters3d563502006-01-21 02:47:53 +00002370
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002371 b = STATE_OFFSET(&state, state.start);
2372 e = STATE_OFFSET(&state, state.ptr);
2373
2374 if (i < b) {
2375 /* get segment before this match */
2376 item = PySequence_GetSlice(string, i, b);
2377 if (!item)
2378 goto error;
2379 status = PyList_Append(list, item);
2380 Py_DECREF(item);
2381 if (status < 0)
2382 goto error;
2383
2384 } else if (i == b && i == e && n > 0)
2385 /* ignore empty match on latest position */
2386 goto next;
2387
2388 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002389 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002390 match = pattern_new_match(self, &state, 1);
2391 if (!match)
2392 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002393 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002394 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002395 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002396 goto error;
2397 }
2398 item = PyObject_CallObject(filter, args);
2399 Py_DECREF(args);
2400 Py_DECREF(match);
2401 if (!item)
2402 goto error;
2403 } else {
2404 /* filter is literal string */
2405 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002406 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002407 }
2408
2409 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002410 if (item != Py_None) {
2411 status = PyList_Append(list, item);
2412 Py_DECREF(item);
2413 if (status < 0)
2414 goto error;
2415 }
Tim Peters3d563502006-01-21 02:47:53 +00002416
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002417 i = e;
2418 n = n + 1;
2419
2420next:
2421 /* move on */
2422 if (state.ptr == state.start)
2423 state.start = (void*) ((char*) state.ptr + state.charsize);
2424 else
2425 state.start = state.ptr;
2426
2427 }
2428
2429 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002430 if (i < state.endpos) {
2431 item = PySequence_GetSlice(string, i, state.endpos);
2432 if (!item)
2433 goto error;
2434 status = PyList_Append(list, item);
2435 Py_DECREF(item);
2436 if (status < 0)
2437 goto error;
2438 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002439
2440 state_fini(&state);
2441
Guido van Rossum4e173842001-12-07 04:25:10 +00002442 Py_DECREF(filter);
2443
Fredrik Lundhdac58492001-10-21 21:48:30 +00002444 /* convert list to single string (also removes list) */
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002445 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002446
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002447 if (!item)
2448 return NULL;
2449
2450 if (subn)
2451 return Py_BuildValue("Ni", item, n);
2452
2453 return item;
2454
2455error:
2456 Py_DECREF(list);
2457 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002458 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002459 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002460
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002461}
2462
2463static PyObject*
2464pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2465{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002466 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002467 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002468 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002469 static char* kwlist[] = { "repl", "string", "count", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002470 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002471 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002472 return NULL;
2473
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002474 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002475}
2476
2477static PyObject*
2478pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2479{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002480 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002481 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002482 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002483 static char* kwlist[] = { "repl", "string", "count", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002484 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002485 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002486 return NULL;
2487
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002488 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002489}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002490
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002491static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00002492pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002493{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002494#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002495 PatternObject* copy;
2496 int offset;
2497
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002498 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2499 if (!copy)
2500 return NULL;
2501
2502 offset = offsetof(PatternObject, groups);
2503
2504 Py_XINCREF(self->groupindex);
2505 Py_XINCREF(self->indexgroup);
2506 Py_XINCREF(self->pattern);
2507
2508 memcpy((char*) copy + offset, (char*) self + offset,
2509 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002510 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002511
2512 return (PyObject*) copy;
2513#else
2514 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2515 return NULL;
2516#endif
2517}
2518
2519static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00002520pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002521{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002522#ifdef USE_BUILTIN_COPY
2523 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002524
Georg Brandlfbef5882006-05-28 22:14:04 +00002525 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002526 if (!copy)
2527 return NULL;
2528
2529 if (!deepcopy(&copy->groupindex, memo) ||
2530 !deepcopy(&copy->indexgroup, memo) ||
2531 !deepcopy(&copy->pattern, memo)) {
2532 Py_DECREF(copy);
2533 return NULL;
2534 }
2535
2536#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002537 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2538 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002539#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002540}
2541
Raymond Hettinger94478742004-09-24 04:31:19 +00002542PyDoc_STRVAR(pattern_match_doc,
2543"match(string[, pos[, endpos]]) --> match object or None.\n\
2544 Matches zero or more characters at the beginning of the string");
2545
2546PyDoc_STRVAR(pattern_search_doc,
2547"search(string[, pos[, endpos]]) --> match object or None.\n\
2548 Scan through string looking for a match, and return a corresponding\n\
2549 MatchObject instance. Return None if no position in the string matches.");
2550
2551PyDoc_STRVAR(pattern_split_doc,
2552"split(string[, maxsplit = 0]) --> list.\n\
2553 Split string by the occurrences of pattern.");
2554
2555PyDoc_STRVAR(pattern_findall_doc,
2556"findall(string[, pos[, endpos]]) --> list.\n\
2557 Return a list of all non-overlapping matches of pattern in string.");
2558
2559PyDoc_STRVAR(pattern_finditer_doc,
2560"finditer(string[, pos[, endpos]]) --> iterator.\n\
2561 Return an iterator over all non-overlapping matches for the \n\
2562 RE pattern in string. For each match, the iterator returns a\n\
2563 match object.");
2564
2565PyDoc_STRVAR(pattern_sub_doc,
2566"sub(repl, string[, count = 0]) --> newstring\n\
2567 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002568 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002569
2570PyDoc_STRVAR(pattern_subn_doc,
2571"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2572 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2573 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002574 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002575
2576PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2577
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002578static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002579 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002580 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002581 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002582 pattern_search_doc},
2583 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2584 pattern_sub_doc},
2585 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2586 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002587 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002588 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002589 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002590 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002591#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002592 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2593 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002594#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002595 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Georg Brandlfbef5882006-05-28 22:14:04 +00002596 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2597 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002598 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002599};
2600
Tim Peters3d563502006-01-21 02:47:53 +00002601static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002602pattern_getattr(PatternObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00002603{
2604 PyObject* res;
2605
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002606 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
Guido van Rossumb700df92000-03-31 14:59:30 +00002607
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002608 if (res)
2609 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00002610
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002611 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00002612
2613 /* attributes */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002614 if (!strcmp(name, "pattern")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002615 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002616 return self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00002617 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002618
2619 if (!strcmp(name, "flags"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002620 return Py_BuildValue("i", self->flags);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002621
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002622 if (!strcmp(name, "groups"))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002623 return Py_BuildValue("i", self->groups);
Fredrik Lundh01016fe2000-06-30 00:27:46 +00002624
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002625 if (!strcmp(name, "groupindex") && self->groupindex) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002626 Py_INCREF(self->groupindex);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002627 return self->groupindex;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002628 }
2629
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002630 PyErr_SetString(PyExc_AttributeError, name);
2631 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002632}
2633
2634statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002635 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002636 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002637 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002638 (destructor)pattern_dealloc, /*tp_dealloc*/
2639 0, /*tp_print*/
Raymond Hettinger027bb632004-05-31 03:09:25 +00002640 (getattrfunc)pattern_getattr, /*tp_getattr*/
2641 0, /* tp_setattr */
2642 0, /* tp_compare */
2643 0, /* tp_repr */
2644 0, /* tp_as_number */
2645 0, /* tp_as_sequence */
2646 0, /* tp_as_mapping */
2647 0, /* tp_hash */
2648 0, /* tp_call */
2649 0, /* tp_str */
2650 0, /* tp_getattro */
2651 0, /* tp_setattro */
2652 0, /* tp_as_buffer */
2653 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002654 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002655 0, /* tp_traverse */
2656 0, /* tp_clear */
2657 0, /* tp_richcompare */
2658 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002659};
2660
Guido van Rossum8b762f02008-08-05 03:39:21 +00002661static int _validate(PatternObject *self); /* Forward */
2662
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002663static PyObject *
2664_compile(PyObject* self_, PyObject* args)
2665{
2666 /* "compile" pattern descriptor to pattern object */
2667
2668 PatternObject* self;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002669 Py_ssize_t i, n;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002670
2671 PyObject* pattern;
2672 int flags = 0;
2673 PyObject* code;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002674 Py_ssize_t groups = 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002675 PyObject* groupindex = NULL;
2676 PyObject* indexgroup = NULL;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002677 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002678 &PyList_Type, &code, &groups,
2679 &groupindex, &indexgroup))
2680 return NULL;
2681
2682 n = PyList_GET_SIZE(code);
Christian Heimes4956d2b2008-01-18 19:12:56 +00002683 /* coverity[ampersand_in_size] */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002684 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2685 if (!self)
2686 return NULL;
Antoine Pitrouefdddd32010-01-14 17:25:24 +00002687 self->weakreflist = NULL;
2688 self->pattern = NULL;
2689 self->groupindex = NULL;
2690 self->indexgroup = NULL;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002691
2692 self->codesize = n;
2693
2694 for (i = 0; i < n; i++) {
2695 PyObject *o = PyList_GET_ITEM(code, i);
2696 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2697 : PyLong_AsUnsignedLong(o);
2698 self->code[i] = (SRE_CODE) value;
2699 if ((unsigned long) self->code[i] != value) {
2700 PyErr_SetString(PyExc_OverflowError,
2701 "regular expression code size limit exceeded");
2702 break;
2703 }
2704 }
2705
2706 if (PyErr_Occurred()) {
Antoine Pitrouefdddd32010-01-14 17:25:24 +00002707 Py_DECREF(self);
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002708 return NULL;
2709 }
2710
2711 Py_INCREF(pattern);
2712 self->pattern = pattern;
2713
2714 self->flags = flags;
2715
2716 self->groups = groups;
2717
2718 Py_XINCREF(groupindex);
2719 self->groupindex = groupindex;
2720
2721 Py_XINCREF(indexgroup);
2722 self->indexgroup = indexgroup;
2723
2724 self->weakreflist = NULL;
2725
Guido van Rossum8b762f02008-08-05 03:39:21 +00002726 if (!_validate(self)) {
2727 Py_DECREF(self);
2728 return NULL;
2729 }
2730
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002731 return (PyObject*) self;
2732}
2733
Guido van Rossumb700df92000-03-31 14:59:30 +00002734/* -------------------------------------------------------------------- */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002735/* Code validation */
2736
2737/* To learn more about this code, have a look at the _compile() function in
2738 Lib/sre_compile.py. The validation functions below checks the code array
2739 for conformance with the code patterns generated there.
2740
2741 The nice thing about the generated code is that it is position-independent:
2742 all jumps are relative jumps forward. Also, jumps don't cross each other:
2743 the target of a later jump is always earlier than the target of an earlier
2744 jump. IOW, this is okay:
2745
2746 J---------J-------T--------T
2747 \ \_____/ /
2748 \______________________/
2749
2750 but this is not:
2751
2752 J---------J-------T--------T
2753 \_________\_____/ /
2754 \____________/
2755
2756 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2757 bytes wide (the latter if Python is compiled for "wide" unicode support).
2758*/
2759
2760/* Defining this one enables tracing of the validator */
2761#undef VVERBOSE
2762
2763/* Trace macro for the validator */
2764#if defined(VVERBOSE)
2765#define VTRACE(v) printf v
2766#else
2767#define VTRACE(v)
2768#endif
2769
2770/* Report failure */
2771#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2772
2773/* Extract opcode, argument, or skip count from code array */
2774#define GET_OP \
2775 do { \
2776 VTRACE(("%p: ", code)); \
2777 if (code >= end) FAIL; \
2778 op = *code++; \
2779 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2780 } while (0)
2781#define GET_ARG \
2782 do { \
2783 VTRACE(("%p= ", code)); \
2784 if (code >= end) FAIL; \
2785 arg = *code++; \
2786 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2787 } while (0)
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002788#define GET_SKIP_ADJ(adj) \
Guido van Rossum8b762f02008-08-05 03:39:21 +00002789 do { \
2790 VTRACE(("%p= ", code)); \
2791 if (code >= end) FAIL; \
2792 skip = *code; \
2793 VTRACE(("%lu (skip to %p)\n", \
2794 (unsigned long)skip, code+skip)); \
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002795 if (code+skip-adj < code || code+skip-adj > end)\
Guido van Rossum8b762f02008-08-05 03:39:21 +00002796 FAIL; \
2797 code++; \
2798 } while (0)
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002799#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum8b762f02008-08-05 03:39:21 +00002800
2801static int
2802_validate_charset(SRE_CODE *code, SRE_CODE *end)
2803{
2804 /* Some variables are manipulated by the macros above */
2805 SRE_CODE op;
2806 SRE_CODE arg;
2807 SRE_CODE offset;
2808 int i;
2809
2810 while (code < end) {
2811 GET_OP;
2812 switch (op) {
2813
2814 case SRE_OP_NEGATE:
2815 break;
2816
2817 case SRE_OP_LITERAL:
2818 GET_ARG;
2819 break;
2820
2821 case SRE_OP_RANGE:
2822 GET_ARG;
2823 GET_ARG;
2824 break;
2825
2826 case SRE_OP_CHARSET:
2827 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2828 if (code+offset < code || code+offset > end)
2829 FAIL;
2830 code += offset;
2831 break;
2832
2833 case SRE_OP_BIGCHARSET:
2834 GET_ARG; /* Number of blocks */
2835 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2836 if (code+offset < code || code+offset > end)
2837 FAIL;
2838 /* Make sure that each byte points to a valid block */
2839 for (i = 0; i < 256; i++) {
2840 if (((unsigned char *)code)[i] >= arg)
2841 FAIL;
2842 }
2843 code += offset;
2844 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2845 if (code+offset < code || code+offset > end)
2846 FAIL;
2847 code += offset;
2848 break;
2849
2850 case SRE_OP_CATEGORY:
2851 GET_ARG;
2852 switch (arg) {
2853 case SRE_CATEGORY_DIGIT:
2854 case SRE_CATEGORY_NOT_DIGIT:
2855 case SRE_CATEGORY_SPACE:
2856 case SRE_CATEGORY_NOT_SPACE:
2857 case SRE_CATEGORY_WORD:
2858 case SRE_CATEGORY_NOT_WORD:
2859 case SRE_CATEGORY_LINEBREAK:
2860 case SRE_CATEGORY_NOT_LINEBREAK:
2861 case SRE_CATEGORY_LOC_WORD:
2862 case SRE_CATEGORY_LOC_NOT_WORD:
2863 case SRE_CATEGORY_UNI_DIGIT:
2864 case SRE_CATEGORY_UNI_NOT_DIGIT:
2865 case SRE_CATEGORY_UNI_SPACE:
2866 case SRE_CATEGORY_UNI_NOT_SPACE:
2867 case SRE_CATEGORY_UNI_WORD:
2868 case SRE_CATEGORY_UNI_NOT_WORD:
2869 case SRE_CATEGORY_UNI_LINEBREAK:
2870 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2871 break;
2872 default:
2873 FAIL;
2874 }
2875 break;
2876
2877 default:
2878 FAIL;
2879
2880 }
2881 }
2882
2883 return 1;
2884}
2885
2886static int
2887_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2888{
2889 /* Some variables are manipulated by the macros above */
2890 SRE_CODE op;
2891 SRE_CODE arg;
2892 SRE_CODE skip;
2893
2894 VTRACE(("code=%p, end=%p\n", code, end));
2895
2896 if (code > end)
2897 FAIL;
2898
2899 while (code < end) {
2900 GET_OP;
2901 switch (op) {
2902
2903 case SRE_OP_MARK:
2904 /* We don't check whether marks are properly nested; the
2905 sre_match() code is robust even if they don't, and the worst
2906 you can get is nonsensical match results. */
2907 GET_ARG;
2908 if (arg > 2*groups+1) {
2909 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2910 FAIL;
2911 }
2912 break;
2913
2914 case SRE_OP_LITERAL:
2915 case SRE_OP_NOT_LITERAL:
2916 case SRE_OP_LITERAL_IGNORE:
2917 case SRE_OP_NOT_LITERAL_IGNORE:
2918 GET_ARG;
2919 /* The arg is just a character, nothing to check */
2920 break;
2921
2922 case SRE_OP_SUCCESS:
2923 case SRE_OP_FAILURE:
2924 /* Nothing to check; these normally end the matching process */
2925 break;
2926
2927 case SRE_OP_AT:
2928 GET_ARG;
2929 switch (arg) {
2930 case SRE_AT_BEGINNING:
2931 case SRE_AT_BEGINNING_STRING:
2932 case SRE_AT_BEGINNING_LINE:
2933 case SRE_AT_END:
2934 case SRE_AT_END_LINE:
2935 case SRE_AT_END_STRING:
2936 case SRE_AT_BOUNDARY:
2937 case SRE_AT_NON_BOUNDARY:
2938 case SRE_AT_LOC_BOUNDARY:
2939 case SRE_AT_LOC_NON_BOUNDARY:
2940 case SRE_AT_UNI_BOUNDARY:
2941 case SRE_AT_UNI_NON_BOUNDARY:
2942 break;
2943 default:
2944 FAIL;
2945 }
2946 break;
2947
2948 case SRE_OP_ANY:
2949 case SRE_OP_ANY_ALL:
2950 /* These have no operands */
2951 break;
2952
2953 case SRE_OP_IN:
2954 case SRE_OP_IN_IGNORE:
2955 GET_SKIP;
2956 /* Stop 1 before the end; we check the FAILURE below */
2957 if (!_validate_charset(code, code+skip-2))
2958 FAIL;
2959 if (code[skip-2] != SRE_OP_FAILURE)
2960 FAIL;
2961 code += skip-1;
2962 break;
2963
2964 case SRE_OP_INFO:
2965 {
2966 /* A minimal info field is
2967 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2968 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2969 more follows. */
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002970 SRE_CODE flags, i;
Guido van Rossum8b762f02008-08-05 03:39:21 +00002971 SRE_CODE *newcode;
2972 GET_SKIP;
2973 newcode = code+skip-1;
2974 GET_ARG; flags = arg;
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002975 GET_ARG; /* min */
2976 GET_ARG; /* max */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002977 /* Check that only valid flags are present */
2978 if ((flags & ~(SRE_INFO_PREFIX |
2979 SRE_INFO_LITERAL |
2980 SRE_INFO_CHARSET)) != 0)
2981 FAIL;
2982 /* PREFIX and CHARSET are mutually exclusive */
2983 if ((flags & SRE_INFO_PREFIX) &&
2984 (flags & SRE_INFO_CHARSET))
2985 FAIL;
2986 /* LITERAL implies PREFIX */
2987 if ((flags & SRE_INFO_LITERAL) &&
2988 !(flags & SRE_INFO_PREFIX))
2989 FAIL;
2990 /* Validate the prefix */
2991 if (flags & SRE_INFO_PREFIX) {
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002992 SRE_CODE prefix_len;
Guido van Rossum8b762f02008-08-05 03:39:21 +00002993 GET_ARG; prefix_len = arg;
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002994 GET_ARG; /* prefix skip */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002995 /* Here comes the prefix string */
2996 if (code+prefix_len < code || code+prefix_len > newcode)
2997 FAIL;
2998 code += prefix_len;
2999 /* And here comes the overlap table */
3000 if (code+prefix_len < code || code+prefix_len > newcode)
3001 FAIL;
3002 /* Each overlap value should be < prefix_len */
3003 for (i = 0; i < prefix_len; i++) {
3004 if (code[i] >= prefix_len)
3005 FAIL;
3006 }
3007 code += prefix_len;
3008 }
3009 /* Validate the charset */
3010 if (flags & SRE_INFO_CHARSET) {
3011 if (!_validate_charset(code, newcode-1))
3012 FAIL;
3013 if (newcode[-1] != SRE_OP_FAILURE)
3014 FAIL;
3015 code = newcode;
3016 }
3017 else if (code != newcode) {
3018 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3019 FAIL;
3020 }
3021 }
3022 break;
3023
3024 case SRE_OP_BRANCH:
3025 {
3026 SRE_CODE *target = NULL;
3027 for (;;) {
3028 GET_SKIP;
3029 if (skip == 0)
3030 break;
3031 /* Stop 2 before the end; we check the JUMP below */
3032 if (!_validate_inner(code, code+skip-3, groups))
3033 FAIL;
3034 code += skip-3;
3035 /* Check that it ends with a JUMP, and that each JUMP
3036 has the same target */
3037 GET_OP;
3038 if (op != SRE_OP_JUMP)
3039 FAIL;
3040 GET_SKIP;
3041 if (target == NULL)
3042 target = code+skip-1;
3043 else if (code+skip-1 != target)
3044 FAIL;
3045 }
3046 }
3047 break;
3048
3049 case SRE_OP_REPEAT_ONE:
3050 case SRE_OP_MIN_REPEAT_ONE:
3051 {
3052 SRE_CODE min, max;
3053 GET_SKIP;
3054 GET_ARG; min = arg;
3055 GET_ARG; max = arg;
3056 if (min > max)
3057 FAIL;
3058#ifdef Py_UNICODE_WIDE
3059 if (max > 65535)
3060 FAIL;
3061#endif
3062 if (!_validate_inner(code, code+skip-4, groups))
3063 FAIL;
3064 code += skip-4;
3065 GET_OP;
3066 if (op != SRE_OP_SUCCESS)
3067 FAIL;
3068 }
3069 break;
3070
3071 case SRE_OP_REPEAT:
3072 {
3073 SRE_CODE min, max;
3074 GET_SKIP;
3075 GET_ARG; min = arg;
3076 GET_ARG; max = arg;
3077 if (min > max)
3078 FAIL;
3079#ifdef Py_UNICODE_WIDE
3080 if (max > 65535)
3081 FAIL;
3082#endif
3083 if (!_validate_inner(code, code+skip-3, groups))
3084 FAIL;
3085 code += skip-3;
3086 GET_OP;
3087 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3088 FAIL;
3089 }
3090 break;
3091
3092 case SRE_OP_GROUPREF:
3093 case SRE_OP_GROUPREF_IGNORE:
3094 GET_ARG;
3095 if (arg >= groups)
3096 FAIL;
3097 break;
3098
3099 case SRE_OP_GROUPREF_EXISTS:
3100 /* The regex syntax for this is: '(?(group)then|else)', where
3101 'group' is either an integer group number or a group name,
3102 'then' and 'else' are sub-regexes, and 'else' is optional. */
3103 GET_ARG;
3104 if (arg >= groups)
3105 FAIL;
Guido van Rossume3c4fd92008-09-10 14:27:00 +00003106 GET_SKIP_ADJ(1);
Guido van Rossum8b762f02008-08-05 03:39:21 +00003107 code--; /* The skip is relative to the first arg! */
3108 /* There are two possibilities here: if there is both a 'then'
3109 part and an 'else' part, the generated code looks like:
3110
3111 GROUPREF_EXISTS
3112 <group>
3113 <skipyes>
3114 ...then part...
3115 JUMP
3116 <skipno>
3117 (<skipyes> jumps here)
3118 ...else part...
3119 (<skipno> jumps here)
3120
3121 If there is only a 'then' part, it looks like:
3122
3123 GROUPREF_EXISTS
3124 <group>
3125 <skip>
3126 ...then part...
3127 (<skip> jumps here)
3128
3129 There is no direct way to decide which it is, and we don't want
3130 to allow arbitrary jumps anywhere in the code; so we just look
3131 for a JUMP opcode preceding our skip target.
3132 */
3133 if (skip >= 3 && code+skip-3 >= code &&
3134 code[skip-3] == SRE_OP_JUMP)
3135 {
3136 VTRACE(("both then and else parts present\n"));
3137 if (!_validate_inner(code+1, code+skip-3, groups))
3138 FAIL;
3139 code += skip-2; /* Position after JUMP, at <skipno> */
3140 GET_SKIP;
3141 if (!_validate_inner(code, code+skip-1, groups))
3142 FAIL;
3143 code += skip-1;
3144 }
3145 else {
3146 VTRACE(("only a then part present\n"));
3147 if (!_validate_inner(code+1, code+skip-1, groups))
3148 FAIL;
3149 code += skip-1;
3150 }
3151 break;
3152
3153 case SRE_OP_ASSERT:
3154 case SRE_OP_ASSERT_NOT:
3155 GET_SKIP;
3156 GET_ARG; /* 0 for lookahead, width for lookbehind */
3157 code--; /* Back up over arg to simplify math below */
3158 if (arg & 0x80000000)
3159 FAIL; /* Width too large */
3160 /* Stop 1 before the end; we check the SUCCESS below */
3161 if (!_validate_inner(code+1, code+skip-2, groups))
3162 FAIL;
3163 code += skip-2;
3164 GET_OP;
3165 if (op != SRE_OP_SUCCESS)
3166 FAIL;
3167 break;
3168
3169 default:
3170 FAIL;
3171
3172 }
3173 }
3174
3175 VTRACE(("okay\n"));
3176 return 1;
3177}
3178
3179static int
3180_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3181{
3182 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3183 FAIL;
3184 if (groups == 0) /* fix for simplejson */
3185 groups = 100; /* 100 groups should always be safe */
3186 return _validate_inner(code, end-1, groups);
3187}
3188
3189static int
3190_validate(PatternObject *self)
3191{
3192 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3193 {
3194 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3195 return 0;
3196 }
3197 else
3198 VTRACE(("Success!\n"));
3199 return 1;
3200}
3201
3202/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003203/* match methods */
3204
3205static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003206match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003207{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003208 Py_XDECREF(self->regs);
3209 Py_XDECREF(self->string);
3210 Py_DECREF(self->pattern);
3211 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003212}
3213
3214static PyObject*
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003215match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003216{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003217 if (index < 0 || index >= self->groups) {
3218 /* raise IndexError if we were given a bad group number */
3219 PyErr_SetString(
3220 PyExc_IndexError,
3221 "no such group"
3222 );
3223 return NULL;
3224 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003225
Fredrik Lundh6f013982000-07-03 18:44:21 +00003226 index *= 2;
3227
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003228 if (self->string == Py_None || self->mark[index] < 0) {
3229 /* return default value if the string or group is undefined */
3230 Py_INCREF(def);
3231 return def;
3232 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003233
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003234 return PySequence_GetSlice(
3235 self->string, self->mark[index], self->mark[index+1]
3236 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003237}
3238
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003239static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003240match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003241{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003242 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003243
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003244 if (PyInt_Check(index))
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003245 return PyInt_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003246
Fredrik Lundh6f013982000-07-03 18:44:21 +00003247 i = -1;
3248
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003249 if (self->pattern->groupindex) {
3250 index = PyObject_GetItem(self->pattern->groupindex, index);
3251 if (index) {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003252 if (PyInt_Check(index) || PyLong_Check(index))
3253 i = PyInt_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003254 Py_DECREF(index);
3255 } else
3256 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003257 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003258
3259 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003260}
3261
3262static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003263match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003264{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003265 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003266}
3267
3268static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00003269match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003270{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003271 /* delegate to Python code */
3272 return call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00003273 SRE_PY_MODULE, "_expand",
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003274 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003275 );
3276}
3277
3278static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003279match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003280{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003281 PyObject* result;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003282 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003283
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003284 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003285
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003286 switch (size) {
3287 case 0:
3288 result = match_getslice(self, Py_False, Py_None);
3289 break;
3290 case 1:
3291 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3292 break;
3293 default:
3294 /* fetch multiple items */
3295 result = PyTuple_New(size);
3296 if (!result)
3297 return NULL;
3298 for (i = 0; i < size; i++) {
3299 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003300 self, PyTuple_GET_ITEM(args, i), Py_None
3301 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003302 if (!item) {
3303 Py_DECREF(result);
3304 return NULL;
3305 }
3306 PyTuple_SET_ITEM(result, i, item);
3307 }
3308 break;
3309 }
3310 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003311}
3312
3313static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003314match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003315{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003316 PyObject* result;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003317 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003319 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003320 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003321 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003322 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003323
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003324 result = PyTuple_New(self->groups-1);
3325 if (!result)
3326 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003327
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003328 for (index = 1; index < self->groups; index++) {
3329 PyObject* item;
3330 item = match_getslice_by_index(self, index, def);
3331 if (!item) {
3332 Py_DECREF(result);
3333 return NULL;
3334 }
3335 PyTuple_SET_ITEM(result, index-1, item);
3336 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003338 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003339}
3340
3341static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003342match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003343{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003344 PyObject* result;
3345 PyObject* keys;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003346 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003348 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003349 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003350 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003351 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003353 result = PyDict_New();
3354 if (!result || !self->pattern->groupindex)
3355 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003356
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003357 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003358 if (!keys)
3359 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003360
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003361 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003362 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003363 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003364 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003365 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003366 if (!key)
3367 goto failed;
3368 value = match_getslice(self, key, def);
3369 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003370 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003371 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003372 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003373 status = PyDict_SetItem(result, key, value);
3374 Py_DECREF(value);
3375 if (status < 0)
3376 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003377 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003378
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003379 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003380
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003381 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003382
3383failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003384 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003385 Py_DECREF(result);
3386 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003387}
3388
3389static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003390match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003391{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003392 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003393
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003394 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003395 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003396 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003397
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003398 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003399
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003400 if (index < 0 || index >= self->groups) {
3401 PyErr_SetString(
3402 PyExc_IndexError,
3403 "no such group"
3404 );
3405 return NULL;
3406 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003407
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003408 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003409 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003410}
3411
3412static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003413match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003414{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003415 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003416
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003417 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003418 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003419 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003420
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003421 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003422
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003423 if (index < 0 || index >= self->groups) {
3424 PyErr_SetString(
3425 PyExc_IndexError,
3426 "no such group"
3427 );
3428 return NULL;
3429 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003430
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003431 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003432 return Py_BuildValue("i", self->mark[index*2+1]);
3433}
3434
3435LOCAL(PyObject*)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003436_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003437{
3438 PyObject* pair;
3439 PyObject* item;
3440
3441 pair = PyTuple_New(2);
3442 if (!pair)
3443 return NULL;
3444
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003445 item = PyInt_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003446 if (!item)
3447 goto error;
3448 PyTuple_SET_ITEM(pair, 0, item);
3449
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003450 item = PyInt_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003451 if (!item)
3452 goto error;
3453 PyTuple_SET_ITEM(pair, 1, item);
3454
3455 return pair;
3456
3457 error:
3458 Py_DECREF(pair);
3459 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003460}
3461
3462static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003463match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003464{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003465 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003466
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003467 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003468 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003469 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003470
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003471 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003472
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003473 if (index < 0 || index >= self->groups) {
3474 PyErr_SetString(
3475 PyExc_IndexError,
3476 "no such group"
3477 );
3478 return NULL;
3479 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003480
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003481 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003482 return _pair(self->mark[index*2], self->mark[index*2+1]);
3483}
3484
3485static PyObject*
3486match_regs(MatchObject* self)
3487{
3488 PyObject* regs;
3489 PyObject* item;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003490 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003491
3492 regs = PyTuple_New(self->groups);
3493 if (!regs)
3494 return NULL;
3495
3496 for (index = 0; index < self->groups; index++) {
3497 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3498 if (!item) {
3499 Py_DECREF(regs);
3500 return NULL;
3501 }
3502 PyTuple_SET_ITEM(regs, index, item);
3503 }
3504
3505 Py_INCREF(regs);
3506 self->regs = regs;
3507
3508 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003509}
3510
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003511static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003512match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003513{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003514#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003515 MatchObject* copy;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003516 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003517
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003518 slots = 2 * (self->pattern->groups+1);
3519
3520 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3521 if (!copy)
3522 return NULL;
3523
3524 /* this value a constant, but any compiler should be able to
3525 figure that out all by itself */
3526 offset = offsetof(MatchObject, string);
3527
3528 Py_XINCREF(self->pattern);
3529 Py_XINCREF(self->string);
3530 Py_XINCREF(self->regs);
3531
3532 memcpy((char*) copy + offset, (char*) self + offset,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003533 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003534
3535 return (PyObject*) copy;
3536#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003537 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003538 return NULL;
3539#endif
3540}
3541
3542static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00003543match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003544{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003545#ifdef USE_BUILTIN_COPY
3546 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003547
Georg Brandlfbef5882006-05-28 22:14:04 +00003548 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003549 if (!copy)
3550 return NULL;
3551
3552 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3553 !deepcopy(&copy->string, memo) ||
3554 !deepcopy(&copy->regs, memo)) {
3555 Py_DECREF(copy);
3556 return NULL;
3557 }
3558
3559#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003560 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3561 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003562#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003563}
3564
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003565static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003566 {"group", (PyCFunction) match_group, METH_VARARGS},
3567 {"start", (PyCFunction) match_start, METH_VARARGS},
3568 {"end", (PyCFunction) match_end, METH_VARARGS},
3569 {"span", (PyCFunction) match_span, METH_VARARGS},
3570 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3571 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
Georg Brandlfbef5882006-05-28 22:14:04 +00003572 {"expand", (PyCFunction) match_expand, METH_O},
3573 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3574 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003575 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003576};
3577
Tim Peters3d563502006-01-21 02:47:53 +00003578static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003579match_getattr(MatchObject* self, char* name)
Guido van Rossumb700df92000-03-31 14:59:30 +00003580{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003581 PyObject* res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003582
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003583 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3584 if (res)
3585 return res;
Guido van Rossumb700df92000-03-31 14:59:30 +00003586
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003587 PyErr_Clear();
Guido van Rossumb700df92000-03-31 14:59:30 +00003588
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003589 if (!strcmp(name, "lastindex")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003590 if (self->lastindex >= 0)
3591 return Py_BuildValue("i", self->lastindex);
Fredrik Lundhc2301732000-07-02 22:25:39 +00003592 Py_INCREF(Py_None);
3593 return Py_None;
3594 }
3595
3596 if (!strcmp(name, "lastgroup")) {
Fredrik Lundh6f013982000-07-03 18:44:21 +00003597 if (self->pattern->indexgroup && self->lastindex >= 0) {
Fredrik Lundhc2301732000-07-02 22:25:39 +00003598 PyObject* result = PySequence_GetItem(
Fredrik Lundh6f013982000-07-03 18:44:21 +00003599 self->pattern->indexgroup, self->lastindex
Fredrik Lundhc2301732000-07-02 22:25:39 +00003600 );
3601 if (result)
3602 return result;
3603 PyErr_Clear();
3604 }
3605 Py_INCREF(Py_None);
3606 return Py_None;
3607 }
3608
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003609 if (!strcmp(name, "string")) {
3610 if (self->string) {
3611 Py_INCREF(self->string);
3612 return self->string;
3613 } else {
3614 Py_INCREF(Py_None);
3615 return Py_None;
3616 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003617 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003618
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003619 if (!strcmp(name, "regs")) {
3620 if (self->regs) {
3621 Py_INCREF(self->regs);
3622 return self->regs;
3623 } else
3624 return match_regs(self);
3625 }
3626
3627 if (!strcmp(name, "re")) {
Guido van Rossumb700df92000-03-31 14:59:30 +00003628 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003629 return (PyObject*) self->pattern;
Guido van Rossumb700df92000-03-31 14:59:30 +00003630 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003631
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003632 if (!strcmp(name, "pos"))
3633 return Py_BuildValue("i", self->pos);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003634
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003635 if (!strcmp(name, "endpos"))
3636 return Py_BuildValue("i", self->endpos);
Guido van Rossumb700df92000-03-31 14:59:30 +00003637
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003638 PyErr_SetString(PyExc_AttributeError, name);
3639 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003640}
3641
3642/* FIXME: implement setattr("string", None) as a special case (to
3643 detach the associated string, if any */
3644
3645statichere PyTypeObject Match_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003646 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003647 0, "_" SRE_MODULE ".SRE_Match",
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003648 sizeof(MatchObject), sizeof(Py_ssize_t),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003649 (destructor)match_dealloc, /*tp_dealloc*/
3650 0, /*tp_print*/
3651 (getattrfunc)match_getattr /*tp_getattr*/
Guido van Rossumb700df92000-03-31 14:59:30 +00003652};
3653
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003654static PyObject*
3655pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3656{
3657 /* create match object (from state object) */
3658
3659 MatchObject* match;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003660 Py_ssize_t i, j;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003661 char* base;
3662 int n;
3663
3664 if (status > 0) {
3665
3666 /* create match object (with room for extra group marks) */
Christian Heimes4956d2b2008-01-18 19:12:56 +00003667 /* coverity[ampersand_in_size] */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003668 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3669 2*(pattern->groups+1));
3670 if (!match)
3671 return NULL;
3672
3673 Py_INCREF(pattern);
3674 match->pattern = pattern;
3675
3676 Py_INCREF(state->string);
3677 match->string = state->string;
3678
3679 match->regs = NULL;
3680 match->groups = pattern->groups+1;
3681
3682 /* fill in group slices */
3683
3684 base = (char*) state->beginning;
3685 n = state->charsize;
3686
3687 match->mark[0] = ((char*) state->start - base) / n;
3688 match->mark[1] = ((char*) state->ptr - base) / n;
3689
3690 for (i = j = 0; i < pattern->groups; i++, j+=2)
3691 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3692 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3693 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3694 } else
3695 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3696
3697 match->pos = state->pos;
3698 match->endpos = state->endpos;
3699
3700 match->lastindex = state->lastindex;
3701
3702 return (PyObject*) match;
3703
3704 } else if (status == 0) {
3705
3706 /* no match */
3707 Py_INCREF(Py_None);
3708 return Py_None;
3709
3710 }
3711
3712 /* internal error */
3713 pattern_error(status);
3714 return NULL;
3715}
3716
3717
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003718/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003719/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003720
3721static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003722scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003723{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003724 state_fini(&self->state);
Antoine Pitrouefdddd32010-01-14 17:25:24 +00003725 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003726 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003727}
3728
3729static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003730scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003731{
3732 SRE_STATE* state = &self->state;
3733 PyObject* match;
3734 int status;
3735
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003736 state_reset(state);
3737
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003738 state->ptr = state->start;
3739
3740 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003741 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003742 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003743#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003744 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003745#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003746 }
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00003747 if (PyErr_Occurred())
3748 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003749
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003750 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003751 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003752
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003753 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003754 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003755 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003756 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003757
3758 return match;
3759}
3760
3761
3762static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003763scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003764{
3765 SRE_STATE* state = &self->state;
3766 PyObject* match;
3767 int status;
3768
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003769 state_reset(state);
3770
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003771 state->ptr = state->start;
3772
3773 if (state->charsize == 1) {
3774 status = sre_search(state, PatternObject_GetCode(self->pattern));
3775 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003776#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003777 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003778#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003779 }
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00003780 if (PyErr_Occurred())
3781 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003782
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003783 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003784 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003785
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003786 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003787 state->start = (void*) ((char*) state->ptr + state->charsize);
3788 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003789 state->start = state->ptr;
3790
3791 return match;
3792}
3793
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003794static PyMethodDef scanner_methods[] = {
Georg Brandlfbef5882006-05-28 22:14:04 +00003795 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3796 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003797 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003798};
3799
Tim Peters3d563502006-01-21 02:47:53 +00003800static PyObject*
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003801scanner_getattr(ScannerObject* self, char* name)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003802{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003803 PyObject* res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003804
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003805 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3806 if (res)
3807 return res;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003808
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003809 PyErr_Clear();
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003810
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003811 /* attributes */
3812 if (!strcmp(name, "pattern")) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003813 Py_INCREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003814 return self->pattern;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003815 }
3816
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003817 PyErr_SetString(PyExc_AttributeError, name);
3818 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003819}
3820
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003821statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003822 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003823 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003824 sizeof(ScannerObject), 0,
3825 (destructor)scanner_dealloc, /*tp_dealloc*/
3826 0, /*tp_print*/
3827 (getattrfunc)scanner_getattr, /*tp_getattr*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003828};
3829
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003830static PyObject*
3831pattern_scanner(PatternObject* pattern, PyObject* args)
3832{
3833 /* create search state object */
3834
3835 ScannerObject* self;
3836
3837 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003838 Py_ssize_t start = 0;
3839 Py_ssize_t end = PY_SSIZE_T_MAX;
3840 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003841 return NULL;
3842
3843 /* create scanner object */
3844 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3845 if (!self)
3846 return NULL;
Antoine Pitrouefdddd32010-01-14 17:25:24 +00003847 self->pattern = NULL;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003848
3849 string = state_init(&self->state, pattern, string, start, end);
3850 if (!string) {
Antoine Pitrouefdddd32010-01-14 17:25:24 +00003851 Py_DECREF(self);
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003852 return NULL;
3853 }
3854
3855 Py_INCREF(pattern);
3856 self->pattern = (PyObject*) pattern;
3857
3858 return (PyObject*) self;
3859}
3860
Guido van Rossumb700df92000-03-31 14:59:30 +00003861static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003862 {"compile", _compile, METH_VARARGS},
Georg Brandlfbef5882006-05-28 22:14:04 +00003863 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003864 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003865 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003866};
3867
Tim Peters3d563502006-01-21 02:47:53 +00003868#if PY_VERSION_HEX < 0x02030000
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003869DL_EXPORT(void) init_sre(void)
3870#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003871PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003872#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003873{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003874 PyObject* m;
3875 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003876 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003877
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003878 /* Patch object types */
Benjamin Petersone266d3e2010-04-06 03:34:09 +00003879 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3880 PyType_Ready(&Scanner_Type))
3881 return;
Guido van Rossumb700df92000-03-31 14:59:30 +00003882
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003883 m = Py_InitModule("_" SRE_MODULE, _functions);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003884 if (m == NULL)
3885 return;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003886 d = PyModule_GetDict(m);
3887
Fredrik Lundh21009b92001-09-18 18:47:09 +00003888 x = PyInt_FromLong(SRE_MAGIC);
3889 if (x) {
3890 PyDict_SetItemString(d, "MAGIC", x);
3891 Py_DECREF(x);
3892 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003893
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003894 x = PyInt_FromLong(sizeof(SRE_CODE));
3895 if (x) {
3896 PyDict_SetItemString(d, "CODESIZE", x);
3897 Py_DECREF(x);
3898 }
3899
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003900 x = PyString_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003901 if (x) {
3902 PyDict_SetItemString(d, "copyright", x);
3903 Py_DECREF(x);
3904 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003905}
3906
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003907#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003908
3909/* vim:ts=4:sw=4:et
3910*/