blob: 2b00121a2553a41d0b28807c560b76d1d01b2f68 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Georg Brandldaa1fa92013-10-13 09:32:59 +02007 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000070/* enables copy/deepcopy handling (work in progress) */
71#undef USE_BUILTIN_COPY
72
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000073/* -------------------------------------------------------------------- */
74
/* LOCAL(type) introduces a file-local function returning `type`; the
   exact storage/inline/calling-convention keywords depend on the
   compiler and on USE_INLINE */
#ifdef _MSC_VER
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif

/* error codes (all negative) */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */

/* TRACE((fmt, ...)) prints via printf when VERBOSE is defined and
   expands to nothing otherwise; note the doubled parentheses */
#ifdef VERBOSE
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
98
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000099/* -------------------------------------------------------------------- */
100/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000101
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000102/* default character predicates (run sre_chars.py to regenerate tables) */
103
/* flag bits stored in sre_char_info[] */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII. create tables in init_sre() instead */

/* Per-character flag bits (SRE_*_MASK) for character codes 0..127.
   The table is only ever read, so it is declared const. */
static const char sre_char_info[128] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};

/* Lower-case mapping for character codes 0..127: 65..90 ('A'..'Z')
   map to 97..122 ('a'..'z'), everything else maps to itself.
   Read-only, hence const. */
static const char sre_char_lower[128] = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95,
    96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127
};

/* Each predicate yields the (nonzero) mask bit when `ch` is below 128
   and has the property, and 0 otherwise.  `ch` is evaluated more than
   once, so it must not have side effects. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

/* Return the lower-case form of `ch` according to sre_char_lower[];
   codes of 128 and above are returned unchanged. */
static unsigned int sre_lower(unsigned int ch)
{
    return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
}
145
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* Lower-case `ch` with the C library's tolower(); codes of 256 and
   above are passed through untouched. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int)tolower((ch));
}
159
/* unicode-specific character predicates */

/* thin aliases for the Py_UNICODE_* classification macros */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
/* a word character is anything alphanumeric, or an underscore */
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')

/* Lower-case `ch` by delegating to Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered;

    lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
172
Guido van Rossumb700df92000-03-31 14:59:30 +0000173LOCAL(int)
174sre_category(SRE_CODE category, unsigned int ch)
175{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000176 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000177
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000178 case SRE_CATEGORY_DIGIT:
179 return SRE_IS_DIGIT(ch);
180 case SRE_CATEGORY_NOT_DIGIT:
181 return !SRE_IS_DIGIT(ch);
182 case SRE_CATEGORY_SPACE:
183 return SRE_IS_SPACE(ch);
184 case SRE_CATEGORY_NOT_SPACE:
185 return !SRE_IS_SPACE(ch);
186 case SRE_CATEGORY_WORD:
187 return SRE_IS_WORD(ch);
188 case SRE_CATEGORY_NOT_WORD:
189 return !SRE_IS_WORD(ch);
190 case SRE_CATEGORY_LINEBREAK:
191 return SRE_IS_LINEBREAK(ch);
192 case SRE_CATEGORY_NOT_LINEBREAK:
193 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000195 case SRE_CATEGORY_LOC_WORD:
196 return SRE_LOC_IS_WORD(ch);
197 case SRE_CATEGORY_LOC_NOT_WORD:
198 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000199
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000200 case SRE_CATEGORY_UNI_DIGIT:
201 return SRE_UNI_IS_DIGIT(ch);
202 case SRE_CATEGORY_UNI_NOT_DIGIT:
203 return !SRE_UNI_IS_DIGIT(ch);
204 case SRE_CATEGORY_UNI_SPACE:
205 return SRE_UNI_IS_SPACE(ch);
206 case SRE_CATEGORY_UNI_NOT_SPACE:
207 return !SRE_UNI_IS_SPACE(ch);
208 case SRE_CATEGORY_UNI_WORD:
209 return SRE_UNI_IS_WORD(ch);
210 case SRE_CATEGORY_UNI_NOT_WORD:
211 return !SRE_UNI_IS_WORD(ch);
212 case SRE_CATEGORY_UNI_LINEBREAK:
213 return SRE_UNI_IS_LINEBREAK(ch);
214 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
215 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000216 }
217 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000218}
219
220/* helpers */
221
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000222static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000223data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000224{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000225 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000226 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000227 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000228 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000229 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000230}
231
232static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000233data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000234{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000235 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000236 minsize = state->data_stack_base+size;
237 cursize = state->data_stack_size;
238 if (cursize < minsize) {
239 void* stack;
240 cursize = minsize+minsize/4+1024;
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300241 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000243 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000244 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000245 return SRE_ERROR_MEMORY;
246 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000247 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000248 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000249 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000250 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000251}
252
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000253/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000254
255#define SRE_CHAR unsigned char
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200256#define SRE_CHARGET(state, buf, index) ((unsigned char*)buf)[index]
Guido van Rossumb700df92000-03-31 14:59:30 +0000257#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000258#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000259#define SRE_CHARSET sre_charset
260#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000261#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000262#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000263#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000264
Guido van Rossumb700df92000-03-31 14:59:30 +0000265#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000266#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000267#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000268
Guido van Rossumb700df92000-03-31 14:59:30 +0000269#undef SRE_SEARCH
270#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000271#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000272#undef SRE_INFO
273#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000274#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000275#undef SRE_AT
276#undef SRE_CHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200277#undef SRE_CHARGET
Guido van Rossumb700df92000-03-31 14:59:30 +0000278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279/* generate 8/16/32-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281#define SRE_CHAR void
282#define SRE_CHARGET(state, buf, index) \
283 ((state->charsize==1) ? ((Py_UCS1*)buf)[index] : \
284 (state->charsize==2) ? ((Py_UCS2*)buf)[index] : \
285 ((Py_UCS4*)buf)[index])
Guido van Rossumb700df92000-03-31 14:59:30 +0000286#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000287#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000288#define SRE_CHARSET sre_ucharset
289#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000290#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000291#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000292#define SRE_SEARCH sre_usearch
293
294#endif /* SRE_RECURSIVE */
295
296/* -------------------------------------------------------------------- */
297/* String matching engine */
298
299/* the following section is compiled twice, with different character
300 settings */
301
302LOCAL(int)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200303SRE_AT(SRE_STATE* state, char* ptr, SRE_CODE at)
Guido van Rossumb700df92000-03-31 14:59:30 +0000304{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000305 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000306
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000307 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000308
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000309 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000310
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000311 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000312 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000313 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000315 case SRE_AT_BEGINNING_LINE:
316 return ((void*) ptr == state->beginning ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200317 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, -1)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000319 case SRE_AT_END:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 return (((void*) (ptr+state->charsize) == state->end &&
321 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))) ||
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000322 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000323
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000324 case SRE_AT_END_LINE:
325 return ((void*) ptr == state->end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200326 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000327
Fredrik Lundh770617b2001-01-14 15:06:11 +0000328 case SRE_AT_END_STRING:
329 return ((void*) ptr == state->end);
330
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000331 case SRE_AT_BOUNDARY:
332 if (state->beginning == state->end)
333 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000334 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200335 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000336 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200337 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000338 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 case SRE_AT_NON_BOUNDARY:
341 if (state->beginning == state->end)
342 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000343 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200344 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000345 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000347 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000348
349 case SRE_AT_LOC_BOUNDARY:
350 if (state->beginning == state->end)
351 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000352 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200353 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000354 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200355 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000357
358 case SRE_AT_LOC_NON_BOUNDARY:
359 if (state->beginning == state->end)
360 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000363 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200364 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000366
367 case SRE_AT_UNI_BOUNDARY:
368 if (state->beginning == state->end)
369 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000370 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200371 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200373 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000375
376 case SRE_AT_UNI_NON_BOUNDARY:
377 if (state->beginning == state->end)
378 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200382 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000384
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000385 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000386
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000387 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000388}
389
390LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000391SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000392{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000393 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000394
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000395 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000396
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000397 for (;;) {
398 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000399
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000400 case SRE_OP_FAILURE:
401 return !ok;
402
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000403 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000404 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000405 if (ch == set[0])
406 return ok;
407 set++;
408 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000409
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000410 case SRE_OP_CATEGORY:
411 /* <CATEGORY> <code> */
412 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000413 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000414 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000415 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000416
Fredrik Lundh3562f112000-07-02 12:00:07 +0000417 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000418 if (sizeof(SRE_CODE) == 2) {
419 /* <CHARSET> <bitmap> (16 bits per code word) */
420 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
421 return ok;
422 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000423 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000424 else {
425 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith90555d02012-12-10 17:44:44 -0800426 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000427 return ok;
428 set += 8;
429 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000430 break;
431
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000432 case SRE_OP_RANGE:
433 /* <RANGE> <lower> <upper> */
434 if (set[0] <= ch && ch <= set[1])
435 return ok;
436 set += 2;
437 break;
438
439 case SRE_OP_NEGATE:
440 ok = !ok;
441 break;
442
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000443 case SRE_OP_BIGCHARSET:
444 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
445 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000446 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000447 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000448
449 if (sizeof(SRE_CODE) == 2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000451 set += 128;
452 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
453 return ok;
454 set += count*16;
455 }
456 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000457 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
458 * warnings when c's type supports only numbers < N+1 */
459 if (!(ch & ~65535))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200460 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000461 else
462 block = -1;
463 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000464 if (block >=0 &&
Gregory P. Smith90555d02012-12-10 17:44:44 -0800465 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000466 return ok;
467 set += count*8;
468 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000469 break;
470 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000471
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000472 default:
473 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000474 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000475 return 0;
476 }
477 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000478}
479
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000480LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000481
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000482LOCAL(Py_ssize_t)
483SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000484{
485 SRE_CODE chr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200486 char* ptr = (char *)state->ptr;
487 char* end = (char *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000488 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000489
490 /* adjust end */
Serhiy Storchakaa0eb8092013-02-16 16:54:33 +0200491 if (maxcount < (end - ptr) / state->charsize && maxcount != SRE_MAXREPEAT)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200492 end = ptr + maxcount*state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000493
494 switch (pattern[0]) {
495
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000496 case SRE_OP_IN:
497 /* repeated set */
498 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
Victor Stinner63ab8752011-11-22 03:31:20 +0100499 while (ptr < end &&
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200500 SRE_CHARSET(pattern + 2, SRE_CHARGET(state, ptr, 0)))
501 ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000502 break;
503
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000504 case SRE_OP_ANY:
505 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000506 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200507 while (ptr < end && !SRE_IS_LINEBREAK(SRE_CHARGET(state, ptr, 0)))
508 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000509 break;
510
511 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000512 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000514 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000515 ptr = end;
516 break;
517
518 case SRE_OP_LITERAL:
519 /* repeated literal */
520 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000521 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200522 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) == chr)
523 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000524 break;
525
526 case SRE_OP_LITERAL_IGNORE:
527 /* repeated literal */
528 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000529 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) == chr)
531 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000532 break;
533
534 case SRE_OP_NOT_LITERAL:
535 /* repeated non-literal */
536 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000537 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
539 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000540 break;
Tim Peters3d563502006-01-21 02:47:53 +0000541
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000542 case SRE_OP_NOT_LITERAL_IGNORE:
543 /* repeated non-literal */
544 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000545 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) != chr)
547 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000548 break;
549
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000550 default:
551 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000552 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553 while ((char*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000554 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000555 if (i < 0)
556 return i;
557 if (!i)
558 break;
559 }
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300560 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561 ((char*)state->ptr - ptr)/state->charsize));
562 return ((char*)state->ptr - ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000563 }
564
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300565 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
566 (ptr - (char*) state->ptr)/state->charsize));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200567 return (ptr - (char*) state->ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000568}
569
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    char* end = state->end;
    char* ptr = state->ptr;
    Py_ssize_t i;

    /* check minimal length */
    if (pattern[3] && (end - ptr)/state->charsize < pattern[3])
        return 0;

    /* no usable prefix: only the INFO block itself is skipped */
    if (!(pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1))
        return pattern[0];

    /* check known prefix */
    /* <length> <skip> <prefix data> <overlap data> */
    i = 0;
    while (i < pattern[5]) {
        if ((SRE_CODE) SRE_CHARGET(state, ptr, i) != pattern[7 + i])
            return 0;
        i++;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000597
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000598/* The macros below should be used to protect recursive SRE_MATCH()
599 * calls that *failed* and do *not* return immediately (IOW, those
600 * that will backtrack). Explaining:
601 *
602 * - Recursive SRE_MATCH() returned true: that's usually a success
603 * (besides atypical cases like ASSERT_NOT), therefore there's no
604 * reason to restore lastmark;
605 *
606 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
607 * is returning to the caller: If the current SRE_MATCH() is the
608 * top function of the recursion, returning false will be a matching
609 * failure, and it doesn't matter where lastmark is pointing to.
610 * If it's *not* the top function, it will be a recursive SRE_MATCH()
611 * failure by itself, and the calling SRE_MATCH() will have to deal
612 * with the failure by the same rules explained here (it will restore
613 * lastmark by itself if necessary);
614 *
615 * - Recursive SRE_MATCH() returned false, and will continue the
616 * outside 'for' loop: must be protected when breaking, since the next
617 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000618 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000619 * - Recursive SRE_MATCH() returned false, and will be called again
620 * inside a local for/while loop: must be protected between each
621 * loop iteration, since the recursive SRE_MATCH() could do anything,
622 * and could potentially depend on lastmark.
623 *
624 * For more information, check the discussion at SF patch #712900.
625 */
/* LASTMARK_SAVE() copies state->lastmark and state->lastindex into the
   current match context; LASTMARK_RESTORE() copies them back.  Both
   expect `state` and `ctx` to be in scope at the point of use. */
#define LASTMARK_SAVE()                     \
    do {                                    \
        ctx->lastindex = state->lastindex;  \
        ctx->lastmark = state->lastmark;    \
    } while (0)
#define LASTMARK_RESTORE()                  \
    do {                                    \
        state->lastindex = ctx->lastindex;  \
        state->lastmark = ctx->lastmark;    \
    } while (0)
636
/* Exit helpers for the matcher.
 *
 * RETURN_ERROR(i) returns `i` from the enclosing function right away.
 * RETURN_FAILURE / RETURN_SUCCESS store 0 / 1 in the local `ret` and
 * jump to the local label `exit`; both must exist in the enclosing
 * function.
 *
 * The RETURN_ON_* forms test a status value: negative means error,
 * zero means failure, positive means success.  The argument is
 * parenthesized on every use so that any expression (e.g. `x & 1` or
 * `a ? b : c`) is tested as a whole.  Note that it may be evaluated
 * more than once, so it should be free of side effects.
 */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
647
/* turn a macro argument into a string literal (used in TRACE output) */
#define SFY(x) #x

/* Data stack helpers.
 *
 * These macros operate on state->data_stack, a byte buffer of
 * state->data_stack_size bytes whose first state->data_stack_base bytes
 * are in use.  DATA_STACK_ALLOC and DATA_STACK_PUSH additionally rely on
 * the locals `alloc_pos`, `ctx_pos` and `ctx` of the enclosing function,
 * and return from that function with the (negative) result of
 * data_stack_grow() if growing fails.  After a successful grow, `ctx` is
 * looked up again from `ctx_pos` (presumably because the buffer may have
 * moved).
 *
 * All value arguments are parenthesized in the expansions so that
 * compound expressions (e.g. a size of `a + b`) are handled correctly.
 * Arguments may be evaluated more than once and should not have side
 * effects.
 */

/* reserve sizeof(type) bytes on top of the stack and make `ptr` point at
   them; `alloc_pos` receives the offset of the reserved area */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           SFY(type), alloc_pos, sizeof(type))); \
    if (sizeof(type) > (state)->data_stack_size - alloc_pos) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    (ptr) = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)

/* make `ptr` point at the object of the given type stored at byte
   offset `pos` of the stack */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
    (ptr) = (type*)((state)->data_stack+(pos)); \
} while (0)

/* copy `size` bytes from `data` onto the top of the stack, growing the
   stack first if there is not enough room */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           (data), (state)->data_stack_base, (size))); \
    if ((size) > (state)->data_stack_size - (state)->data_stack_base) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)

/* copy the topmost `size` bytes of the stack into `data`; the bytes are
   removed from the stack only when `discard` is true */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           (data), (state)->data_stack_base-(size), (size))); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), \
           (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* drop the topmost `size` bytes of the stack without copying them */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           (state)->data_stack_base-(size), (size))); \
    (state)->data_stack_base -= (size); \
} while(0)

/* shorthands that use the local `state` and derive the size from the
   pointed-to object */
#define DATA_PUSH(x) \
    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
    DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
    DATA_STACK_LOOKUP_AT(state,t,p,pos)
715
/* Save / restore the group marks on the data stack.
 *
 * Each macro is a no-op unless `lastmark` is greater than zero;
 * otherwise it transfers (lastmark+1) pointer-sized slots between
 * state->mark and the top of the data stack:
 *
 *   MARK_PUSH          copy state->mark onto the stack
 *   MARK_POP           copy back into state->mark and drop the copy
 *   MARK_POP_KEEP      copy back into state->mark, leave the copy
 *   MARK_POP_DISCARD   drop the copy without touching state->mark
 *
 * They use the locals `state` and (MARK_PUSH only) `i` of the enclosing
 * function.  `lastmark` is parenthesized on every use and may be
 * evaluated more than once.
 */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
733
/* Identifiers stored in SRE_MATCH_CONTEXT.jump; each one names the
   DO_JUMP call site that created the context (JUMP_NONE marks the
   initial context, which has no call site). */
#define JUMP_NONE             0
#define JUMP_MAX_UNTIL_1      1
#define JUMP_MAX_UNTIL_2      2
#define JUMP_MAX_UNTIL_3      3
#define JUMP_MIN_UNTIL_1      4
#define JUMP_MIN_UNTIL_2      5
#define JUMP_MIN_UNTIL_3      6
#define JUMP_REPEAT           7
#define JUMP_REPEAT_ONE_1     8
#define JUMP_REPEAT_ONE_2     9
#define JUMP_MIN_REPEAT_ONE   10
#define JUMP_BRANCH           11
#define JUMP_ASSERT           12
#define JUMP_ASSERT_NOT       13
748
/* Start matching `nextpattern` in a fresh context without recursing in C.
 *
 * A new SRE_MATCH_CONTEXT is allocated on the data stack, tagged with
 * `jumpvalue`, linked back to the current context through last_ctx_pos,
 * and made current; control then goes to the `entrance` label.
 * `jumplabel` is defined right after the goto, at the point where this
 * macro was used.
 *
 * Order matters here: `nextpattern` is evaluated only after DATA_ALLOC
 * (which may re-derive `ctx`), and last_ctx_pos is taken from ctx_pos
 * before ctx_pos is overwritten.  Not wrapped in do { } while (0)
 * because it has to define a label in the enclosing scope; the trailing
 * `while (0)` absorbs the caller's semicolon.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->pattern = nextpattern; \
    nextctx->jump = jumpvalue; \
    nextctx->last_ctx_pos = ctx_pos; \
    ctx = nextctx; \
    ctx_pos = alloc_pos; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
759
760typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000761 Py_ssize_t last_ctx_pos;
762 Py_ssize_t jump;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 char* ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000764 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000765 Py_ssize_t count;
766 Py_ssize_t lastmark;
767 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000768 union {
769 SRE_CODE chr;
770 SRE_REPEAT* rep;
771 } u;
772} SRE_MATCH_CONTEXT;
773
774/* check if string matches the given pattern. returns <0 for
775 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000776LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000777SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200779 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000780 Py_ssize_t alloc_pos, ctx_pos = -1;
781 Py_ssize_t i, ret = 0;
782 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000783 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000784
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000785 SRE_MATCH_CONTEXT* ctx;
786 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000787
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000788 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000789
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000790 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
791 ctx->last_ctx_pos = -1;
792 ctx->jump = JUMP_NONE;
793 ctx->pattern = pattern;
794 ctx_pos = alloc_pos;
795
796entrance:
797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200798 ctx->ptr = (char *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000799
800 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000801 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000802 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Victor Stinner1fa174a2013-08-28 02:06:21 +0200803 if (ctx->pattern[3] && (Py_uintptr_t)(end - ctx->ptr)/state->charsize < ctx->pattern[3]) {
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300804 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
805 "need %" PY_FORMAT_SIZE_T "d)\n",
806 (end - ctx->ptr)/state->charsize,
807 (Py_ssize_t) ctx->pattern[3]));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000808 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000809 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000810 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000811 }
812
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000813 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000814 ++sigcount;
815 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
816 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000817
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000818 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000819
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000820 case SRE_OP_MARK:
821 /* set mark */
822 /* <MARK> <gid> */
823 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
824 ctx->ptr, ctx->pattern[0]));
825 i = ctx->pattern[0];
826 if (i & 1)
827 state->lastindex = i/2 + 1;
828 if (i > state->lastmark) {
829 /* state->lastmark is the highest valid index in the
830 state->mark array. If it is increased by more than 1,
831 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000832 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000833 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000834 while (j < i)
835 state->mark[j++] = NULL;
836 state->lastmark = i;
837 }
838 state->mark[i] = ctx->ptr;
839 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000840 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000841
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000842 case SRE_OP_LITERAL:
843 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000844 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000845 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
846 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000848 RETURN_FAILURE;
849 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000851 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000852
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000853 case SRE_OP_NOT_LITERAL:
854 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000855 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000856 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
857 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) == ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000859 RETURN_FAILURE;
860 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000862 break;
863
864 case SRE_OP_SUCCESS:
865 /* end of pattern */
866 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
867 state->ptr = ctx->ptr;
868 RETURN_SUCCESS;
869
870 case SRE_OP_AT:
871 /* match at given position */
872 /* <AT> <code> */
873 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
874 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
875 RETURN_FAILURE;
876 ctx->pattern++;
877 break;
878
879 case SRE_OP_CATEGORY:
880 /* match at given category */
881 /* <CATEGORY> <code> */
882 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
883 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], SRE_CHARGET(state, ctx->ptr, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000885 RETURN_FAILURE;
886 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000888 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000889
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000890 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000891 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000892 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000893 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (ctx->ptr >= end || SRE_IS_LINEBREAK(SRE_CHARGET(state, ctx->ptr, 0)))
895 RETURN_FAILURE;
896 ctx->ptr += state->charsize;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000897 break;
898
899 case SRE_OP_ANY_ALL:
900 /* match anything */
901 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000902 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
903 if (ctx->ptr >= end)
904 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000906 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000907
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000908 case SRE_OP_IN:
909 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000910 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000911 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, SRE_CHARGET(state, ctx->ptr, 0)))
913 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000914 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000916 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000918 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000919 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
920 ctx->pattern, ctx->ptr, ctx->pattern[0]));
921 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000923 RETURN_FAILURE;
924 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200925 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000926 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000927
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000928 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000929 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
930 ctx->pattern, ctx->ptr, *ctx->pattern));
931 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200932 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) == state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000933 RETURN_FAILURE;
934 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000936 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000937
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000938 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000939 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
940 if (ctx->ptr >= end
941 || !SRE_CHARSET(ctx->pattern+1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200942 (SRE_CODE)state->lower(SRE_CHARGET(state, ctx->ptr, 0))))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000943 RETURN_FAILURE;
944 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000946 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000947
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000948 case SRE_OP_JUMP:
949 case SRE_OP_INFO:
950 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000951 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000952 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
953 ctx->ptr, ctx->pattern[0]));
954 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000955 break;
956
957 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000958 /* alternation */
959 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000960 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000961 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000962 ctx->u.rep = state->repeat;
963 if (ctx->u.rep)
964 MARK_PUSH(ctx->lastmark);
965 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
966 if (ctx->pattern[1] == SRE_OP_LITERAL &&
967 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000969 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000970 if (ctx->pattern[1] == SRE_OP_IN &&
971 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0))))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000973 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000974 state->ptr = ctx->ptr;
Georg Brandldaa1fa92013-10-13 09:32:59 +0200975 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000976 if (ret) {
977 if (ctx->u.rep)
978 MARK_POP_DISCARD(ctx->lastmark);
979 RETURN_ON_ERROR(ret);
980 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000981 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000982 if (ctx->u.rep)
983 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000984 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000985 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000986 if (ctx->u.rep)
987 MARK_POP_DISCARD(ctx->lastmark);
988 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +0000989
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000990 case SRE_OP_REPEAT_ONE:
991 /* match repeated sequence (maximizing regexp) */
992
993 /* this operator only works if the repeated item is
994 exactly one character wide, and we're not already
995 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +0000996 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000997
998 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
999
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001000 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1001 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001002
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001003 if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001004 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001005
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001006 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001007
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001008 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1009 RETURN_ON_ERROR(ret);
1010 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1011 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001012 ctx->ptr += state->charsize * ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001013
1014 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001015 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001016 string. check if the rest of the pattern matches,
1017 and backtrack if not. */
1018
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001019 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001020 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001021
Georg Brandldaa1fa92013-10-13 09:32:59 +02001022 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001023 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 state->ptr = ctx->ptr;
1025 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001026 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001027
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001028 LASTMARK_SAVE();
1029
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001030 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031 /* tail starts with a literal. skip positions where
1032 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001033 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001034 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001035 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Victor Stinner63ab8752011-11-22 03:31:20 +01001036 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 SRE_CHARGET(state, ctx->ptr, 0) != ctx->u.chr)) {
1038 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001039 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001041 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001044 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
Georg Brandldaa1fa92013-10-13 09:32:59 +02001045 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001046 if (ret) {
1047 RETURN_ON_ERROR(ret);
1048 RETURN_SUCCESS;
1049 }
Tim Peters3d563502006-01-21 02:47:53 +00001050
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001051 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055 }
1056
1057 } else {
1058 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001059 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001060 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
Georg Brandldaa1fa92013-10-13 09:32:59 +02001062 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 if (ret) {
1064 RETURN_ON_ERROR(ret);
1065 RETURN_SUCCESS;
1066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001068 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001069 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001070 }
1071 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001072 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001073
Guido van Rossum41c99e72003-04-14 17:59:34 +00001074 case SRE_OP_MIN_REPEAT_ONE:
1075 /* match repeated sequence (minimizing regexp) */
1076
1077 /* this operator only works if the repeated item is
1078 exactly one character wide, and we're not already
1079 collecting backtracking points. for other cases,
1080 use the MIN_REPEAT operator */
1081
1082 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1083
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001084 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1085 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001086
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001087 if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001088 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001089
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001090 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001091
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001092 if (ctx->pattern[1] == 0)
1093 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001094 else {
1095 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001096 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1097 RETURN_ON_ERROR(ret);
1098 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001099 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001100 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001101 RETURN_FAILURE;
1102 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001103 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 ctx->ptr += state->charsize * ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001105 }
1106
Georg Brandldaa1fa92013-10-13 09:32:59 +02001107 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001108 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001109 state->ptr = ctx->ptr;
1110 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001111
1112 } else {
1113 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001114 LASTMARK_SAVE();
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001115 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001116 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001117 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001118 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
Georg Brandldaa1fa92013-10-13 09:32:59 +02001119 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001120 if (ret) {
1121 RETURN_ON_ERROR(ret);
1122 RETURN_SUCCESS;
1123 }
1124 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001125 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001126 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001127 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001128 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001129 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001130 assert(ret == 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001132 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001133 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001134 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001135 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001137
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001138 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001139 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001140 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001141 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001142 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1143 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001144
1145 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001146 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001147 if (!ctx->u.rep) {
1148 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001149 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001150 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001151 ctx->u.rep->count = -1;
1152 ctx->u.rep->pattern = ctx->pattern;
1153 ctx->u.rep->prev = state->repeat;
1154 ctx->u.rep->last_ptr = NULL;
1155 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001156
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001157 state->ptr = ctx->ptr;
Georg Brandldaa1fa92013-10-13 09:32:59 +02001158 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001159 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001160 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001161
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001162 if (ret) {
1163 RETURN_ON_ERROR(ret);
1164 RETURN_SUCCESS;
1165 }
1166 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001167
1168 case SRE_OP_MAX_UNTIL:
1169 /* maximizing repeat */
1170 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1171
1172 /* FIXME: we probably need to deal with zero-width
1173 matches in here... */
1174
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001175 ctx->u.rep = state->repeat;
1176 if (!ctx->u.rep)
1177 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001178
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001179 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001180
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001181 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001182
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001183 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001184 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001185
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001186 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001187 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001188 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001189 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
Georg Brandldaa1fa92013-10-13 09:32:59 +02001190 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001191 if (ret) {
1192 RETURN_ON_ERROR(ret);
1193 RETURN_SUCCESS;
1194 }
1195 ctx->u.rep->count = ctx->count-1;
1196 state->ptr = ctx->ptr;
1197 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001198 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001199
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001200 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001201 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203 /* we may have enough matches, but if we can
1204 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001205 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001206 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001207 MARK_PUSH(ctx->lastmark);
1208 /* zero-width match protection */
1209 DATA_PUSH(&ctx->u.rep->last_ptr);
1210 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
Georg Brandldaa1fa92013-10-13 09:32:59 +02001212 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001213 DATA_POP(&ctx->u.rep->last_ptr);
1214 if (ret) {
1215 MARK_POP_DISCARD(ctx->lastmark);
1216 RETURN_ON_ERROR(ret);
1217 RETURN_SUCCESS;
1218 }
1219 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001220 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001221 ctx->u.rep->count = ctx->count-1;
1222 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001223 }
1224
1225 /* cannot match more repeated items here. make sure the
1226 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 state->repeat = ctx->u.rep->prev;
Georg Brandldaa1fa92013-10-13 09:32:59 +02001228 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001229 RETURN_ON_SUCCESS(ret);
1230 state->repeat = ctx->u.rep;
1231 state->ptr = ctx->ptr;
1232 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001233
1234 case SRE_OP_MIN_UNTIL:
1235 /* minimizing repeat */
1236 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1237
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001238 ctx->u.rep = state->repeat;
1239 if (!ctx->u.rep)
1240 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001241
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001242 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001243
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001244 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001245
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001246 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001247 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001248
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001249 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001250 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001251 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001252 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
Georg Brandldaa1fa92013-10-13 09:32:59 +02001253 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001254 if (ret) {
1255 RETURN_ON_ERROR(ret);
1256 RETURN_SUCCESS;
1257 }
1258 ctx->u.rep->count = ctx->count-1;
1259 state->ptr = ctx->ptr;
1260 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001261 }
1262
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001263 LASTMARK_SAVE();
1264
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001265 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001266 state->repeat = ctx->u.rep->prev;
Georg Brandldaa1fa92013-10-13 09:32:59 +02001267 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001268 if (ret) {
1269 RETURN_ON_ERROR(ret);
1270 RETURN_SUCCESS;
1271 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001272
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001273 state->repeat = ctx->u.rep;
1274 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001275
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001276 LASTMARK_RESTORE();
1277
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001278 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
Serhiy Storchakafa468162013-02-16 21:23:53 +02001279 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1280 state->ptr == ctx->u.rep->last_ptr)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001281 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001282
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001283 ctx->u.rep->count = ctx->count;
Serhiy Storchakafa468162013-02-16 21:23:53 +02001284 /* zero-width match protection */
1285 DATA_PUSH(&ctx->u.rep->last_ptr);
1286 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
Georg Brandldaa1fa92013-10-13 09:32:59 +02001288 ctx->u.rep->pattern+3);
Serhiy Storchakafa468162013-02-16 21:23:53 +02001289 DATA_POP(&ctx->u.rep->last_ptr);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001290 if (ret) {
1291 RETURN_ON_ERROR(ret);
1292 RETURN_SUCCESS;
1293 }
1294 ctx->u.rep->count = ctx->count-1;
1295 state->ptr = ctx->ptr;
1296 RETURN_FAILURE;
1297
1298 case SRE_OP_GROUPREF:
1299 /* match backreference */
1300 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1301 ctx->ptr, ctx->pattern[0]));
1302 i = ctx->pattern[0];
1303 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001304 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001305 if (groupref >= state->lastmark) {
1306 RETURN_FAILURE;
1307 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 char* p = (char*) state->mark[groupref];
1309 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001310 if (!p || !e || e < p)
1311 RETURN_FAILURE;
1312 while (p < e) {
Victor Stinner63ab8752011-11-22 03:31:20 +01001313 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 SRE_CHARGET(state, ctx->ptr, 0) != SRE_CHARGET(state, p, 0))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001315 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 p += state->charsize;
1317 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001318 }
1319 }
1320 }
1321 ctx->pattern++;
1322 break;
1323
1324 case SRE_OP_GROUPREF_IGNORE:
1325 /* match backreference */
1326 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1327 ctx->ptr, ctx->pattern[0]));
1328 i = ctx->pattern[0];
1329 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001330 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001331 if (groupref >= state->lastmark) {
1332 RETURN_FAILURE;
1333 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 char* p = (char*) state->mark[groupref];
1335 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001336 if (!p || !e || e < p)
1337 RETURN_FAILURE;
1338 while (p < e) {
1339 if (ctx->ptr >= end ||
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001340 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) !=
1341 state->lower(SRE_CHARGET(state, p, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001342 RETURN_FAILURE;
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001343 p += state->charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001345 }
1346 }
1347 }
1348 ctx->pattern++;
1349 break;
1350
1351 case SRE_OP_GROUPREF_EXISTS:
1352 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1353 ctx->ptr, ctx->pattern[0]));
1354 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1355 i = ctx->pattern[0];
1356 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001357 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001358 if (groupref >= state->lastmark) {
1359 ctx->pattern += ctx->pattern[1];
1360 break;
1361 } else {
1362 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1363 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1364 if (!p || !e || e < p) {
1365 ctx->pattern += ctx->pattern[1];
1366 break;
1367 }
1368 }
1369 }
1370 ctx->pattern += 2;
1371 break;
1372
1373 case SRE_OP_ASSERT:
1374 /* assert subpattern */
1375 /* <ASSERT> <skip> <back> <pattern> */
1376 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1377 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001379 if (state->ptr < state->beginning)
1380 RETURN_FAILURE;
Georg Brandldaa1fa92013-10-13 09:32:59 +02001381 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001382 RETURN_ON_FAILURE(ret);
1383 ctx->pattern += ctx->pattern[0];
1384 break;
1385
1386 case SRE_OP_ASSERT_NOT:
1387 /* assert not subpattern */
1388 /* <ASSERT_NOT> <skip> <back> <pattern> */
1389 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1390 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001392 if (state->ptr >= state->beginning) {
Georg Brandldaa1fa92013-10-13 09:32:59 +02001393 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001394 if (ret) {
1395 RETURN_ON_ERROR(ret);
1396 RETURN_FAILURE;
1397 }
1398 }
1399 ctx->pattern += ctx->pattern[0];
1400 break;
1401
1402 case SRE_OP_FAILURE:
1403 /* immediate failure */
1404 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1405 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001406
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001407 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001408 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1409 ctx->pattern[-1]));
1410 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001411 }
1412 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001413
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001414exit:
1415 ctx_pos = ctx->last_ctx_pos;
1416 jump = ctx->jump;
1417 DATA_POP_DISCARD(ctx);
1418 if (ctx_pos == -1)
1419 return ret;
1420 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1421
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001422 switch (jump) {
1423 case JUMP_MAX_UNTIL_2:
1424 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1425 goto jump_max_until_2;
1426 case JUMP_MAX_UNTIL_3:
1427 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1428 goto jump_max_until_3;
1429 case JUMP_MIN_UNTIL_2:
1430 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1431 goto jump_min_until_2;
1432 case JUMP_MIN_UNTIL_3:
1433 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1434 goto jump_min_until_3;
1435 case JUMP_BRANCH:
1436 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1437 goto jump_branch;
1438 case JUMP_MAX_UNTIL_1:
1439 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1440 goto jump_max_until_1;
1441 case JUMP_MIN_UNTIL_1:
1442 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1443 goto jump_min_until_1;
1444 case JUMP_REPEAT:
1445 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1446 goto jump_repeat;
1447 case JUMP_REPEAT_ONE_1:
1448 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1449 goto jump_repeat_one_1;
1450 case JUMP_REPEAT_ONE_2:
1451 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1452 goto jump_repeat_one_2;
1453 case JUMP_MIN_REPEAT_ONE:
1454 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1455 goto jump_min_repeat_one;
1456 case JUMP_ASSERT:
1457 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1458 goto jump_assert;
1459 case JUMP_ASSERT_NOT:
1460 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1461 goto jump_assert_not;
1462 case JUMP_NONE:
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001463 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1464 ctx->ptr, ret));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001465 break;
1466 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001467
1468 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001469}
1470
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001471LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001472SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1473{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 char* ptr = (char*)state->start;
1475 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001476 Py_ssize_t status = 0;
1477 Py_ssize_t prefix_len = 0;
1478 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001479 SRE_CODE* prefix = NULL;
1480 SRE_CODE* charset = NULL;
1481 SRE_CODE* overlap = NULL;
1482 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001483
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001484 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001485 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001486 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001487
1488 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001489
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001490 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001491 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001492 character in there, so literal search will work) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 end -= (pattern[3]-1) * state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001494 if (end <= ptr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 end = ptr + state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001496 }
1497
Fredrik Lundh3562f112000-07-02 12:00:07 +00001498 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001499 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001500 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001501 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001502 prefix_skip = pattern[6];
1503 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001504 overlap = prefix + prefix_len - 1;
1505 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001506 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001507 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001508 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001509
1510 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001511 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001512
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001513 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1514 prefix, prefix_len, prefix_skip));
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001515 TRACE(("charset = %p\n", charset));
1516
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001517#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001518 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001519 /* pattern starts with a known prefix. use the overlap
1520 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001521 Py_ssize_t i = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522 end = (char *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001523 while (ptr < end) {
1524 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525 if ((SRE_CODE) SRE_CHARGET(state, ptr, 0) != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001526 if (!i)
1527 break;
1528 else
1529 i = overlap[i];
1530 } else {
1531 if (++i == prefix_len) {
1532 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001533 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 state->start = ptr - (prefix_len - 1) * state->charsize;
1535 state->ptr = ptr - (prefix_len - prefix_skip - 1) * state->charsize;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001536 if (flags & SRE_INFO_LITERAL)
1537 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001538 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001539 if (status != 0)
1540 return status;
1541 /* close but no cigar -- try again */
1542 i = overlap[i];
1543 }
1544 break;
1545 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 ptr += state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001548 }
1549 return 0;
1550 }
1551#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001552
Fredrik Lundh3562f112000-07-02 12:00:07 +00001553 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001554 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001555 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001556 SRE_CODE chr = pattern[1];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001558 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
1560 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001561 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001562 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001563 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001564 state->start = ptr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 ptr += state->charsize;
1566 state->ptr = ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001567 if (flags & SRE_INFO_LITERAL)
1568 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001569 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001570 if (status != 0)
1571 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001572 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001573 } else if (charset) {
1574 /* pattern starts with a character from a known set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577 while (ptr < end && !SRE_CHARSET(charset, SRE_CHARGET(state, ptr, 0)))
1578 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001579 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001581 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001582 state->start = ptr;
1583 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001584 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001585 if (status != 0)
1586 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587 ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 }
1589 } else
1590 /* general case */
1591 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001592 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001593 state->start = state->ptr = ptr;
1594 ptr += state->charsize;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001595 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 if (status != 0)
1597 break;
1598 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001599
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001600 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001601}
Tim Peters3d563502006-01-21 02:47:53 +00001602
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001603#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001604
1605/* -------------------------------------------------------------------- */
1606/* factories and destructors */
1607
1608/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001609static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001610static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +00001611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612static int
1613sre_literal_template(int charsize, char* ptr, Py_ssize_t len)
1614{
1615 /* check if given string is a literal template (i.e. no escapes) */
1616 struct {
1617 int charsize;
1618 } state = {
1619 charsize
1620 };
1621 while (len-- > 0) {
1622 if (SRE_CHARGET((&state), ptr, 0) == '\\')
1623 return 0;
1624 ptr += charsize;
1625 }
1626 return 1;
1627}
1628
Guido van Rossumb700df92000-03-31 14:59:30 +00001629static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001630sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001631{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001632 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001633}
1634
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001635static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001636sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001637{
1638 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001639 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001640 return NULL;
1641 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001642 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001643 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001644 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +00001645 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001646}
1647
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001648LOCAL(void)
1649state_reset(SRE_STATE* state)
1650{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001651 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001652 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001653
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001654 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655 state->lastindex = -1;
1656
1657 state->repeat = NULL;
1658
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001659 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001660}
1661
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001662static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663getstring(PyObject* string, Py_ssize_t* p_length,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001664 int* p_logical_charsize, int* p_charsize,
1665 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +00001666{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001667 /* given a python object, return a data pointer, a length (in
1668 characters), and a character size. return NULL if the object
1669 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001670
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001671 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001672 Py_ssize_t size, bytes;
1673 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001674 void* ptr;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001675
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001676 /* Unicode objects do not support the buffer API. So, get the data
1677 directly instead. */
1678 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 if (PyUnicode_READY(string) == -1)
1680 return NULL;
1681 ptr = PyUnicode_DATA(string);
1682 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001683 *p_charsize = PyUnicode_KIND(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 *p_logical_charsize = 4;
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001685 return ptr;
1686 }
1687
Victor Stinner0058b862011-09-29 03:27:47 +02001688 /* get pointer to byte string buffer */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001689 view->len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001690 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001691 if (!buffer || !buffer->bf_getbuffer ||
Benjamin Petersone48944b2012-03-07 14:50:25 -06001692 (*buffer->bf_getbuffer)(string, view, PyBUF_SIMPLE) < 0) {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001693 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1694 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001695 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001696
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001697 /* determine buffer size */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001698 bytes = view->len;
1699 ptr = view->buf;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001700
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001701 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001702 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001703 goto err;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001704 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001705
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001706 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001707 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001708
Christian Heimes72b710a2008-05-26 13:28:38 +00001709 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001710 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001711 else {
1712 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001713 goto err;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001714 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001715
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001716 *p_length = size;
1717 *p_charsize = charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 *p_logical_charsize = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001719
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001720 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001721 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001722 "Buffer is NULL");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001723 goto err;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001724 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001725 return ptr;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001726 err:
1727 PyBuffer_Release(view);
1728 view->buf = NULL;
1729 return NULL;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001730}
1731
1732LOCAL(PyObject*)
1733state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001734 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001735{
1736 /* prepare state object */
1737
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001738 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 int logical_charsize, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001740 void* ptr;
1741
1742 memset(state, 0, sizeof(SRE_STATE));
1743
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001744 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001745 state->lastindex = -1;
1746
Benjamin Petersone48944b2012-03-07 14:50:25 -06001747 state->buffer.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001748 ptr = getstring(string, &length, &logical_charsize, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001749 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -06001750 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001751
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001752 if (logical_charsize == 1 && pattern->logical_charsize > 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001753 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001754 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001755 goto err;
1756 }
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001757 if (logical_charsize > 1 && pattern->logical_charsize == 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001758 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001759 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001760 goto err;
1761 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001762
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001763 /* adjust boundaries */
1764 if (start < 0)
1765 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001766 else if (start > length)
1767 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001768
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001769 if (end < 0)
1770 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001771 else if (end > length)
1772 end = length;
1773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 state->logical_charsize = logical_charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001775 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001776
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001777 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001778
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001779 state->start = (void*) ((char*) ptr + start * state->charsize);
1780 state->end = (void*) ((char*) ptr + end * state->charsize);
1781
1782 Py_INCREF(string);
1783 state->string = string;
1784 state->pos = start;
1785 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001786
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001787 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001788 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001789 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001790 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001791 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001792 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001793
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001794 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001795 err:
1796 if (state->buffer.buf)
1797 PyBuffer_Release(&state->buffer);
1798 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001799}
1800
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001801LOCAL(void)
1802state_fini(SRE_STATE* state)
1803{
Benjamin Petersone48944b2012-03-07 14:50:25 -06001804 if (state->buffer.buf)
1805 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001806 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001807 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001808}
1809
/* Convert a pointer into the subject buffer to a character index: the byte
   distance from state->beginning divided by the width of one character. */
#define STATE_OFFSET(state, member)\
    (((const char*)(member) - (const char*)(state)->beginning) /\
     (state)->charsize)
1813
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001814LOCAL(PyObject*)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001815getslice(int logical_charsize, const void *ptr,
1816 PyObject* string, Py_ssize_t start, Py_ssize_t end)
1817{
1818 if (logical_charsize == 1) {
1819 if (PyBytes_CheckExact(string) &&
1820 start == 0 && end == PyBytes_GET_SIZE(string)) {
1821 Py_INCREF(string);
1822 return string;
1823 }
1824 return PyBytes_FromStringAndSize(
1825 (const char *)ptr + start, end - start);
1826 }
1827 else {
1828 return PyUnicode_Substring(string, start, end);
1829 }
1830}
1831
1832LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001833state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001834{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001835 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001836
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001837 index = (index - 1) * 2;
1838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001839 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001840 if (empty)
1841 /* want empty string */
1842 i = j = 0;
1843 else {
1844 Py_INCREF(Py_None);
1845 return Py_None;
1846 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001847 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001848 i = STATE_OFFSET(state, state->mark[index]);
1849 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001850 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001851
Serhiy Storchaka25324972013-10-16 12:46:28 +03001852 return getslice(state->logical_charsize, state->beginning, string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001853}
1854
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001855static void
1856pattern_error(int status)
1857{
1858 switch (status) {
1859 case SRE_ERROR_RECURSION_LIMIT:
1860 PyErr_SetString(
1861 PyExc_RuntimeError,
1862 "maximum recursion limit exceeded"
1863 );
1864 break;
1865 case SRE_ERROR_MEMORY:
1866 PyErr_NoMemory();
1867 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001868 case SRE_ERROR_INTERRUPTED:
1869 /* An exception has already been raised, so let it fly */
1870 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001871 default:
1872 /* other error codes indicate compiler/engine bugs */
1873 PyErr_SetString(
1874 PyExc_RuntimeError,
1875 "internal error in regular expression engine"
1876 );
1877 }
1878}
1879
Guido van Rossumb700df92000-03-31 14:59:30 +00001880static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001881pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001882{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001883 if (self->weakreflist != NULL)
1884 PyObject_ClearWeakRefs((PyObject *) self);
Benjamin Petersone48944b2012-03-07 14:50:25 -06001885 if (self->view.buf)
1886 PyBuffer_Release(&self->view);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001887 Py_XDECREF(self->pattern);
1888 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001889 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001890 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001891}
1892
1893static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001894pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001895{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001896 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01001897 Py_ssize_t status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001898
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001899 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001900 Py_ssize_t start = 0;
1901 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001902 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001903 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001904 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001905 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001906
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001907 string = state_init(&state, self, string, start, end);
1908 if (!string)
1909 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001911 state.ptr = state.start;
1912
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001913 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915 if (state.logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001916 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001917 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001918 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001919 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001920
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001921 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001922 if (PyErr_Occurred())
1923 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001924
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001925 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001926
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001927 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001928}
1929
1930static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001931pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001932{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001933 SRE_STATE state;
1934 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001935
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001936 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001937 Py_ssize_t start = 0;
1938 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001939 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001940 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001941 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001942 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001943
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001944 string = state_init(&state, self, string, start, end);
1945 if (!string)
1946 return NULL;
1947
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001948 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001951 status = sre_search(&state, PatternObject_GetCode(self));
1952 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001953 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001954 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001955
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001956 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1957
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001958 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001959
Thomas Wouters89f507f2006-12-13 04:49:30 +00001960 if (PyErr_Occurred())
1961 return NULL;
1962
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001963 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001964}
1965
1966static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001967call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001968{
1969 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001970 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001971 PyObject* func;
1972 PyObject* result;
1973
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001974 if (!args)
1975 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001976 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001977 if (!name)
1978 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001979 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001980 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001981 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001982 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001983 func = PyObject_GetAttrString(mod, function);
1984 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001985 if (!func)
1986 return NULL;
1987 result = PyObject_CallObject(func, args);
1988 Py_DECREF(func);
1989 Py_DECREF(args);
1990 return result;
1991}
1992
#ifdef USE_BUILTIN_COPY
/* Replace *object with copy.deepcopy(*object, memo).
   Returns 1 on success; on failure returns 0 and leaves *object alone. */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* call() consumes the argument tuple and copes with it being NULL */
    PyObject* packed = PyTuple_Pack(2, *object, memo);
    PyObject* duplicate = call("copy", "deepcopy", packed);

    if (duplicate == NULL)
        return 0;

    Py_DECREF(*object);
    *object = duplicate;

    return 1; /* success */
}
#endif
2012
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002013static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002014pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002015{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002016 SRE_STATE state;
2017 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002018 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002019 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002020
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002021 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002022 Py_ssize_t start = 0;
2023 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002024 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002025 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002026 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002027 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002028
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002029 string = state_init(&state, self, string, start, end);
2030 if (!string)
2031 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002032
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002033 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002034 if (!list) {
2035 state_fini(&state);
2036 return NULL;
2037 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002038
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002039 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002040
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002041 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002042
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002043 state_reset(&state);
2044
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002045 state.ptr = state.start;
2046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002048 status = sre_search(&state, PatternObject_GetCode(self));
2049 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002051 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002052
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002053 if (PyErr_Occurred())
2054 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002055
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002056 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002057 if (status == 0)
2058 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002059 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002061 }
Tim Peters3d563502006-01-21 02:47:53 +00002062
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002063 /* don't bother to build a match object */
2064 switch (self->groups) {
2065 case 0:
2066 b = STATE_OFFSET(&state, state.start);
2067 e = STATE_OFFSET(&state, state.ptr);
Serhiy Storchaka25324972013-10-16 12:46:28 +03002068 item = getslice(state.logical_charsize, state.beginning,
2069 string, b, e);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002070 if (!item)
2071 goto error;
2072 break;
2073 case 1:
2074 item = state_getslice(&state, 1, string, 1);
2075 if (!item)
2076 goto error;
2077 break;
2078 default:
2079 item = PyTuple_New(self->groups);
2080 if (!item)
2081 goto error;
2082 for (i = 0; i < self->groups; i++) {
2083 PyObject* o = state_getslice(&state, i+1, string, 1);
2084 if (!o) {
2085 Py_DECREF(item);
2086 goto error;
2087 }
2088 PyTuple_SET_ITEM(item, i, o);
2089 }
2090 break;
2091 }
2092
2093 status = PyList_Append(list, item);
2094 Py_DECREF(item);
2095 if (status < 0)
2096 goto error;
2097
2098 if (state.ptr == state.start)
2099 state.start = (void*) ((char*) state.ptr + state.charsize);
2100 else
2101 state.start = state.ptr;
2102
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002103 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002104
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002105 state_fini(&state);
2106 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002107
2108error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002109 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002110 state_fini(&state);
2111 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002112
Guido van Rossumb700df92000-03-31 14:59:30 +00002113}
2114
Fredrik Lundh703ce812001-10-24 22:16:30 +00002115static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002116pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
Fredrik Lundh703ce812001-10-24 22:16:30 +00002117{
2118 PyObject* scanner;
2119 PyObject* search;
2120 PyObject* iterator;
2121
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002122 scanner = pattern_scanner(pattern, args, kw);
Fredrik Lundh703ce812001-10-24 22:16:30 +00002123 if (!scanner)
2124 return NULL;
2125
2126 search = PyObject_GetAttrString(scanner, "search");
2127 Py_DECREF(scanner);
2128 if (!search)
2129 return NULL;
2130
2131 iterator = PyCallIter_New(search, Py_None);
2132 Py_DECREF(search);
2133
2134 return iterator;
2135}
Fredrik Lundh703ce812001-10-24 22:16:30 +00002136
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002137static PyObject*
2138pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2139{
2140 SRE_STATE state;
2141 PyObject* list;
2142 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002143 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002144 Py_ssize_t n;
2145 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002146 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002147
2148 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002149 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002150 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002151 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002152 &string, &maxsplit))
2153 return NULL;
2154
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002155 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002156 if (!string)
2157 return NULL;
2158
2159 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002160 if (!list) {
2161 state_fini(&state);
2162 return NULL;
2163 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002164
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002165 n = 0;
2166 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002167
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002168 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002169
2170 state_reset(&state);
2171
2172 state.ptr = state.start;
2173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 if (state.logical_charsize == 1) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002175 status = sre_search(&state, PatternObject_GetCode(self));
2176 } else {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002177 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002178 }
2179
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002180 if (PyErr_Occurred())
2181 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002182
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002183 if (status <= 0) {
2184 if (status == 0)
2185 break;
2186 pattern_error(status);
2187 goto error;
2188 }
Tim Peters3d563502006-01-21 02:47:53 +00002189
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002190 if (state.start == state.ptr) {
2191 if (last == state.end)
2192 break;
2193 /* skip one character */
2194 state.start = (void*) ((char*) state.ptr + state.charsize);
2195 continue;
2196 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002197
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002198 /* get segment before this match */
Serhiy Storchaka25324972013-10-16 12:46:28 +03002199 item = getslice(state.logical_charsize, state.beginning,
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002200 string, STATE_OFFSET(&state, last),
2201 STATE_OFFSET(&state, state.start)
2202 );
2203 if (!item)
2204 goto error;
2205 status = PyList_Append(list, item);
2206 Py_DECREF(item);
2207 if (status < 0)
2208 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002209
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002210 /* add groups (if any) */
2211 for (i = 0; i < self->groups; i++) {
2212 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002213 if (!item)
2214 goto error;
2215 status = PyList_Append(list, item);
2216 Py_DECREF(item);
2217 if (status < 0)
2218 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002219 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002220
2221 n = n + 1;
2222
2223 last = state.start = state.ptr;
2224
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002225 }
2226
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002227 /* get segment following last match (even if empty) */
Serhiy Storchaka25324972013-10-16 12:46:28 +03002228 item = getslice(state.logical_charsize, state.beginning,
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002229 string, STATE_OFFSET(&state, last), state.endpos
2230 );
2231 if (!item)
2232 goto error;
2233 status = PyList_Append(list, item);
2234 Py_DECREF(item);
2235 if (status < 0)
2236 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002237
2238 state_fini(&state);
2239 return list;
2240
2241error:
2242 Py_DECREF(list);
2243 state_fini(&state);
2244 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002245
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002246}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002247
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002248static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002249pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002250 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002251{
2252 SRE_STATE state;
2253 PyObject* list;
Serhiy Storchaka25324972013-10-16 12:46:28 +03002254 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002255 PyObject* item;
2256 PyObject* filter;
2257 PyObject* args;
2258 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002259 void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002260 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002261 Py_ssize_t n;
2262 Py_ssize_t i, b, e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 int logical_charsize, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002264 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002265 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002266
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002267 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002268 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002269 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002270 Py_INCREF(filter);
2271 filter_is_callable = 1;
2272 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002273 /* if not callable, check if it's a literal string */
2274 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002275 view.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002276 ptr = getstring(ptemplate, &n, &logical_charsize, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002278 if (ptr) {
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002279 literal = sre_literal_template(charsize, ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002280 } else {
2281 PyErr_Clear();
2282 literal = 0;
2283 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06002284 if (view.buf)
2285 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002286 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002287 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002288 Py_INCREF(filter);
2289 filter_is_callable = 0;
2290 } else {
2291 /* not a literal; hand it over to the template compiler */
2292 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002293 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002294 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002295 );
2296 if (!filter)
2297 return NULL;
2298 filter_is_callable = PyCallable_Check(filter);
2299 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002300 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002301
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002302 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002303 if (!string) {
2304 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002305 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002306 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002307
2308 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002309 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002310 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002311 state_fini(&state);
2312 return NULL;
2313 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002314
2315 n = i = 0;
2316
2317 while (!count || n < count) {
2318
2319 state_reset(&state);
2320
2321 state.ptr = state.start;
2322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 if (state.logical_charsize == 1) {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002324 status = sre_search(&state, PatternObject_GetCode(self));
2325 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002326 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002327 }
2328
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002329 if (PyErr_Occurred())
2330 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002331
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002332 if (status <= 0) {
2333 if (status == 0)
2334 break;
2335 pattern_error(status);
2336 goto error;
2337 }
Tim Peters3d563502006-01-21 02:47:53 +00002338
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002339 b = STATE_OFFSET(&state, state.start);
2340 e = STATE_OFFSET(&state, state.ptr);
2341
2342 if (i < b) {
2343 /* get segment before this match */
Serhiy Storchaka25324972013-10-16 12:46:28 +03002344 item = getslice(state.logical_charsize, state.beginning,
2345 string, i, b);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002346 if (!item)
2347 goto error;
2348 status = PyList_Append(list, item);
2349 Py_DECREF(item);
2350 if (status < 0)
2351 goto error;
2352
2353 } else if (i == b && i == e && n > 0)
2354 /* ignore empty match on latest position */
2355 goto next;
2356
2357 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002358 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002359 match = pattern_new_match(self, &state, 1);
2360 if (!match)
2361 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002362 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002363 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002364 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002365 goto error;
2366 }
2367 item = PyObject_CallObject(filter, args);
2368 Py_DECREF(args);
2369 Py_DECREF(match);
2370 if (!item)
2371 goto error;
2372 } else {
2373 /* filter is literal string */
2374 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002375 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002376 }
2377
2378 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002379 if (item != Py_None) {
2380 status = PyList_Append(list, item);
2381 Py_DECREF(item);
2382 if (status < 0)
2383 goto error;
2384 }
Tim Peters3d563502006-01-21 02:47:53 +00002385
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002386 i = e;
2387 n = n + 1;
2388
2389next:
2390 /* move on */
2391 if (state.ptr == state.start)
2392 state.start = (void*) ((char*) state.ptr + state.charsize);
2393 else
2394 state.start = state.ptr;
2395
2396 }
2397
2398 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002399 if (i < state.endpos) {
Serhiy Storchaka25324972013-10-16 12:46:28 +03002400 item = getslice(state.logical_charsize, state.beginning,
2401 string, i, state.endpos);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002402 if (!item)
2403 goto error;
2404 status = PyList_Append(list, item);
2405 Py_DECREF(item);
2406 if (status < 0)
2407 goto error;
2408 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002409
2410 state_fini(&state);
2411
Guido van Rossum4e173842001-12-07 04:25:10 +00002412 Py_DECREF(filter);
2413
Fredrik Lundhdac58492001-10-21 21:48:30 +00002414 /* convert list to single string (also removes list) */
Serhiy Storchaka25324972013-10-16 12:46:28 +03002415 joiner = getslice(state.logical_charsize, state.beginning, string, 0, 0);
2416 if (!joiner) {
2417 Py_DECREF(list);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002418 return NULL;
Serhiy Storchaka25324972013-10-16 12:46:28 +03002419 }
2420 if (PyList_GET_SIZE(list) == 0) {
2421 Py_DECREF(list);
2422 item = joiner;
2423 }
2424 else {
2425 if (state.logical_charsize == 1)
2426 item = _PyBytes_Join(joiner, list);
2427 else
2428 item = PyUnicode_Join(joiner, list);
2429 Py_DECREF(joiner);
2430 if (!item)
2431 return NULL;
2432 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002433
2434 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002435 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002436
2437 return item;
2438
2439error:
2440 Py_DECREF(list);
2441 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002442 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002443 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002444
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002445}
2446
2447static PyObject*
2448pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2449{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002450 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002451 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002452 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002453 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002454 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002455 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002456 return NULL;
2457
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002458 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002459}
2460
2461static PyObject*
2462pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2463{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002464 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002465 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002466 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002467 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002468 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002469 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002470 return NULL;
2471
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002472 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002473}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002474
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002475static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002476pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002477{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002478#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002479 PatternObject* copy;
2480 int offset;
2481
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002482 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2483 if (!copy)
2484 return NULL;
2485
2486 offset = offsetof(PatternObject, groups);
2487
2488 Py_XINCREF(self->groupindex);
2489 Py_XINCREF(self->indexgroup);
2490 Py_XINCREF(self->pattern);
2491
2492 memcpy((char*) copy + offset, (char*) self + offset,
2493 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002494 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002495
2496 return (PyObject*) copy;
2497#else
2498 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2499 return NULL;
2500#endif
2501}
2502
2503static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002504pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002505{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002506#ifdef USE_BUILTIN_COPY
2507 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002508
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002509 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002510 if (!copy)
2511 return NULL;
2512
2513 if (!deepcopy(&copy->groupindex, memo) ||
2514 !deepcopy(&copy->indexgroup, memo) ||
2515 !deepcopy(&copy->pattern, memo)) {
2516 Py_DECREF(copy);
2517 return NULL;
2518 }
2519
2520#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002521 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2522 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002523#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002524}
2525
/* Docstring text for the pattern methods match, search, split, findall,
   finditer and sub, each defined with PyDoc_STRVAR. */
Raymond Hettinger94478742004-09-24 04:31:19 +00002526PyDoc_STRVAR(pattern_match_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002527"match(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002528 Matches zero or more characters at the beginning of the string");
2529
2530PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002531"search(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002532 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02002533 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002534
2535PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002536"split(string[, maxsplit = 0]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002537 Split string by the occurrences of pattern.");
2538
2539PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002540"findall(string[, pos[, endpos]]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002541 Return a list of all non-overlapping matches of pattern in string.");
2542
2543PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002544"finditer(string[, pos[, endpos]]) -> iterator.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002545 Return an iterator over all non-overlapping matches for the \n\
2546 RE pattern in string. For each match, the iterator returns a\n\
2547 match object.");
2548
2549PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002550"sub(repl, string[, count = 0]) -> newstring.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002551 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002552 occurrences of pattern in string by the replacement repl.");
2554PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002555"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002556 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2557 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002558 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002559
2560PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2561
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002562static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002563 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002564 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002565 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002566 pattern_search_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002567 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002568 pattern_sub_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002569 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002570 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002571 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002572 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002573 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002574 pattern_findall_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002575 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002576 pattern_finditer_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002577 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002578 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2579 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002580 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002581};
2582
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002583#define PAT_OFF(x) offsetof(PatternObject, x)
2584static PyMemberDef pattern_members[] = {
2585 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2586 {"flags", T_INT, PAT_OFF(flags), READONLY},
2587 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2588 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2589 {NULL} /* Sentinel */
2590};
Guido van Rossumb700df92000-03-31 14:59:30 +00002591
/* Type object for compiled patterns.  Instances are variable-sized: the
   basic size is sizeof(PatternObject) and each item is one SRE_CODE.  Only
   the slots listed here are set explicitly (deallocator, flags, docstring,
   weak-reference list offset, method table and member table); every slot
   after tp_members is left to zero-initialization. */
static PyTypeObject Pattern_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "_" SRE_MODULE ".SRE_Pattern",      /* tp_name */
    sizeof(PatternObject), sizeof(SRE_CODE), /* tp_basicsize, tp_itemsize */
    (destructor)pattern_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    0,                                  /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    pattern_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    offsetof(PatternObject, weakreflist),       /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    pattern_methods,                    /* tp_methods */
    pattern_members,                    /* tp_members */
};
2622
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002623static int _validate(PatternObject *self); /* Forward */
2624
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002625static PyObject *
2626_compile(PyObject* self_, PyObject* args)
2627{
2628 /* "compile" pattern descriptor to pattern object */
2629
2630 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002631 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002632
2633 PyObject* pattern;
2634 int flags = 0;
2635 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002636 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002637 PyObject* groupindex = NULL;
2638 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002639
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002640 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002641 &PyList_Type, &code, &groups,
2642 &groupindex, &indexgroup))
2643 return NULL;
2644
2645 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002646 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002647 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2648 if (!self)
2649 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002650 self->weakreflist = NULL;
2651 self->pattern = NULL;
2652 self->groupindex = NULL;
2653 self->indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002654 self->view.buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002655
2656 self->codesize = n;
2657
2658 for (i = 0; i < n; i++) {
2659 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002660 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002661 self->code[i] = (SRE_CODE) value;
2662 if ((unsigned long) self->code[i] != value) {
2663 PyErr_SetString(PyExc_OverflowError,
2664 "regular expression code size limit exceeded");
2665 break;
2666 }
2667 }
2668
2669 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002670 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002671 return NULL;
2672 }
2673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 if (pattern == Py_None) {
2675 self->logical_charsize = -1;
2676 self->charsize = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01002677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 else {
2679 Py_ssize_t p_length;
2680 if (!getstring(pattern, &p_length, &self->logical_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002681 &self->charsize, &self->view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 Py_DECREF(self);
2683 return NULL;
2684 }
2685 }
Antoine Pitroufd036452008-08-19 17:56:33 +00002686
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002687 Py_INCREF(pattern);
2688 self->pattern = pattern;
2689
2690 self->flags = flags;
2691
2692 self->groups = groups;
2693
2694 Py_XINCREF(groupindex);
2695 self->groupindex = groupindex;
2696
2697 Py_XINCREF(indexgroup);
2698 self->indexgroup = indexgroup;
2699
2700 self->weakreflist = NULL;
2701
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002702 if (!_validate(self)) {
2703 Py_DECREF(self);
2704 return NULL;
2705 }
2706
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002707 return (PyObject*) self;
2708}
2709
Guido van Rossumb700df92000-03-31 14:59:30 +00002710/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002711/* Code validation */
2712
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

      J---------J-------T--------T
       \         \_____/        /
        \______________________/

   but this is not:

      J---------J-------T--------T
       \_________\_____/        /
                  \____________/

   It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
   bytes wide (the latter if Python is compiled for "wide" unicode support).
*/
2735
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete, parenthesized
   printf argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns 0 from the enclosing function */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros expect the enclosing function to have `code` and `end`
   pointers in scope plus the variable they assign (`op`, `arg` or `skip`).
   Each one FAILs when `code` has already reached `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and FAIL if skip minus `adj` reaches past `end`,
   measured from the slot holding the skip.  `adj` is parenthesized so that
   a compound expression is subtracted as a whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (Py_uintptr_t)(end - code))    \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002776
2777static int
2778_validate_charset(SRE_CODE *code, SRE_CODE *end)
2779{
2780 /* Some variables are manipulated by the macros above */
2781 SRE_CODE op;
2782 SRE_CODE arg;
2783 SRE_CODE offset;
2784 int i;
2785
2786 while (code < end) {
2787 GET_OP;
2788 switch (op) {
2789
2790 case SRE_OP_NEGATE:
2791 break;
2792
2793 case SRE_OP_LITERAL:
2794 GET_ARG;
2795 break;
2796
2797 case SRE_OP_RANGE:
2798 GET_ARG;
2799 GET_ARG;
2800 break;
2801
2802 case SRE_OP_CHARSET:
2803 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
Victor Stinner1fa174a2013-08-28 02:06:21 +02002804 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002805 FAIL;
2806 code += offset;
2807 break;
2808
2809 case SRE_OP_BIGCHARSET:
2810 GET_ARG; /* Number of blocks */
2811 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02002812 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002813 FAIL;
2814 /* Make sure that each byte points to a valid block */
2815 for (i = 0; i < 256; i++) {
2816 if (((unsigned char *)code)[i] >= arg)
2817 FAIL;
2818 }
2819 code += offset;
2820 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
Victor Stinner1fa174a2013-08-28 02:06:21 +02002821 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002822 FAIL;
2823 code += offset;
2824 break;
2825
2826 case SRE_OP_CATEGORY:
2827 GET_ARG;
2828 switch (arg) {
2829 case SRE_CATEGORY_DIGIT:
2830 case SRE_CATEGORY_NOT_DIGIT:
2831 case SRE_CATEGORY_SPACE:
2832 case SRE_CATEGORY_NOT_SPACE:
2833 case SRE_CATEGORY_WORD:
2834 case SRE_CATEGORY_NOT_WORD:
2835 case SRE_CATEGORY_LINEBREAK:
2836 case SRE_CATEGORY_NOT_LINEBREAK:
2837 case SRE_CATEGORY_LOC_WORD:
2838 case SRE_CATEGORY_LOC_NOT_WORD:
2839 case SRE_CATEGORY_UNI_DIGIT:
2840 case SRE_CATEGORY_UNI_NOT_DIGIT:
2841 case SRE_CATEGORY_UNI_SPACE:
2842 case SRE_CATEGORY_UNI_NOT_SPACE:
2843 case SRE_CATEGORY_UNI_WORD:
2844 case SRE_CATEGORY_UNI_NOT_WORD:
2845 case SRE_CATEGORY_UNI_LINEBREAK:
2846 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2847 break;
2848 default:
2849 FAIL;
2850 }
2851 break;
2852
2853 default:
2854 FAIL;
2855
2856 }
2857 }
2858
2859 return 1;
2860}
2861
/* Validate the opcode sequence stored in [code, end).

   `groups` bounds the arguments of MARK (at most 2*groups+1) and of the
   GROUPREF* opcodes (must be below groups).  The function calls itself for
   the bodies of BRANCH, REPEAT, REPEAT_ONE/MIN_REPEAT_ONE, GROUPREF_EXISTS
   and ASSERT/ASSERT_NOT, and uses _validate_charset() for IN/IN_IGNORE and
   for the optional charset of an INFO block.

   Returns 1 if every opcode is recognized and passes its checks, 0 (via
   FAIL) otherwise. */
static int
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
{
    /* Some variables are manipulated by the macros above */
    SRE_CODE op;
    SRE_CODE arg;
    SRE_CODE skip;

    VTRACE(("code=%p, end=%p\n", code, end));

    if (code > end)
        FAIL;

    while (code < end) {
        GET_OP;
        switch (op) {

        case SRE_OP_MARK:
            /* We don't check whether marks are properly nested; the
               sre_match() code is robust even if they don't, and the worst
               you can get is nonsensical match results. */
            GET_ARG;
            if (arg > 2 * (size_t)groups + 1) {
                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
                FAIL;
            }
            break;

        case SRE_OP_LITERAL:
        case SRE_OP_NOT_LITERAL:
        case SRE_OP_LITERAL_IGNORE:
        case SRE_OP_NOT_LITERAL_IGNORE:
            GET_ARG;
            /* The arg is just a character, nothing to check */
            break;

        case SRE_OP_SUCCESS:
        case SRE_OP_FAILURE:
            /* Nothing to check; these normally end the matching process */
            break;

        case SRE_OP_AT:
            GET_ARG;
            switch (arg) {
            case SRE_AT_BEGINNING:
            case SRE_AT_BEGINNING_STRING:
            case SRE_AT_BEGINNING_LINE:
            case SRE_AT_END:
            case SRE_AT_END_LINE:
            case SRE_AT_END_STRING:
            case SRE_AT_BOUNDARY:
            case SRE_AT_NON_BOUNDARY:
            case SRE_AT_LOC_BOUNDARY:
            case SRE_AT_LOC_NON_BOUNDARY:
            case SRE_AT_UNI_BOUNDARY:
            case SRE_AT_UNI_NON_BOUNDARY:
                break;
            default:
                FAIL;
            }
            break;

        case SRE_OP_ANY:
        case SRE_OP_ANY_ALL:
            /* These have no operands */
            break;

        case SRE_OP_IN:
        case SRE_OP_IN_IGNORE:
            GET_SKIP;
            /* Stop 1 before the end; we check the FAILURE below */
            if (!_validate_charset(code, code+skip-2))
                FAIL;
            if (code[skip-2] != SRE_OP_FAILURE)
                FAIL;
            code += skip-1;
            break;

        case SRE_OP_INFO:
            {
                /* A minimal info field is
                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
                   more follows. */
                SRE_CODE flags, i;
                SRE_CODE *newcode;
                GET_SKIP;
                /* newcode is where the INFO block must end */
                newcode = code+skip-1;
                GET_ARG; flags = arg;
                /* the next two words are read but not inspected */
                GET_ARG;
                GET_ARG;
                /* Check that only valid flags are present */
                if ((flags & ~(SRE_INFO_PREFIX |
                               SRE_INFO_LITERAL |
                               SRE_INFO_CHARSET)) != 0)
                    FAIL;
                /* PREFIX and CHARSET are mutually exclusive */
                if ((flags & SRE_INFO_PREFIX) &&
                    (flags & SRE_INFO_CHARSET))
                    FAIL;
                /* LITERAL implies PREFIX */
                if ((flags & SRE_INFO_LITERAL) &&
                    !(flags & SRE_INFO_PREFIX))
                    FAIL;
                /* Validate the prefix */
                if (flags & SRE_INFO_PREFIX) {
                    SRE_CODE prefix_len;
                    GET_ARG; prefix_len = arg;
                    /* one more word is read but not inspected */
                    GET_ARG;
                    /* Here comes the prefix string */
                    if (prefix_len > (Py_uintptr_t)(newcode - code))
                        FAIL;
                    code += prefix_len;
                    /* And here comes the overlap table */
                    if (prefix_len > (Py_uintptr_t)(newcode - code))
                        FAIL;
                    /* Each overlap value should be < prefix_len */
                    for (i = 0; i < prefix_len; i++) {
                        if (code[i] >= prefix_len)
                            FAIL;
                    }
                    code += prefix_len;
                }
                /* Validate the charset */
                if (flags & SRE_INFO_CHARSET) {
                    if (!_validate_charset(code, newcode-1))
                        FAIL;
                    if (newcode[-1] != SRE_OP_FAILURE)
                        FAIL;
                    code = newcode;
                }
                else if (code != newcode) {
                  VTRACE(("code=%p, newcode=%p\n", code, newcode));
                    FAIL;
                }
            }
            break;

        case SRE_OP_BRANCH:
            {
                SRE_CODE *target = NULL;
                for (;;) {
                    GET_SKIP;
                    /* a zero skip terminates the list of alternatives */
                    if (skip == 0)
                        break;
                    /* Stop 2 before the end; we check the JUMP below */
                    if (!_validate_inner(code, code+skip-3, groups))
                        FAIL;
                    code += skip-3;
                    /* Check that it ends with a JUMP, and that each JUMP
                       has the same target */
                    GET_OP;
                    if (op != SRE_OP_JUMP)
                        FAIL;
                    GET_SKIP;
                    if (target == NULL)
                        target = code+skip-1;
                    else if (code+skip-1 != target)
                        FAIL;
                }
            }
            break;

        case SRE_OP_REPEAT_ONE:
        case SRE_OP_MIN_REPEAT_ONE:
            {
                SRE_CODE min, max;
                GET_SKIP;
                GET_ARG; min = arg;
                GET_ARG; max = arg;
                if (min > max)
                    FAIL;
                if (max > SRE_MAXREPEAT)
                    FAIL;
                if (!_validate_inner(code, code+skip-4, groups))
                    FAIL;
                code += skip-4;
                /* the repeated body must be closed by SUCCESS */
                GET_OP;
                if (op != SRE_OP_SUCCESS)
                    FAIL;
            }
            break;

        case SRE_OP_REPEAT:
            {
                SRE_CODE min, max;
                GET_SKIP;
                GET_ARG; min = arg;
                GET_ARG; max = arg;
                if (min > max)
                    FAIL;
                if (max > SRE_MAXREPEAT)
                    FAIL;
                if (!_validate_inner(code, code+skip-3, groups))
                    FAIL;
                code += skip-3;
                /* the repeated body must be closed by MAX_UNTIL or
                   MIN_UNTIL */
                GET_OP;
                if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
                    FAIL;
            }
            break;

        case SRE_OP_GROUPREF:
        case SRE_OP_GROUPREF_IGNORE:
            GET_ARG;
            if (arg >= (size_t)groups)
                FAIL;
            break;

        case SRE_OP_GROUPREF_EXISTS:
            /* The regex syntax for this is: '(?(group)then|else)', where
               'group' is either an integer group number or a group name,
               'then' and 'else' are sub-regexes, and 'else' is optional. */
            GET_ARG;
            if (arg >= (size_t)groups)
                FAIL;
            GET_SKIP_ADJ(1);
            code--; /* The skip is relative to the first arg! */
            /* There are two possibilities here: if there is both a 'then'
               part and an 'else' part, the generated code looks like:

               GROUPREF_EXISTS
               <group>
               <skipyes>
               ...then part...
               JUMP
               <skipno>
               (<skipyes> jumps here)
               ...else part...
               (<skipno> jumps here)

               If there is only a 'then' part, it looks like:

               GROUPREF_EXISTS
               <group>
               <skip>
               ...then part...
               (<skip> jumps here)

               There is no direct way to decide which it is, and we don't want
               to allow arbitrary jumps anywhere in the code; so we just look
               for a JUMP opcode preceding our skip target.
            */
            if (skip >= 3 && skip-3 < (Py_uintptr_t)(end - code) &&
                code[skip-3] == SRE_OP_JUMP)
            {
                VTRACE(("both then and else parts present\n"));
                if (!_validate_inner(code+1, code+skip-3, groups))
                    FAIL;
                code += skip-2; /* Position after JUMP, at <skipno> */
                GET_SKIP;
                if (!_validate_inner(code, code+skip-1, groups))
                    FAIL;
                code += skip-1;
            }
            else {
                VTRACE(("only a then part present\n"));
                if (!_validate_inner(code+1, code+skip-1, groups))
                    FAIL;
                code += skip-1;
            }
            break;

        case SRE_OP_ASSERT:
        case SRE_OP_ASSERT_NOT:
            GET_SKIP;
            GET_ARG; /* 0 for lookahead, width for lookbehind */
            code--; /* Back up over arg to simplify math below */
            if (arg & 0x80000000)
                FAIL; /* Width too large */
            /* Stop 1 before the end; we check the SUCCESS below */
            if (!_validate_inner(code+1, code+skip-2, groups))
                FAIL;
            code += skip-2;
            GET_OP;
            if (op != SRE_OP_SUCCESS)
                FAIL;
            break;

        default:
            FAIL;

        }
    }

    VTRACE(("okay\n"));
    return 1;
}
3150
3151static int
3152_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3153{
3154 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3155 FAIL;
3156 if (groups == 0) /* fix for simplejson */
3157 groups = 100; /* 100 groups should always be safe */
3158 return _validate_inner(code, end-1, groups);
3159}
3160
3161static int
3162_validate(PatternObject *self)
3163{
3164 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3165 {
3166 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3167 return 0;
3168 }
3169 else
3170 VTRACE(("Success!\n"));
3171 return 1;
3172}
3173
3174/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003175/* match methods */
3176
3177static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003178match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003179{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003180 Py_XDECREF(self->regs);
3181 Py_XDECREF(self->string);
3182 Py_DECREF(self->pattern);
3183 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003184}
3185
3186static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003187match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003188{
Serhiy Storchaka25324972013-10-16 12:46:28 +03003189 Py_ssize_t length;
3190 int logical_charsize, charsize;
3191 Py_buffer view;
3192 PyObject *result;
3193 void* ptr;
3194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003195 if (index < 0 || index >= self->groups) {
3196 /* raise IndexError if we were given a bad group number */
3197 PyErr_SetString(
3198 PyExc_IndexError,
3199 "no such group"
3200 );
3201 return NULL;
3202 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003203
Fredrik Lundh6f013982000-07-03 18:44:21 +00003204 index *= 2;
3205
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003206 if (self->string == Py_None || self->mark[index] < 0) {
3207 /* return default value if the string or group is undefined */
3208 Py_INCREF(def);
3209 return def;
3210 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003211
Serhiy Storchaka25324972013-10-16 12:46:28 +03003212 ptr = getstring(self->string, &length, &logical_charsize, &charsize, &view);
3213 if (ptr == NULL)
3214 return NULL;
3215 result = getslice(logical_charsize, ptr,
3216 self->string, self->mark[index], self->mark[index+1]);
3217 if (logical_charsize == 1 && view.buf != NULL)
3218 PyBuffer_Release(&view);
3219 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003220}
3221
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003222static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003223match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003224{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003225 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003226
Guido van Rossumddefaf32007-01-14 03:31:43 +00003227 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003228 /* Default value */
3229 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00003230
Christian Heimes217cfd12007-12-02 14:31:20 +00003231 if (PyLong_Check(index))
3232 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003233
Fredrik Lundh6f013982000-07-03 18:44:21 +00003234 i = -1;
3235
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003236 if (self->pattern->groupindex) {
3237 index = PyObject_GetItem(self->pattern->groupindex, index);
3238 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003239 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003240 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003241 Py_DECREF(index);
3242 } else
3243 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003244 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003245
3246 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003247}
3248
3249static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003250match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003251{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003252 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003253}
3254
3255static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003256match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003257{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003258 /* delegate to Python code */
3259 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003260 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003261 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003262 );
3263}
3264
3265static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003266match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003267{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003268 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003269 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003270
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003271 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003272
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003273 switch (size) {
3274 case 0:
3275 result = match_getslice(self, Py_False, Py_None);
3276 break;
3277 case 1:
3278 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3279 break;
3280 default:
3281 /* fetch multiple items */
3282 result = PyTuple_New(size);
3283 if (!result)
3284 return NULL;
3285 for (i = 0; i < size; i++) {
3286 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003287 self, PyTuple_GET_ITEM(args, i), Py_None
3288 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003289 if (!item) {
3290 Py_DECREF(result);
3291 return NULL;
3292 }
3293 PyTuple_SET_ITEM(result, i, item);
3294 }
3295 break;
3296 }
3297 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003298}
3299
3300static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003301match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003302{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003303 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003304 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003305
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003306 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003307 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003308 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003309 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003310
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003311 result = PyTuple_New(self->groups-1);
3312 if (!result)
3313 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003315 for (index = 1; index < self->groups; index++) {
3316 PyObject* item;
3317 item = match_getslice_by_index(self, index, def);
3318 if (!item) {
3319 Py_DECREF(result);
3320 return NULL;
3321 }
3322 PyTuple_SET_ITEM(result, index-1, item);
3323 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003324
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003325 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003326}
3327
3328static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003329match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003330{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003331 PyObject* result;
3332 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003333 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003334
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003335 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003336 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003337 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003338 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003340 result = PyDict_New();
3341 if (!result || !self->pattern->groupindex)
3342 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003344 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003345 if (!keys)
3346 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003348 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003349 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003350 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003351 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003352 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003353 if (!key)
3354 goto failed;
3355 value = match_getslice(self, key, def);
3356 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003357 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003358 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003359 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003360 status = PyDict_SetItem(result, key, value);
3361 Py_DECREF(value);
3362 if (status < 0)
3363 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003364 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003365
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003366 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003367
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003368 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003369
3370failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003371 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003372 Py_DECREF(result);
3373 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003374}
3375
3376static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003377match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003378{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003379 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003380
Guido van Rossumddefaf32007-01-14 03:31:43 +00003381 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003382 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003383 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003384
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003385 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003386
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003387 if (index < 0 || index >= self->groups) {
3388 PyErr_SetString(
3389 PyExc_IndexError,
3390 "no such group"
3391 );
3392 return NULL;
3393 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003394
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003395 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003396 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003397}
3398
3399static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003400match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003401{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003402 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003403
Guido van Rossumddefaf32007-01-14 03:31:43 +00003404 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003405 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003406 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003407
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003408 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003409
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003410 if (index < 0 || index >= self->groups) {
3411 PyErr_SetString(
3412 PyExc_IndexError,
3413 "no such group"
3414 );
3415 return NULL;
3416 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003417
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003418 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003419 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003420}
3421
3422LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003423_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003424{
3425 PyObject* pair;
3426 PyObject* item;
3427
3428 pair = PyTuple_New(2);
3429 if (!pair)
3430 return NULL;
3431
Christian Heimes217cfd12007-12-02 14:31:20 +00003432 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003433 if (!item)
3434 goto error;
3435 PyTuple_SET_ITEM(pair, 0, item);
3436
Christian Heimes217cfd12007-12-02 14:31:20 +00003437 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003438 if (!item)
3439 goto error;
3440 PyTuple_SET_ITEM(pair, 1, item);
3441
3442 return pair;
3443
3444 error:
3445 Py_DECREF(pair);
3446 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003447}
3448
3449static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003450match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003451{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003452 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003453
Guido van Rossumddefaf32007-01-14 03:31:43 +00003454 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003455 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003456 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003457
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003458 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003459
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003460 if (index < 0 || index >= self->groups) {
3461 PyErr_SetString(
3462 PyExc_IndexError,
3463 "no such group"
3464 );
3465 return NULL;
3466 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003467
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003468 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003469 return _pair(self->mark[index*2], self->mark[index*2+1]);
3470}
3471
3472static PyObject*
3473match_regs(MatchObject* self)
3474{
3475 PyObject* regs;
3476 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003477 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003478
3479 regs = PyTuple_New(self->groups);
3480 if (!regs)
3481 return NULL;
3482
3483 for (index = 0; index < self->groups; index++) {
3484 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3485 if (!item) {
3486 Py_DECREF(regs);
3487 return NULL;
3488 }
3489 PyTuple_SET_ITEM(regs, index, item);
3490 }
3491
3492 Py_INCREF(regs);
3493 self->regs = regs;
3494
3495 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003496}
3497
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003498static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003499match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003500{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003501#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003502 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003503 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003504
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003505 slots = 2 * (self->pattern->groups+1);
3506
3507 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3508 if (!copy)
3509 return NULL;
3510
3511 /* this value a constant, but any compiler should be able to
3512 figure that out all by itself */
3513 offset = offsetof(MatchObject, string);
3514
3515 Py_XINCREF(self->pattern);
3516 Py_XINCREF(self->string);
3517 Py_XINCREF(self->regs);
3518
3519 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003520 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003521
3522 return (PyObject*) copy;
3523#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003524 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003525 return NULL;
3526#endif
3527}
3528
3529static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003530match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003531{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003532#ifdef USE_BUILTIN_COPY
3533 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003534
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003535 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003536 if (!copy)
3537 return NULL;
3538
3539 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3540 !deepcopy(&copy->string, memo) ||
3541 !deepcopy(&copy->regs, memo)) {
3542 Py_DECREF(copy);
3543 return NULL;
3544 }
3545
3546#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003547 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3548 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003549#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003550}
3551
/* Docstrings for the match type and its methods.  Each name declared
   here with PyDoc_STRVAR is referenced either by an entry of the
   match_methods table or by the tp_doc slot of Match_Type. */
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003552PyDoc_STRVAR(match_doc,
3553"The result of re.match() and re.search().\n\
3554Match objects always have a boolean value of True.");
3555
3556PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003557"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003558 Return subgroup(s) of the match by indices or names.\n\
3559 For 0 returns the entire match.");
3560
3561PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003562"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003563 Return index of the start of the substring matched by group.");
3564
3565PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003566"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003567 Return index of the end of the substring matched by group.");
3568
3569PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003570"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003571 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3572
3573PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003574"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003575 Return a tuple containing all the subgroups of the match, from 1.\n\
3576 The default argument is used for groups\n\
3577 that did not participate in the match");
3578
3579PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003580"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003581 Return a dictionary containing all the named subgroups of the match,\n\
3582 keyed by the subgroup name. The default argument is used for groups\n\
3583 that did not participate in the match");
3584
3585PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003586"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003587 Return the string obtained by doing backslash substitution\n\
3588 on the string template, as done by the sub() method.");
3589
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003590static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003591 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3592 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3593 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3594 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3595 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3596 match_groups_doc},
3597 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3598 match_groupdict_doc},
3599 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003600 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3601 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003602 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003603};
3604
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003605static PyObject *
3606match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003607{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003608 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003609 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003610 Py_INCREF(Py_None);
3611 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003612}
3613
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003614static PyObject *
3615match_lastgroup_get(MatchObject *self)
3616{
3617 if (self->pattern->indexgroup && self->lastindex >= 0) {
3618 PyObject* result = PySequence_GetItem(
3619 self->pattern->indexgroup, self->lastindex
3620 );
3621 if (result)
3622 return result;
3623 PyErr_Clear();
3624 }
3625 Py_INCREF(Py_None);
3626 return Py_None;
3627}
3628
3629static PyObject *
3630match_regs_get(MatchObject *self)
3631{
3632 if (self->regs) {
3633 Py_INCREF(self->regs);
3634 return self->regs;
3635 } else
3636 return match_regs(self);
3637}
3638
3639static PyGetSetDef match_getset[] = {
3640 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3641 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3642 {"regs", (getter)match_regs_get, (setter)NULL},
3643 {NULL}
3644};
3645
3646#define MATCH_OFF(x) offsetof(MatchObject, x)
3647static PyMemberDef match_members[] = {
3648 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3649 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3650 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3651 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3652 {NULL}
3653};
3654
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3657
Neal Norwitz57c179c2006-03-22 07:18:02 +00003658static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003659 PyVarObject_HEAD_INIT(NULL,0)
3660 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003661 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003662 (destructor)match_dealloc, /* tp_dealloc */
3663 0, /* tp_print */
3664 0, /* tp_getattr */
3665 0, /* tp_setattr */
3666 0, /* tp_reserved */
3667 0, /* tp_repr */
3668 0, /* tp_as_number */
3669 0, /* tp_as_sequence */
3670 0, /* tp_as_mapping */
3671 0, /* tp_hash */
3672 0, /* tp_call */
3673 0, /* tp_str */
3674 0, /* tp_getattro */
3675 0, /* tp_setattro */
3676 0, /* tp_as_buffer */
3677 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003678 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003679 0, /* tp_traverse */
3680 0, /* tp_clear */
3681 0, /* tp_richcompare */
3682 0, /* tp_weaklistoffset */
3683 0, /* tp_iter */
3684 0, /* tp_iternext */
3685 match_methods, /* tp_methods */
3686 match_members, /* tp_members */
3687 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003688};
3689
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003690static PyObject*
3691pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3692{
3693 /* create match object (from state object) */
3694
3695 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003696 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003697 char* base;
3698 int n;
3699
3700 if (status > 0) {
3701
3702 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003703 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003704 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3705 2*(pattern->groups+1));
3706 if (!match)
3707 return NULL;
3708
3709 Py_INCREF(pattern);
3710 match->pattern = pattern;
3711
3712 Py_INCREF(state->string);
3713 match->string = state->string;
3714
3715 match->regs = NULL;
3716 match->groups = pattern->groups+1;
3717
3718 /* fill in group slices */
3719
3720 base = (char*) state->beginning;
3721 n = state->charsize;
3722
3723 match->mark[0] = ((char*) state->start - base) / n;
3724 match->mark[1] = ((char*) state->ptr - base) / n;
3725
3726 for (i = j = 0; i < pattern->groups; i++, j+=2)
3727 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3728 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3729 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3730 } else
3731 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3732
3733 match->pos = state->pos;
3734 match->endpos = state->endpos;
3735
3736 match->lastindex = state->lastindex;
3737
3738 return (PyObject*) match;
3739
3740 } else if (status == 0) {
3741
3742 /* no match */
3743 Py_INCREF(Py_None);
3744 return Py_None;
3745
3746 }
3747
3748 /* internal error */
3749 pattern_error(status);
3750 return NULL;
3751}
3752
3753
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003754/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003755/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003756
3757static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003758scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003759{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003760 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003761 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003762 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003763}
3764
3765static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003766scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003767{
3768 SRE_STATE* state = &self->state;
3769 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01003770 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003771
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003772 state_reset(state);
3773
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003774 state->ptr = state->start;
3775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 if (state->logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003777 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003778 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003779 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003780 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003781 if (PyErr_Occurred())
3782 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003783
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003784 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003785 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003786
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003787 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003788 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003789 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003790 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003791
3792 return match;
3793}
3794
3795
3796static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003797scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003798{
3799 SRE_STATE* state = &self->state;
3800 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01003801 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003802
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003803 state_reset(state);
3804
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003805 state->ptr = state->start;
3806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003807 if (state->logical_charsize == 1) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003808 status = sre_search(state, PatternObject_GetCode(self->pattern));
3809 } else {
3810 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3811 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003812 if (PyErr_Occurred())
3813 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003814
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003815 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003816 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003817
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003818 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003819 state->start = (void*) ((char*) state->ptr + state->charsize);
3820 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003821 state->start = state->ptr;
3822
3823 return match;
3824}
3825
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003826static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003827 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3828 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003829 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003830};
3831
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003832#define SCAN_OFF(x) offsetof(ScannerObject, x)
3833static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03003834 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003835 {NULL} /* Sentinel */
3836};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003837
Neal Norwitz57c179c2006-03-22 07:18:02 +00003838static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003839 PyVarObject_HEAD_INIT(NULL, 0)
3840 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003841 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003842 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003843 0, /* tp_print */
3844 0, /* tp_getattr */
3845 0, /* tp_setattr */
3846 0, /* tp_reserved */
3847 0, /* tp_repr */
3848 0, /* tp_as_number */
3849 0, /* tp_as_sequence */
3850 0, /* tp_as_mapping */
3851 0, /* tp_hash */
3852 0, /* tp_call */
3853 0, /* tp_str */
3854 0, /* tp_getattro */
3855 0, /* tp_setattro */
3856 0, /* tp_as_buffer */
3857 Py_TPFLAGS_DEFAULT, /* tp_flags */
3858 0, /* tp_doc */
3859 0, /* tp_traverse */
3860 0, /* tp_clear */
3861 0, /* tp_richcompare */
3862 0, /* tp_weaklistoffset */
3863 0, /* tp_iter */
3864 0, /* tp_iternext */
3865 scanner_methods, /* tp_methods */
3866 scanner_members, /* tp_members */
3867 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003868};
3869
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003870static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003871pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003872{
3873 /* create search state object */
3874
3875 ScannerObject* self;
3876
3877 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003878 Py_ssize_t start = 0;
3879 Py_ssize_t end = PY_SSIZE_T_MAX;
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003880 static char* kwlist[] = { "source", "pos", "endpos", NULL };
3881 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:scanner", kwlist,
3882 &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003883 return NULL;
3884
3885 /* create scanner object */
3886 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3887 if (!self)
3888 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003889 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003890
3891 string = state_init(&self->state, pattern, string, start, end);
3892 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003893 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003894 return NULL;
3895 }
3896
3897 Py_INCREF(pattern);
3898 self->pattern = (PyObject*) pattern;
3899
3900 return (PyObject*) self;
3901}
3902
Guido van Rossumb700df92000-03-31 14:59:30 +00003903static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003904 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003905 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003906 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003907 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003908};
3909
Martin v. Löwis1a214512008-06-11 05:26:20 +00003910static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003911 PyModuleDef_HEAD_INIT,
3912 "_" SRE_MODULE,
3913 NULL,
3914 -1,
3915 _functions,
3916 NULL,
3917 NULL,
3918 NULL,
3919 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00003920};
3921
3922PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003923{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003924 PyObject* m;
3925 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003926 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003927
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00003928 /* Patch object types */
3929 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3930 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00003931 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003932
Martin v. Löwis1a214512008-06-11 05:26:20 +00003933 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003934 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003935 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003936 d = PyModule_GetDict(m);
3937
Christian Heimes217cfd12007-12-02 14:31:20 +00003938 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003939 if (x) {
3940 PyDict_SetItemString(d, "MAGIC", x);
3941 Py_DECREF(x);
3942 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003943
Christian Heimes217cfd12007-12-02 14:31:20 +00003944 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003945 if (x) {
3946 PyDict_SetItemString(d, "CODESIZE", x);
3947 Py_DECREF(x);
3948 }
3949
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003950 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
3951 if (x) {
3952 PyDict_SetItemString(d, "MAXREPEAT", x);
3953 Py_DECREF(x);
3954 }
3955
Neal Norwitzfe537132007-08-26 03:55:15 +00003956 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003957 if (x) {
3958 PyDict_SetItemString(d, "copyright", x);
3959 Py_DECREF(x);
3960 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003961 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003962}
3963
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003964#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003965
3966/* vim:ts=4:sw=4:et
3967*/