blob: 99c3cd5c05f9cc8dc13d9bdfa4f44f1368917f82 [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl  created (based on existing template matcher code)
 * 2000-03-06 fl  first alpha, sort of
 * 2000-08-01 fl  fixes for 1.6b1
 * 2000-08-07 fl  use PyOS_CheckStack() if available
 * 2000-09-20 fl  added expand method
 * 2001-03-20 fl  lots of fixes for 2.1b2
 * 2001-04-15 fl  export copyright as Python attribute, not global
 * 2001-04-28 fl  added __copy__ methods (work in progress)
 * 2001-05-14 fl  fixes for 1.5.2 compatibility
 * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl  added sub/subn primitive
 * 2001-10-24 fl  added finditer primitive (for 2.2 only)
 * 2001-12-07 fl  fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl  fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 * 2003-10-17 gn  implemented non recursive scheme
 *
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license.  For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
36
37#ifndef SRE_RECURSIVE
38
/* version / copyright banner for the engine (a writable char array, not
   a pointer to a literal) */
static char copyright[] = " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
/* module name without its leading underscore; a build may predefine
   SRE_MODULE to override the default */
#ifndef SRE_MODULE
#define SRE_MODULE "sre"
#endif

#define SRE_PY_MODULE "re"

/* tracing is off; define VERBOSE instead of undefining it to enable */
#undef VERBOSE

/* unicode support switch (default under 1.6a1 and later) */
#define HAVE_UNICODE

/* -------------------------------------------------------------------- */
/* optional features */

/* fast searching: enabled */
#define USE_FAST_SEARCH

/* copy/deepcopy handling (work in progress): disabled */
#undef USE_BUILTIN_COPY
72
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000073/* -------------------------------------------------------------------- */
74
/* LOCAL(type) introduces a file-local function returning `type`, using
   the cheapest linkage / calling convention the compiler offers */
#ifdef _MSC_VER
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif
85
/* negative status codes used by the engine to report failures */
#define SRE_ERROR_ILLEGAL         (-1)  /* illegal opcode */
#define SRE_ERROR_STATE           (-2)  /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT (-3)  /* runaway recursion */
#define SRE_ERROR_MEMORY          (-9)  /* out of memory */
#define SRE_ERROR_INTERRUPTED     (-10) /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000092
/* TRACE((fmt, ...)) forwards its parenthesised argument list to printf
   when VERBOSE is defined, and expands to nothing otherwise */
#ifdef VERBOSE
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
98
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000099/* -------------------------------------------------------------------- */
100/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000101
/* default character predicates (run sre_chars.py to regenerate tables) */

/* one bit per character class; OR-ed together in the sre_char_info table */
#define SRE_DIGIT_MASK     (1 << 0)
#define SRE_SPACE_MASK     (1 << 1)
#define SRE_LINEBREAK_MASK (1 << 2)
#define SRE_ALNUM_MASK     (1 << 3)
#define SRE_WORD_MASK      (1 << 4)
109
/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* class bits for code points 0..127, sixteen entries per row:
   2 = space, 6 = space|linebreak, 16 = word, 24 = alnum|word,
   25 = digit|alnum|word */
static char sre_char_info[128] = {
    /*   0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /*  16 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  32 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  48 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /*  64 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /*  96 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};
119
/* lower-case mapping for code points 0..127, sixteen entries per row:
   65..90 ('A'..'Z') map to 97..122, every other entry maps to itself */
static char sre_char_lower[128] = {
    /*   0 */   0,   1,   2,   3,   4,   5,   6,   7,
                8,   9,  10,  11,  12,  13,  14,  15,
    /*  16 */  16,  17,  18,  19,  20,  21,  22,  23,
               24,  25,  26,  27,  28,  29,  30,  31,
    /*  32 */  32,  33,  34,  35,  36,  37,  38,  39,
               40,  41,  42,  43,  44,  45,  46,  47,
    /*  48 */  48,  49,  50,  51,  52,  53,  54,  55,
               56,  57,  58,  59,  60,  61,  62,  63,
    /*  64 */  64,  97,  98,  99, 100, 101, 102, 103,
              104, 105, 106, 107, 108, 109, 110, 111,
    /*  80 */ 112, 113, 114, 115, 116, 117, 118, 119,
              120, 121, 122,  91,  92,  93,  94,  95,
    /*  96 */  96,  97,  98,  99, 100, 101, 102, 103,
              104, 105, 106, 107, 108, 109, 110, 111,
    /* 112 */ 112, 113, 114, 115, 116, 117, 118, 119,
              120, 121, 122, 123, 124, 125, 126, 127
};
129
/* table-driven predicates: values of 128 and above never match; smaller
   values yield the masked sre_char_info entry (zero, or the mask bit
   itself -- not normalised to 1) */
#define SRE_IS_DIGIT(ch)\
    ((ch) >= 128 ? 0 : (sre_char_info[(ch)] & SRE_DIGIT_MASK))
#define SRE_IS_SPACE(ch)\
    ((ch) >= 128 ? 0 : (sre_char_info[(ch)] & SRE_SPACE_MASK))
#define SRE_IS_LINEBREAK(ch)\
    ((ch) >= 128 ? 0 : (sre_char_info[(ch)] & SRE_LINEBREAK_MASK))
#define SRE_IS_ALNUM(ch)\
    ((ch) >= 128 ? 0 : (sre_char_info[(ch)] & SRE_ALNUM_MASK))
#define SRE_IS_WORD(ch)\
    ((ch) >= 128 ? 0 : (sre_char_info[(ch)] & SRE_WORD_MASK))
Guido van Rossumb700df92000-03-31 14:59:30 +0000140
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000141static unsigned int sre_lower(unsigned int ch)
142{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000143 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000144}
145
/* locale-specific character predicates */
/* ((c & ~N) == 0) is the same test as (c < N+1) for any unsigned c;
 * spelling it with a mask avoids warnings when c's type supports only
 * numbers < N+1 */
#define SRE_LOC_IS_DIGIT(ch) ((((ch) & ~255) == 0) ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((((ch) & ~255) == 0) ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((((ch) & ~255) == 0) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) ((ch) == '_' || SRE_LOC_IS_ALNUM((ch)))
154
/* Lower-case `ch` with the C library's tolower(); values of 256 and
   above are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int)tolower((int)ch);
}
159
/* unicode-specific character predicates: thin aliases for the
   Py_UNICODE_* classification macros */

#define SRE_UNI_IS_DIGIT(ch)     Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch)     Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch)     Py_UNICODE_ISALNUM(ch)
/* a word character is an underscore or anything alphanumeric */
#define SRE_UNI_IS_WORD(ch)      ((ch) == '_' || SRE_UNI_IS_ALNUM(ch))
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000167
/* Lower-case `ch` by delegating to Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);

    return lowered;
}
172
Guido van Rossumb700df92000-03-31 14:59:30 +0000173LOCAL(int)
174sre_category(SRE_CODE category, unsigned int ch)
175{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000176 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000177
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000178 case SRE_CATEGORY_DIGIT:
179 return SRE_IS_DIGIT(ch);
180 case SRE_CATEGORY_NOT_DIGIT:
181 return !SRE_IS_DIGIT(ch);
182 case SRE_CATEGORY_SPACE:
183 return SRE_IS_SPACE(ch);
184 case SRE_CATEGORY_NOT_SPACE:
185 return !SRE_IS_SPACE(ch);
186 case SRE_CATEGORY_WORD:
187 return SRE_IS_WORD(ch);
188 case SRE_CATEGORY_NOT_WORD:
189 return !SRE_IS_WORD(ch);
190 case SRE_CATEGORY_LINEBREAK:
191 return SRE_IS_LINEBREAK(ch);
192 case SRE_CATEGORY_NOT_LINEBREAK:
193 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000195 case SRE_CATEGORY_LOC_WORD:
196 return SRE_LOC_IS_WORD(ch);
197 case SRE_CATEGORY_LOC_NOT_WORD:
198 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000199
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000200 case SRE_CATEGORY_UNI_DIGIT:
201 return SRE_UNI_IS_DIGIT(ch);
202 case SRE_CATEGORY_UNI_NOT_DIGIT:
203 return !SRE_UNI_IS_DIGIT(ch);
204 case SRE_CATEGORY_UNI_SPACE:
205 return SRE_UNI_IS_SPACE(ch);
206 case SRE_CATEGORY_UNI_NOT_SPACE:
207 return !SRE_UNI_IS_SPACE(ch);
208 case SRE_CATEGORY_UNI_WORD:
209 return SRE_UNI_IS_WORD(ch);
210 case SRE_CATEGORY_UNI_NOT_WORD:
211 return !SRE_UNI_IS_WORD(ch);
212 case SRE_CATEGORY_UNI_LINEBREAK:
213 return SRE_UNI_IS_LINEBREAK(ch);
214 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
215 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000216 }
217 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000218}
219
220/* helpers */
221
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000222static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000223data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000224{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000225 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000226 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000227 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000228 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000229 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000230}
231
232static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000233data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000234{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000235 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000236 minsize = state->data_stack_base+size;
237 cursize = state->data_stack_size;
238 if (cursize < minsize) {
239 void* stack;
240 cursize = minsize+minsize/4+1024;
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300241 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000243 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000244 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000245 return SRE_ERROR_MEMORY;
246 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000247 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000248 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000249 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000250 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000251}
252
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000253/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000254
255#define SRE_CHAR unsigned char
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200256#define SRE_CHARGET(state, buf, index) ((unsigned char*)buf)[index]
Guido van Rossumb700df92000-03-31 14:59:30 +0000257#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000258#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000259#define SRE_CHARSET sre_charset
260#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000261#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000262#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000263#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000264
Guido van Rossumb700df92000-03-31 14:59:30 +0000265#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000266#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000267#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000268
Guido van Rossumb700df92000-03-31 14:59:30 +0000269#undef SRE_SEARCH
270#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000271#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000272#undef SRE_INFO
273#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000274#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000275#undef SRE_AT
276#undef SRE_CHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200277#undef SRE_CHARGET
Guido van Rossumb700df92000-03-31 14:59:30 +0000278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279/* generate 8/16/32-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281#define SRE_CHAR void
282#define SRE_CHARGET(state, buf, index) \
283 ((state->charsize==1) ? ((Py_UCS1*)buf)[index] : \
284 (state->charsize==2) ? ((Py_UCS2*)buf)[index] : \
285 ((Py_UCS4*)buf)[index])
Guido van Rossumb700df92000-03-31 14:59:30 +0000286#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000287#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000288#define SRE_CHARSET sre_ucharset
289#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000290#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000291#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000292#define SRE_SEARCH sre_usearch
293
294#endif /* SRE_RECURSIVE */
295
/* -------------------------------------------------------------------- */
/* String matching engine */

/* the following section is compiled twice, with different character
   settings */
301
302LOCAL(int)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200303SRE_AT(SRE_STATE* state, char* ptr, SRE_CODE at)
Guido van Rossumb700df92000-03-31 14:59:30 +0000304{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000305 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000306
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000307 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000308
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000309 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000310
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000311 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000312 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000313 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000315 case SRE_AT_BEGINNING_LINE:
316 return ((void*) ptr == state->beginning ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200317 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, -1)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000319 case SRE_AT_END:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 return (((void*) (ptr+state->charsize) == state->end &&
321 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))) ||
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000322 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000323
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000324 case SRE_AT_END_LINE:
325 return ((void*) ptr == state->end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200326 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000327
Fredrik Lundh770617b2001-01-14 15:06:11 +0000328 case SRE_AT_END_STRING:
329 return ((void*) ptr == state->end);
330
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000331 case SRE_AT_BOUNDARY:
332 if (state->beginning == state->end)
333 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000334 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200335 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000336 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200337 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000338 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 case SRE_AT_NON_BOUNDARY:
341 if (state->beginning == state->end)
342 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000343 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200344 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000345 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000347 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000348
349 case SRE_AT_LOC_BOUNDARY:
350 if (state->beginning == state->end)
351 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000352 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200353 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000354 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200355 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000357
358 case SRE_AT_LOC_NON_BOUNDARY:
359 if (state->beginning == state->end)
360 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000363 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200364 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000366
367 case SRE_AT_UNI_BOUNDARY:
368 if (state->beginning == state->end)
369 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000370 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200371 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200373 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000375
376 case SRE_AT_UNI_NON_BOUNDARY:
377 if (state->beginning == state->end)
378 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200382 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000384
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000385 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000386
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000387 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000388}
389
390LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000391SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000392{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000393 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000394
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000395 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000396
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000397 for (;;) {
398 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000399
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000400 case SRE_OP_FAILURE:
401 return !ok;
402
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000403 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000404 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000405 if (ch == set[0])
406 return ok;
407 set++;
408 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000409
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000410 case SRE_OP_CATEGORY:
411 /* <CATEGORY> <code> */
412 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000413 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000414 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000415 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000416
Fredrik Lundh3562f112000-07-02 12:00:07 +0000417 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000418 if (sizeof(SRE_CODE) == 2) {
419 /* <CHARSET> <bitmap> (16 bits per code word) */
420 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
421 return ok;
422 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000423 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000424 else {
425 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith90555d02012-12-10 17:44:44 -0800426 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000427 return ok;
428 set += 8;
429 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000430 break;
431
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000432 case SRE_OP_RANGE:
433 /* <RANGE> <lower> <upper> */
434 if (set[0] <= ch && ch <= set[1])
435 return ok;
436 set += 2;
437 break;
438
439 case SRE_OP_NEGATE:
440 ok = !ok;
441 break;
442
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000443 case SRE_OP_BIGCHARSET:
444 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
445 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000446 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000447 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000448
449 if (sizeof(SRE_CODE) == 2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000451 set += 128;
452 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
453 return ok;
454 set += count*16;
455 }
456 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000457 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
458 * warnings when c's type supports only numbers < N+1 */
459 if (!(ch & ~65535))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200460 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000461 else
462 block = -1;
463 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000464 if (block >=0 &&
Gregory P. Smith90555d02012-12-10 17:44:44 -0800465 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000466 return ok;
467 set += count*8;
468 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000469 break;
470 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000471
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000472 default:
473 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000474 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000475 return 0;
476 }
477 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000478}
479
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000480LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000481
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000482LOCAL(Py_ssize_t)
483SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000484{
485 SRE_CODE chr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200486 char* ptr = (char *)state->ptr;
487 char* end = (char *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000488 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000489
490 /* adjust end */
Serhiy Storchakaa0eb8092013-02-16 16:54:33 +0200491 if (maxcount < (end - ptr) / state->charsize && maxcount != SRE_MAXREPEAT)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200492 end = ptr + maxcount*state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000493
494 switch (pattern[0]) {
495
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000496 case SRE_OP_IN:
497 /* repeated set */
498 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
Victor Stinner63ab8752011-11-22 03:31:20 +0100499 while (ptr < end &&
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200500 SRE_CHARSET(pattern + 2, SRE_CHARGET(state, ptr, 0)))
501 ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000502 break;
503
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000504 case SRE_OP_ANY:
505 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000506 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200507 while (ptr < end && !SRE_IS_LINEBREAK(SRE_CHARGET(state, ptr, 0)))
508 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000509 break;
510
511 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000512 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000514 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000515 ptr = end;
516 break;
517
518 case SRE_OP_LITERAL:
519 /* repeated literal */
520 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000521 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200522 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) == chr)
523 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000524 break;
525
526 case SRE_OP_LITERAL_IGNORE:
527 /* repeated literal */
528 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000529 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) == chr)
531 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000532 break;
533
534 case SRE_OP_NOT_LITERAL:
535 /* repeated non-literal */
536 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000537 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
539 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000540 break;
Tim Peters3d563502006-01-21 02:47:53 +0000541
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000542 case SRE_OP_NOT_LITERAL_IGNORE:
543 /* repeated non-literal */
544 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000545 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) != chr)
547 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000548 break;
549
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000550 default:
551 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000552 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553 while ((char*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000554 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000555 if (i < 0)
556 return i;
557 if (!i)
558 break;
559 }
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300560 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561 ((char*)state->ptr - ptr)/state->charsize));
562 return ((char*)state->ptr - ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000563 }
564
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300565 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
566 (ptr - (char*) state->ptr)/state->charsize));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200567 return (ptr - (char*) state->ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000568}
569
#if 0 /* not used in this release */
/* Check whether the SRE_OP_INFO block at `pattern` matches at the
   current position (state->ptr).  Returns the number of SRE_CODE
   objects to skip when it does, and 0 when it does not. */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    char* limit = state->end;
    char* cursor = state->ptr;
    Py_ssize_t k;

    /* minimal length: fewer characters remain than pattern[3] asks for */
    if (pattern[3] && (limit - cursor)/state->charsize < pattern[3])
        return 0;

    /* no known prefix to verify: only the block itself is skipped */
    if (!(pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1))
        return pattern[0];

    /* known prefix layout: <length> <skip> <prefix data> <overlap data> */
    k = 0;
    while (k < pattern[5]) {
        if ((SRE_CODE) SRE_CHARGET(state, cursor, k) != pattern[7 + k])
            return 0;
        k++;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000597
/* LASTMARK_SAVE() / LASTMARK_RESTORE() protect recursive SRE_MATCH()
 * calls that *fail* without the caller returning immediately (in other
 * words, calls after which the caller backtracks).  The cases are:
 *
 * - The recursive SRE_MATCH() returned true: that is normally a
 *   success (atypical cases such as ASSERT_NOT aside), so there is no
 *   reason to restore lastmark.
 *
 * - The recursive SRE_MATCH() returned false and the current
 *   SRE_MATCH() is returning to its caller: if the current call is the
 *   top of the recursion, returning false is a matching failure and it
 *   does not matter where lastmark points.  If it is *not* the top, the
 *   current call is itself a failed recursive SRE_MATCH(), and its
 *   caller deals with that failure by these same rules (restoring
 *   lastmark itself if necessary).
 *
 * - The recursive SRE_MATCH() returned false and execution continues
 *   in the outer 'for' loop: protection is needed when breaking,
 *   because the next OP could depend on lastmark.
 *
 * - The recursive SRE_MATCH() returned false and will be called again
 *   from a local for/while loop: protection is needed between loop
 *   iterations, because the recursive SRE_MATCH() could do anything
 *   and could depend on lastmark.
 *
 * For more information, see the discussion at SF patch #712900.
 */

/* copy state->lastmark / state->lastindex into the current context */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastindex = state->lastindex; \
        ctx->lastmark = state->lastmark; \
    } while (0)

/* put the values saved in the current context back into the state */
#define LASTMARK_RESTORE() \
    do { \
        state->lastindex = ctx->lastindex; \
        state->lastmark = ctx->lastmark; \
    } while (0)
636
/* Result helpers.  They expect the enclosing function to provide a
   local variable `ret` and an `exit:` label.

   RETURN_ERROR(i)      return i from the enclosing function at once
   RETURN_FAILURE       set ret = 0 and jump to exit
   RETURN_SUCCESS       set ret = 1 and jump to exit
   RETURN_ON_ERROR(i)   RETURN_ERROR(i) when i is negative
   RETURN_ON_SUCCESS(i) as RETURN_ON_ERROR, then RETURN_SUCCESS when i > 0
   RETURN_ON_FAILURE(i) as RETURN_ON_ERROR, then RETURN_FAILURE when i == 0

   The argument is parenthesized so that expressions such as `a & 2`
   are compared as a whole.  It is evaluated more than once, so it
   must not have side effects. */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
647
/* stringify a macro argument (used for the type name in TRACE output) */
#define SFY(x) #x

/* Reserve sizeof(type) bytes at the top of state's data stack and make
   `ptr` point at them.  The offset of the new area is left in the
   caller's `alloc_pos` variable.

   When the stack is too small, data_stack_grow() is called; a negative
   result is returned straight from the enclosing function.  After a
   grow, `ctx` is looked up again from `ctx_pos` (unless ctx_pos is -1),
   presumably because growing may relocate the stack.

   Expects `alloc_pos`, `ctx` and `ctx_pos` in the enclosing scope. */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           SFY(type), alloc_pos, sizeof(type))); \
    if (sizeof(type) > (state)->data_stack_size - alloc_pos) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    ptr = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)

/* Point `ptr` at the object of the given type stored at byte offset
   `pos` of state's data stack.  `pos` is parenthesized so that any
   expression may be passed as the offset. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
    ptr = (type*)((state)->data_stack+(pos)); \
} while (0)
671
/* Copy `size` bytes from `data` onto the top of state's data stack.

   When the stack is too small, data_stack_grow() is called; a negative
   result is returned straight from the enclosing function.  After a
   grow, `ctx` is looked up again from `ctx_pos` (unless ctx_pos is -1).
   Expects `ctx` and `ctx_pos` in the enclosing scope.

   In this and the following macros `size` is parenthesized so that a
   size written as an expression (e.g. `a + b`) is applied as a whole;
   it is evaluated several times and must not have side effects. */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           data, (state)->data_stack_base, size)); \
    if ((size) > (state)->data_stack_size - (state)->data_stack_base) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)

/* Copy the topmost `size` bytes of the data stack into `data`; the
   bytes are removed from the stack only when `discard` is true. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           data, (state)->data_stack_base-(size), size)); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* Remove the topmost `size` bytes of the data stack without copying. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           (state)->data_stack_base-(size), size)); \
    (state)->data_stack_base -= (size); \
} while(0)

/* Shorthands that operate on the local `state`; for the first three
   the size is taken from the pointed-to object. */
#define DATA_PUSH(x) \
    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
    DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
    DATA_STACK_LOOKUP_AT(state,t,p,pos)
715
/* Save / restore the mark array (state->mark[0..lastmark]) through the
   data stack.  Every macro is a no-op unless lastmark > 0.

   MARK_PUSH          copy the marks onto the data stack (uses local `i`)
   MARK_POP           copy them back and remove them from the stack
   MARK_POP_KEEP      copy them back but leave them on the stack
   MARK_POP_DISCARD   remove them from the stack without copying back */
#define MARK_PUSH(lastmark) \
    do { \
        if (lastmark > 0) { \
            i = lastmark; /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
#define MARK_POP(lastmark) \
    do { \
        if (lastmark > 0) { \
            DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
        } \
    } while (0)
#define MARK_POP_KEEP(lastmark) \
    do { \
        if (lastmark > 0) { \
            DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
        } \
    } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if (lastmark > 0) { \
            DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
        } \
    } while (0)
733
/* Jump codes.  DO_JUMP() stores one of these in the `jump` field of
   the context it creates; JUMP_NONE is what SRE_MATCH() gives the
   initial context. */
#define JUMP_NONE            0
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
748
/* Start matching `nextpattern` in a fresh context: allocate the context
   on the data stack, record the current context's offset and the jump
   code in it, make it current, and go to `entrance`.  `jumplabel` is
   defined right after the goto; presumably it is where execution
   resumes once the new context has finished (the code that dispatches
   on the jump code is not part of this macro).

   This expands to several statements and ends in `while (0)`, so a use
   must be followed by ';' and must not be the unbraced body of an
   if/else or loop.  Expects `ctx`, `nextctx`, `ctx_pos` and `alloc_pos`
   in the enclosing scope. */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->pattern = nextpattern; \
    nextctx->jump = jumpvalue; \
    nextctx->last_ctx_pos = ctx_pos; \
    ctx = nextctx; \
    ctx_pos = alloc_pos; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
759
760typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000761 Py_ssize_t last_ctx_pos;
762 Py_ssize_t jump;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 char* ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000764 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000765 Py_ssize_t count;
766 Py_ssize_t lastmark;
767 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000768 union {
769 SRE_CODE chr;
770 SRE_REPEAT* rep;
771 } u;
772} SRE_MATCH_CONTEXT;
773
774/* check if string matches the given pattern. returns <0 for
775 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000776LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000777SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200779 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000780 Py_ssize_t alloc_pos, ctx_pos = -1;
781 Py_ssize_t i, ret = 0;
782 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000783 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000784
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000785 SRE_MATCH_CONTEXT* ctx;
786 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000787
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000788 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000789
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000790 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
791 ctx->last_ctx_pos = -1;
792 ctx->jump = JUMP_NONE;
793 ctx->pattern = pattern;
794 ctx_pos = alloc_pos;
795
796entrance:
797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200798 ctx->ptr = (char *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000799
800 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000801 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000802 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Victor Stinner1fa174a2013-08-28 02:06:21 +0200803 if (ctx->pattern[3] && (Py_uintptr_t)(end - ctx->ptr)/state->charsize < ctx->pattern[3]) {
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300804 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
805 "need %" PY_FORMAT_SIZE_T "d)\n",
806 (end - ctx->ptr)/state->charsize,
807 (Py_ssize_t) ctx->pattern[3]));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000808 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000809 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000810 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000811 }
812
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000813 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000814 ++sigcount;
815 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
816 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000817
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000818 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000819
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000820 case SRE_OP_MARK:
821 /* set mark */
822 /* <MARK> <gid> */
823 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
824 ctx->ptr, ctx->pattern[0]));
825 i = ctx->pattern[0];
826 if (i & 1)
827 state->lastindex = i/2 + 1;
828 if (i > state->lastmark) {
829 /* state->lastmark is the highest valid index in the
830 state->mark array. If it is increased by more than 1,
831 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000832 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000833 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000834 while (j < i)
835 state->mark[j++] = NULL;
836 state->lastmark = i;
837 }
838 state->mark[i] = ctx->ptr;
839 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000840 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000841
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000842 case SRE_OP_LITERAL:
843 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000844 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000845 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
846 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000848 RETURN_FAILURE;
849 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000851 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000852
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000853 case SRE_OP_NOT_LITERAL:
854 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000855 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000856 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
857 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) == ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000859 RETURN_FAILURE;
860 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000862 break;
863
864 case SRE_OP_SUCCESS:
865 /* end of pattern */
866 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
867 state->ptr = ctx->ptr;
868 RETURN_SUCCESS;
869
870 case SRE_OP_AT:
871 /* match at given position */
872 /* <AT> <code> */
873 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
874 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
875 RETURN_FAILURE;
876 ctx->pattern++;
877 break;
878
879 case SRE_OP_CATEGORY:
880 /* match at given category */
881 /* <CATEGORY> <code> */
882 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
883 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], SRE_CHARGET(state, ctx->ptr, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000885 RETURN_FAILURE;
886 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000888 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000889
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000890 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000891 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000892 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000893 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (ctx->ptr >= end || SRE_IS_LINEBREAK(SRE_CHARGET(state, ctx->ptr, 0)))
895 RETURN_FAILURE;
896 ctx->ptr += state->charsize;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000897 break;
898
899 case SRE_OP_ANY_ALL:
900 /* match anything */
901 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000902 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
903 if (ctx->ptr >= end)
904 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000906 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000907
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000908 case SRE_OP_IN:
909 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000910 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000911 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, SRE_CHARGET(state, ctx->ptr, 0)))
913 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000914 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000916 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000918 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000919 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
920 ctx->pattern, ctx->ptr, ctx->pattern[0]));
921 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000923 RETURN_FAILURE;
924 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200925 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000926 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000927
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000928 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000929 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
930 ctx->pattern, ctx->ptr, *ctx->pattern));
931 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200932 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) == state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000933 RETURN_FAILURE;
934 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000936 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000937
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000938 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000939 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
940 if (ctx->ptr >= end
941 || !SRE_CHARSET(ctx->pattern+1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200942 (SRE_CODE)state->lower(SRE_CHARGET(state, ctx->ptr, 0))))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000943 RETURN_FAILURE;
944 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000946 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000947
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000948 case SRE_OP_JUMP:
949 case SRE_OP_INFO:
950 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000951 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000952 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
953 ctx->ptr, ctx->pattern[0]));
954 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000955 break;
956
957 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000958 /* alternation */
959 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000960 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000961 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000962 ctx->u.rep = state->repeat;
963 if (ctx->u.rep)
964 MARK_PUSH(ctx->lastmark);
965 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
966 if (ctx->pattern[1] == SRE_OP_LITERAL &&
967 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000969 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000970 if (ctx->pattern[1] == SRE_OP_IN &&
971 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0))))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000973 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000974 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000975 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000976 if (ret) {
977 if (ctx->u.rep)
978 MARK_POP_DISCARD(ctx->lastmark);
979 RETURN_ON_ERROR(ret);
980 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000981 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000982 if (ctx->u.rep)
983 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000984 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000985 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000986 if (ctx->u.rep)
987 MARK_POP_DISCARD(ctx->lastmark);
988 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +0000989
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000990 case SRE_OP_REPEAT_ONE:
991 /* match repeated sequence (maximizing regexp) */
992
993 /* this operator only works if the repeated item is
994 exactly one character wide, and we're not already
995 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +0000996 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000997
998 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
999
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001000 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1001 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001002
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001003 if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001004 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001005
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001006 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001007
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001008 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1009 RETURN_ON_ERROR(ret);
1010 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1011 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001012 ctx->ptr += state->charsize * ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001013
1014 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001015 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001016 string. check if the rest of the pattern matches,
1017 and backtrack if not. */
1018
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001019 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001020 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001021
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001022 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001023 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 state->ptr = ctx->ptr;
1025 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001026 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001027
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001028 LASTMARK_SAVE();
1029
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001030 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031 /* tail starts with a literal. skip positions where
1032 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001033 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001034 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001035 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Victor Stinner63ab8752011-11-22 03:31:20 +01001036 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 SRE_CHARGET(state, ctx->ptr, 0) != ctx->u.chr)) {
1038 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001039 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001040 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001041 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001044 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1045 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001046 if (ret) {
1047 RETURN_ON_ERROR(ret);
1048 RETURN_SUCCESS;
1049 }
Tim Peters3d563502006-01-21 02:47:53 +00001050
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001051 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055 }
1056
1057 } else {
1058 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001059 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001060 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1062 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 if (ret) {
1064 RETURN_ON_ERROR(ret);
1065 RETURN_SUCCESS;
1066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001068 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001069 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001070 }
1071 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001072 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001073
Guido van Rossum41c99e72003-04-14 17:59:34 +00001074 case SRE_OP_MIN_REPEAT_ONE:
1075 /* match repeated sequence (minimizing regexp) */
1076
1077 /* this operator only works if the repeated item is
1078 exactly one character wide, and we're not already
1079 collecting backtracking points. for other cases,
1080 use the MIN_REPEAT operator */
1081
1082 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1083
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001084 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1085 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001086
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001087 if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001088 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001089
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001090 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001091
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001092 if (ctx->pattern[1] == 0)
1093 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001094 else {
1095 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001096 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1097 RETURN_ON_ERROR(ret);
1098 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001099 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001100 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001101 RETURN_FAILURE;
1102 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001103 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 ctx->ptr += state->charsize * ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001105 }
1106
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001107 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001108 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001109 state->ptr = ctx->ptr;
1110 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001111
1112 } else {
1113 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001114 LASTMARK_SAVE();
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001115 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001116 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001117 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001118 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1119 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001120 if (ret) {
1121 RETURN_ON_ERROR(ret);
1122 RETURN_SUCCESS;
1123 }
1124 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001125 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001126 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001127 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001128 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001129 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001130 assert(ret == 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001132 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001133 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001134 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001135 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001137
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001138 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001139 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001140 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001141 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001142 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1143 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001144
1145 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001146 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001147 if (!ctx->u.rep) {
1148 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001149 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001150 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001151 ctx->u.rep->count = -1;
1152 ctx->u.rep->pattern = ctx->pattern;
1153 ctx->u.rep->prev = state->repeat;
1154 ctx->u.rep->last_ptr = NULL;
1155 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001156
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001157 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001158 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001159 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001160 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001161
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001162 if (ret) {
1163 RETURN_ON_ERROR(ret);
1164 RETURN_SUCCESS;
1165 }
1166 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001167
1168 case SRE_OP_MAX_UNTIL:
1169 /* maximizing repeat */
1170 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1171
1172 /* FIXME: we probably need to deal with zero-width
1173 matches in here... */
1174
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001175 ctx->u.rep = state->repeat;
1176 if (!ctx->u.rep)
1177 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001178
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001179 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001180
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001181 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001182
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001183 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001184 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001185
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001186 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001187 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001188 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001189 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1190 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001191 if (ret) {
1192 RETURN_ON_ERROR(ret);
1193 RETURN_SUCCESS;
1194 }
1195 ctx->u.rep->count = ctx->count-1;
1196 state->ptr = ctx->ptr;
1197 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001198 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001199
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001200 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001201 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203 /* we may have enough matches, but if we can
1204 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001205 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001206 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001207 MARK_PUSH(ctx->lastmark);
1208 /* zero-width match protection */
1209 DATA_PUSH(&ctx->u.rep->last_ptr);
1210 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1212 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001213 DATA_POP(&ctx->u.rep->last_ptr);
1214 if (ret) {
1215 MARK_POP_DISCARD(ctx->lastmark);
1216 RETURN_ON_ERROR(ret);
1217 RETURN_SUCCESS;
1218 }
1219 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001220 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001221 ctx->u.rep->count = ctx->count-1;
1222 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001223 }
1224
1225 /* cannot match more repeated items here. make sure the
1226 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001228 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001229 RETURN_ON_SUCCESS(ret);
1230 state->repeat = ctx->u.rep;
1231 state->ptr = ctx->ptr;
1232 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001233
1234 case SRE_OP_MIN_UNTIL:
1235 /* minimizing repeat */
1236 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1237
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001238 ctx->u.rep = state->repeat;
1239 if (!ctx->u.rep)
1240 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001241
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001242 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001243
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001244 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001245
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001246 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001247 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001248
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001249 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001250 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001251 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001252 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1253 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001254 if (ret) {
1255 RETURN_ON_ERROR(ret);
1256 RETURN_SUCCESS;
1257 }
1258 ctx->u.rep->count = ctx->count-1;
1259 state->ptr = ctx->ptr;
1260 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001261 }
1262
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001263 LASTMARK_SAVE();
1264
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001265 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001266 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001267 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001268 if (ret) {
1269 RETURN_ON_ERROR(ret);
1270 RETURN_SUCCESS;
1271 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001272
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001273 state->repeat = ctx->u.rep;
1274 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001275
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001276 LASTMARK_RESTORE();
1277
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001278 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
Serhiy Storchakafa468162013-02-16 21:23:53 +02001279 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1280 state->ptr == ctx->u.rep->last_ptr)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001281 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001282
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001283 ctx->u.rep->count = ctx->count;
Serhiy Storchakafa468162013-02-16 21:23:53 +02001284 /* zero-width match protection */
1285 DATA_PUSH(&ctx->u.rep->last_ptr);
1286 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1288 ctx->u.rep->pattern+3);
Serhiy Storchakafa468162013-02-16 21:23:53 +02001289 DATA_POP(&ctx->u.rep->last_ptr);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001290 if (ret) {
1291 RETURN_ON_ERROR(ret);
1292 RETURN_SUCCESS;
1293 }
1294 ctx->u.rep->count = ctx->count-1;
1295 state->ptr = ctx->ptr;
1296 RETURN_FAILURE;
1297
1298 case SRE_OP_GROUPREF:
1299 /* match backreference */
1300 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1301 ctx->ptr, ctx->pattern[0]));
1302 i = ctx->pattern[0];
1303 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001304 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001305 if (groupref >= state->lastmark) {
1306 RETURN_FAILURE;
1307 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 char* p = (char*) state->mark[groupref];
1309 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001310 if (!p || !e || e < p)
1311 RETURN_FAILURE;
1312 while (p < e) {
Victor Stinner63ab8752011-11-22 03:31:20 +01001313 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 SRE_CHARGET(state, ctx->ptr, 0) != SRE_CHARGET(state, p, 0))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001315 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 p += state->charsize;
1317 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001318 }
1319 }
1320 }
1321 ctx->pattern++;
1322 break;
1323
1324 case SRE_OP_GROUPREF_IGNORE:
1325 /* match backreference */
1326 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1327 ctx->ptr, ctx->pattern[0]));
1328 i = ctx->pattern[0];
1329 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001330 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001331 if (groupref >= state->lastmark) {
1332 RETURN_FAILURE;
1333 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 char* p = (char*) state->mark[groupref];
1335 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001336 if (!p || !e || e < p)
1337 RETURN_FAILURE;
1338 while (p < e) {
1339 if (ctx->ptr >= end ||
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001340 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) !=
1341 state->lower(SRE_CHARGET(state, p, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001342 RETURN_FAILURE;
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001343 p += state->charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001345 }
1346 }
1347 }
1348 ctx->pattern++;
1349 break;
1350
1351 case SRE_OP_GROUPREF_EXISTS:
1352 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1353 ctx->ptr, ctx->pattern[0]));
1354 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1355 i = ctx->pattern[0];
1356 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001357 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001358 if (groupref >= state->lastmark) {
1359 ctx->pattern += ctx->pattern[1];
1360 break;
1361 } else {
1362 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1363 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1364 if (!p || !e || e < p) {
1365 ctx->pattern += ctx->pattern[1];
1366 break;
1367 }
1368 }
1369 }
1370 ctx->pattern += 2;
1371 break;
1372
1373 case SRE_OP_ASSERT:
1374 /* assert subpattern */
1375 /* <ASSERT> <skip> <back> <pattern> */
1376 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1377 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001379 if (state->ptr < state->beginning)
1380 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001381 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001382 RETURN_ON_FAILURE(ret);
1383 ctx->pattern += ctx->pattern[0];
1384 break;
1385
1386 case SRE_OP_ASSERT_NOT:
1387 /* assert not subpattern */
1388 /* <ASSERT_NOT> <skip> <back> <pattern> */
1389 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1390 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001392 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001393 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001394 if (ret) {
1395 RETURN_ON_ERROR(ret);
1396 RETURN_FAILURE;
1397 }
1398 }
1399 ctx->pattern += ctx->pattern[0];
1400 break;
1401
1402 case SRE_OP_FAILURE:
1403 /* immediate failure */
1404 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1405 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001406
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001407 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001408 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1409 ctx->pattern[-1]));
1410 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001411 }
1412 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001413
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001414exit:
1415 ctx_pos = ctx->last_ctx_pos;
1416 jump = ctx->jump;
1417 DATA_POP_DISCARD(ctx);
1418 if (ctx_pos == -1)
1419 return ret;
1420 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1421
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001422 switch (jump) {
1423 case JUMP_MAX_UNTIL_2:
1424 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1425 goto jump_max_until_2;
1426 case JUMP_MAX_UNTIL_3:
1427 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1428 goto jump_max_until_3;
1429 case JUMP_MIN_UNTIL_2:
1430 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1431 goto jump_min_until_2;
1432 case JUMP_MIN_UNTIL_3:
1433 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1434 goto jump_min_until_3;
1435 case JUMP_BRANCH:
1436 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1437 goto jump_branch;
1438 case JUMP_MAX_UNTIL_1:
1439 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1440 goto jump_max_until_1;
1441 case JUMP_MIN_UNTIL_1:
1442 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1443 goto jump_min_until_1;
1444 case JUMP_REPEAT:
1445 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1446 goto jump_repeat;
1447 case JUMP_REPEAT_ONE_1:
1448 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1449 goto jump_repeat_one_1;
1450 case JUMP_REPEAT_ONE_2:
1451 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1452 goto jump_repeat_one_2;
1453 case JUMP_MIN_REPEAT_ONE:
1454 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1455 goto jump_min_repeat_one;
1456 case JUMP_ASSERT:
1457 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1458 goto jump_assert;
1459 case JUMP_ASSERT_NOT:
1460 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1461 goto jump_assert_not;
1462 case JUMP_NONE:
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001463 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1464 ctx->ptr, ret));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001465 break;
1466 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001467
1468 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001469}
1470
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001471LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001472SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1473{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 char* ptr = (char*)state->start;
1475 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001476 Py_ssize_t status = 0;
1477 Py_ssize_t prefix_len = 0;
1478 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001479 SRE_CODE* prefix = NULL;
1480 SRE_CODE* charset = NULL;
1481 SRE_CODE* overlap = NULL;
1482 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001483
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001484 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001485 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001486 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001487
1488 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001489
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001490 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001491 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001492 character in there, so literal search will work) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 end -= (pattern[3]-1) * state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001494 if (end <= ptr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 end = ptr + state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001496 }
1497
Fredrik Lundh3562f112000-07-02 12:00:07 +00001498 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001499 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001500 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001501 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001502 prefix_skip = pattern[6];
1503 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001504 overlap = prefix + prefix_len - 1;
1505 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001506 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001507 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001508 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001509
1510 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001511 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001512
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001513 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1514 prefix, prefix_len, prefix_skip));
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001515 TRACE(("charset = %p\n", charset));
1516
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001517#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001518 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001519 /* pattern starts with a known prefix. use the overlap
1520 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001521 Py_ssize_t i = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522 end = (char *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001523 while (ptr < end) {
1524 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525 if ((SRE_CODE) SRE_CHARGET(state, ptr, 0) != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001526 if (!i)
1527 break;
1528 else
1529 i = overlap[i];
1530 } else {
1531 if (++i == prefix_len) {
1532 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001533 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 state->start = ptr - (prefix_len - 1) * state->charsize;
1535 state->ptr = ptr - (prefix_len - prefix_skip - 1) * state->charsize;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001536 if (flags & SRE_INFO_LITERAL)
1537 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001538 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001539 if (status != 0)
1540 return status;
1541 /* close but no cigar -- try again */
1542 i = overlap[i];
1543 }
1544 break;
1545 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 ptr += state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001548 }
1549 return 0;
1550 }
1551#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001552
Fredrik Lundh3562f112000-07-02 12:00:07 +00001553 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001554 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001555 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001556 SRE_CODE chr = pattern[1];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001558 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
1560 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001561 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001562 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001563 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001564 state->start = ptr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 ptr += state->charsize;
1566 state->ptr = ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001567 if (flags & SRE_INFO_LITERAL)
1568 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001569 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001570 if (status != 0)
1571 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001572 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001573 } else if (charset) {
1574 /* pattern starts with a character from a known set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577 while (ptr < end && !SRE_CHARSET(charset, SRE_CHARGET(state, ptr, 0)))
1578 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001579 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001581 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001582 state->start = ptr;
1583 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001584 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001585 if (status != 0)
1586 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587 ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 }
1589 } else
1590 /* general case */
1591 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001592 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001593 state->start = state->ptr = ptr;
1594 ptr += state->charsize;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001595 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 if (status != 0)
1597 break;
1598 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001599
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001600 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001601}
Tim Peters3d563502006-01-21 02:47:53 +00001602
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001603#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001604
1605/* -------------------------------------------------------------------- */
1606/* factories and destructors */
1607
1608/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001609static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001610static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +00001611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612static int
1613sre_literal_template(int charsize, char* ptr, Py_ssize_t len)
1614{
1615 /* check if given string is a literal template (i.e. no escapes) */
1616 struct {
1617 int charsize;
1618 } state = {
1619 charsize
1620 };
1621 while (len-- > 0) {
1622 if (SRE_CHARGET((&state), ptr, 0) == '\\')
1623 return 0;
1624 ptr += charsize;
1625 }
1626 return 1;
1627}
1628
Guido van Rossumb700df92000-03-31 14:59:30 +00001629static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001630sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001631{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001632 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001633}
1634
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001635static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001636sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001637{
1638 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001639 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001640 return NULL;
1641 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001642 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001643 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001644 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +00001645 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001646}
1647
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001648LOCAL(void)
1649state_reset(SRE_STATE* state)
1650{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001651 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001652 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001653
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001654 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655 state->lastindex = -1;
1656
1657 state->repeat = NULL;
1658
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001659 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001660}
1661
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001662static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663getstring(PyObject* string, Py_ssize_t* p_length,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001664 int* p_logical_charsize, int* p_charsize,
1665 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +00001666{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001667 /* given a python object, return a data pointer, a length (in
1668 characters), and a character size. return NULL if the object
1669 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001670
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001671 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001672 Py_ssize_t size, bytes;
1673 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001674 void* ptr;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001675
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001676 /* Unicode objects do not support the buffer API. So, get the data
1677 directly instead. */
1678 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 if (PyUnicode_READY(string) == -1)
1680 return NULL;
1681 ptr = PyUnicode_DATA(string);
1682 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001683 *p_charsize = PyUnicode_KIND(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 *p_logical_charsize = 4;
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001685 return ptr;
1686 }
1687
Victor Stinner0058b862011-09-29 03:27:47 +02001688 /* get pointer to byte string buffer */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001689 view->len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001690 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001691 if (!buffer || !buffer->bf_getbuffer ||
Benjamin Petersone48944b2012-03-07 14:50:25 -06001692 (*buffer->bf_getbuffer)(string, view, PyBUF_SIMPLE) < 0) {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001693 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1694 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001695 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001696
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001697 /* determine buffer size */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001698 bytes = view->len;
1699 ptr = view->buf;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001700
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001701 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001702 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001703 goto err;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001704 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001705
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001706 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001707 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001708
Christian Heimes72b710a2008-05-26 13:28:38 +00001709 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001710 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001711 else {
1712 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001713 goto err;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001714 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001715
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001716 *p_length = size;
1717 *p_charsize = charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 *p_logical_charsize = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001719
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001720 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001721 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001722 "Buffer is NULL");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001723 goto err;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001724 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001725 return ptr;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001726 err:
1727 PyBuffer_Release(view);
1728 view->buf = NULL;
1729 return NULL;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001730}
1731
1732LOCAL(PyObject*)
1733state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001734 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001735{
1736 /* prepare state object */
1737
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001738 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 int logical_charsize, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001740 void* ptr;
1741
1742 memset(state, 0, sizeof(SRE_STATE));
1743
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001744 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001745 state->lastindex = -1;
1746
Benjamin Petersone48944b2012-03-07 14:50:25 -06001747 state->buffer.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001748 ptr = getstring(string, &length, &logical_charsize, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001749 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -06001750 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001751
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001752 if (logical_charsize == 1 && pattern->logical_charsize > 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001753 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001754 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001755 goto err;
1756 }
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001757 if (logical_charsize > 1 && pattern->logical_charsize == 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001758 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001759 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001760 goto err;
1761 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001762
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001763 /* adjust boundaries */
1764 if (start < 0)
1765 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001766 else if (start > length)
1767 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001768
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001769 if (end < 0)
1770 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001771 else if (end > length)
1772 end = length;
1773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 state->logical_charsize = logical_charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001775 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001776
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001777 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001778
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001779 state->start = (void*) ((char*) ptr + start * state->charsize);
1780 state->end = (void*) ((char*) ptr + end * state->charsize);
1781
1782 Py_INCREF(string);
1783 state->string = string;
1784 state->pos = start;
1785 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001786
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001787 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001788 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001789 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001790 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001791 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001792 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001793
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001794 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001795 err:
1796 if (state->buffer.buf)
1797 PyBuffer_Release(&state->buffer);
1798 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001799}
1800
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001801LOCAL(void)
1802state_fini(SRE_STATE* state)
1803{
Benjamin Petersone48944b2012-03-07 14:50:25 -06001804 if (state->buffer.buf)
1805 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001806 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001807 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001808}
1809
/* Convert a pointer into the subject buffer into a character index:
   the byte distance from state->beginning divided by state->charsize. */
#define STATE_OFFSET(state, member) \
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1813
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001814LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001815state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001816{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001817 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001818
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001819 index = (index - 1) * 2;
1820
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001821 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001822 if (empty)
1823 /* want empty string */
1824 i = j = 0;
1825 else {
1826 Py_INCREF(Py_None);
1827 return Py_None;
1828 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001829 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001830 i = STATE_OFFSET(state, state->mark[index]);
1831 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001832 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001833
Fredrik Lundh58100642000-08-09 09:14:35 +00001834 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001835}
1836
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001837static void
1838pattern_error(int status)
1839{
1840 switch (status) {
1841 case SRE_ERROR_RECURSION_LIMIT:
1842 PyErr_SetString(
1843 PyExc_RuntimeError,
1844 "maximum recursion limit exceeded"
1845 );
1846 break;
1847 case SRE_ERROR_MEMORY:
1848 PyErr_NoMemory();
1849 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001850 case SRE_ERROR_INTERRUPTED:
1851 /* An exception has already been raised, so let it fly */
1852 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001853 default:
1854 /* other error codes indicate compiler/engine bugs */
1855 PyErr_SetString(
1856 PyExc_RuntimeError,
1857 "internal error in regular expression engine"
1858 );
1859 }
1860}
1861
Guido van Rossumb700df92000-03-31 14:59:30 +00001862static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001863pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001864{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001865 if (self->weakreflist != NULL)
1866 PyObject_ClearWeakRefs((PyObject *) self);
Benjamin Petersone48944b2012-03-07 14:50:25 -06001867 if (self->view.buf)
1868 PyBuffer_Release(&self->view);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001869 Py_XDECREF(self->pattern);
1870 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001871 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001872 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001873}
1874
1875static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001876pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001877{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001878 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01001879 Py_ssize_t status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001880
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001881 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001882 Py_ssize_t start = 0;
1883 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001884 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001885 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001886 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001887 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001888
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001889 string = state_init(&state, self, string, start, end);
1890 if (!string)
1891 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001892
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001893 state.ptr = state.start;
1894
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001895 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 if (state.logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001898 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001899 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001900 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001901 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001902
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001903 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001904 if (PyErr_Occurred())
1905 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001906
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001907 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001909 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001910}
1911
1912static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001913pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001914{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001915 SRE_STATE state;
1916 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001918 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001919 Py_ssize_t start = 0;
1920 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001921 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001922 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001923 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001924 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001925
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001926 string = state_init(&state, self, string, start, end);
1927 if (!string)
1928 return NULL;
1929
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001930 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001933 status = sre_search(&state, PatternObject_GetCode(self));
1934 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001935 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001936 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001937
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001938 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1939
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001940 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001941
Thomas Wouters89f507f2006-12-13 04:49:30 +00001942 if (PyErr_Occurred())
1943 return NULL;
1944
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001945 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001946}
1947
1948static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001949call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001950{
1951 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001952 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001953 PyObject* func;
1954 PyObject* result;
1955
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001956 if (!args)
1957 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001958 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001959 if (!name)
1960 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001961 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001962 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001963 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001964 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001965 func = PyObject_GetAttrString(mod, function);
1966 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001967 if (!func)
1968 return NULL;
1969 result = PyObject_CallObject(func, args);
1970 Py_DECREF(func);
1971 Py_DECREF(args);
1972 return result;
1973}
1974
#ifdef USE_BUILTIN_COPY
/* Replace *object with the result of copy.deepcopy(*object, memo).
   Returns 1 on success; returns 0 and leaves *object untouched when
   the call fails (call() returns NULL when handed a NULL tuple). */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* packed = PyTuple_Pack(2, *object, memo);
    PyObject* duplicate = call("copy", "deepcopy", packed);

    if (duplicate == NULL)
        return 0;

    /* swap the stored reference for the deep copy */
    Py_DECREF(*object);
    *object = duplicate;

    return 1; /* success */
}
#endif
1994
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001995static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00001996join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001997{
1998 /* join list elements */
1999
2000 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002001 PyObject* function;
2002 PyObject* args;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002003 PyObject* result;
2004
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002005 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002006 if (!joiner)
2007 return NULL;
2008
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002009 if (PyList_GET_SIZE(list) == 0) {
2010 Py_DECREF(list);
2011 return joiner;
2012 }
2013
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002014 function = PyObject_GetAttrString(joiner, "join");
2015 if (!function) {
2016 Py_DECREF(joiner);
2017 return NULL;
2018 }
2019 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002020 if (!args) {
2021 Py_DECREF(function);
2022 Py_DECREF(joiner);
2023 return NULL;
2024 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002025 PyTuple_SET_ITEM(args, 0, list);
2026 result = PyObject_CallObject(function, args);
2027 Py_DECREF(args); /* also removes list */
2028 Py_DECREF(function);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002029 Py_DECREF(joiner);
2030
2031 return result;
2032}
2033
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002034static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002035pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002036{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002037 SRE_STATE state;
2038 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002039 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002040 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002041
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002042 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002043 Py_ssize_t start = 0;
2044 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002045 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002046 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002047 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002048 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002049
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 string = state_init(&state, self, string, start, end);
2051 if (!string)
2052 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002053
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002054 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002055 if (!list) {
2056 state_fini(&state);
2057 return NULL;
2058 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002059
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002061
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002062 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002063
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002064 state_reset(&state);
2065
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002066 state.ptr = state.start;
2067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002069 status = sre_search(&state, PatternObject_GetCode(self));
2070 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002071 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002072 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002073
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002074 if (PyErr_Occurred())
2075 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002076
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002077 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002078 if (status == 0)
2079 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002080 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002081 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002082 }
Tim Peters3d563502006-01-21 02:47:53 +00002083
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002084 /* don't bother to build a match object */
2085 switch (self->groups) {
2086 case 0:
2087 b = STATE_OFFSET(&state, state.start);
2088 e = STATE_OFFSET(&state, state.ptr);
2089 item = PySequence_GetSlice(string, b, e);
2090 if (!item)
2091 goto error;
2092 break;
2093 case 1:
2094 item = state_getslice(&state, 1, string, 1);
2095 if (!item)
2096 goto error;
2097 break;
2098 default:
2099 item = PyTuple_New(self->groups);
2100 if (!item)
2101 goto error;
2102 for (i = 0; i < self->groups; i++) {
2103 PyObject* o = state_getslice(&state, i+1, string, 1);
2104 if (!o) {
2105 Py_DECREF(item);
2106 goto error;
2107 }
2108 PyTuple_SET_ITEM(item, i, o);
2109 }
2110 break;
2111 }
2112
2113 status = PyList_Append(list, item);
2114 Py_DECREF(item);
2115 if (status < 0)
2116 goto error;
2117
2118 if (state.ptr == state.start)
2119 state.start = (void*) ((char*) state.ptr + state.charsize);
2120 else
2121 state.start = state.ptr;
2122
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002123 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002124
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002125 state_fini(&state);
2126 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002127
2128error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002129 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002130 state_fini(&state);
2131 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002132
Guido van Rossumb700df92000-03-31 14:59:30 +00002133}
2134
Fredrik Lundh703ce812001-10-24 22:16:30 +00002135static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002136pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
Fredrik Lundh703ce812001-10-24 22:16:30 +00002137{
2138 PyObject* scanner;
2139 PyObject* search;
2140 PyObject* iterator;
2141
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002142 scanner = pattern_scanner(pattern, args, kw);
Fredrik Lundh703ce812001-10-24 22:16:30 +00002143 if (!scanner)
2144 return NULL;
2145
2146 search = PyObject_GetAttrString(scanner, "search");
2147 Py_DECREF(scanner);
2148 if (!search)
2149 return NULL;
2150
2151 iterator = PyCallIter_New(search, Py_None);
2152 Py_DECREF(search);
2153
2154 return iterator;
2155}
Fredrik Lundh703ce812001-10-24 22:16:30 +00002156
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002157static PyObject*
2158pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2159{
2160 SRE_STATE state;
2161 PyObject* list;
2162 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002163 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002164 Py_ssize_t n;
2165 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002166 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002167
2168 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002169 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002170 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002171 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002172 &string, &maxsplit))
2173 return NULL;
2174
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002175 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002176 if (!string)
2177 return NULL;
2178
2179 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002180 if (!list) {
2181 state_fini(&state);
2182 return NULL;
2183 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002184
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002185 n = 0;
2186 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002187
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002188 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002189
2190 state_reset(&state);
2191
2192 state.ptr = state.start;
2193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 if (state.logical_charsize == 1) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002195 status = sre_search(&state, PatternObject_GetCode(self));
2196 } else {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002197 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002198 }
2199
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002200 if (PyErr_Occurred())
2201 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002202
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002203 if (status <= 0) {
2204 if (status == 0)
2205 break;
2206 pattern_error(status);
2207 goto error;
2208 }
Tim Peters3d563502006-01-21 02:47:53 +00002209
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002210 if (state.start == state.ptr) {
2211 if (last == state.end)
2212 break;
2213 /* skip one character */
2214 state.start = (void*) ((char*) state.ptr + state.charsize);
2215 continue;
2216 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002217
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002218 /* get segment before this match */
2219 item = PySequence_GetSlice(
2220 string, STATE_OFFSET(&state, last),
2221 STATE_OFFSET(&state, state.start)
2222 );
2223 if (!item)
2224 goto error;
2225 status = PyList_Append(list, item);
2226 Py_DECREF(item);
2227 if (status < 0)
2228 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002229
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002230 /* add groups (if any) */
2231 for (i = 0; i < self->groups; i++) {
2232 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002233 if (!item)
2234 goto error;
2235 status = PyList_Append(list, item);
2236 Py_DECREF(item);
2237 if (status < 0)
2238 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002239 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002240
2241 n = n + 1;
2242
2243 last = state.start = state.ptr;
2244
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002245 }
2246
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002247 /* get segment following last match (even if empty) */
2248 item = PySequence_GetSlice(
2249 string, STATE_OFFSET(&state, last), state.endpos
2250 );
2251 if (!item)
2252 goto error;
2253 status = PyList_Append(list, item);
2254 Py_DECREF(item);
2255 if (status < 0)
2256 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002257
2258 state_fini(&state);
2259 return list;
2260
2261error:
2262 Py_DECREF(list);
2263 state_fini(&state);
2264 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002265
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002266}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002267
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002268static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002269pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002270 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002271{
2272 SRE_STATE state;
2273 PyObject* list;
2274 PyObject* item;
2275 PyObject* filter;
2276 PyObject* args;
2277 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002278 void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002279 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002280 Py_ssize_t n;
2281 Py_ssize_t i, b, e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 int logical_charsize, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002283 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002284 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002285
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002286 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002287 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002288 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002289 Py_INCREF(filter);
2290 filter_is_callable = 1;
2291 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002292 /* if not callable, check if it's a literal string */
2293 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002294 view.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002295 ptr = getstring(ptemplate, &n, &logical_charsize, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002297 if (ptr) {
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002298 literal = sre_literal_template(charsize, ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002299 } else {
2300 PyErr_Clear();
2301 literal = 0;
2302 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06002303 if (view.buf)
2304 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002305 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002306 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002307 Py_INCREF(filter);
2308 filter_is_callable = 0;
2309 } else {
2310 /* not a literal; hand it over to the template compiler */
2311 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002312 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002313 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002314 );
2315 if (!filter)
2316 return NULL;
2317 filter_is_callable = PyCallable_Check(filter);
2318 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002319 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002320
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002321 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002322 if (!string) {
2323 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002324 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002325 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002326
2327 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002328 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002329 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002330 state_fini(&state);
2331 return NULL;
2332 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002333
2334 n = i = 0;
2335
2336 while (!count || n < count) {
2337
2338 state_reset(&state);
2339
2340 state.ptr = state.start;
2341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 if (state.logical_charsize == 1) {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002343 status = sre_search(&state, PatternObject_GetCode(self));
2344 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002345 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002346 }
2347
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002348 if (PyErr_Occurred())
2349 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002350
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002351 if (status <= 0) {
2352 if (status == 0)
2353 break;
2354 pattern_error(status);
2355 goto error;
2356 }
Tim Peters3d563502006-01-21 02:47:53 +00002357
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002358 b = STATE_OFFSET(&state, state.start);
2359 e = STATE_OFFSET(&state, state.ptr);
2360
2361 if (i < b) {
2362 /* get segment before this match */
2363 item = PySequence_GetSlice(string, i, b);
2364 if (!item)
2365 goto error;
2366 status = PyList_Append(list, item);
2367 Py_DECREF(item);
2368 if (status < 0)
2369 goto error;
2370
2371 } else if (i == b && i == e && n > 0)
2372 /* ignore empty match on latest position */
2373 goto next;
2374
2375 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002376 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002377 match = pattern_new_match(self, &state, 1);
2378 if (!match)
2379 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002380 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002381 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002382 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002383 goto error;
2384 }
2385 item = PyObject_CallObject(filter, args);
2386 Py_DECREF(args);
2387 Py_DECREF(match);
2388 if (!item)
2389 goto error;
2390 } else {
2391 /* filter is literal string */
2392 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002393 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002394 }
2395
2396 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002397 if (item != Py_None) {
2398 status = PyList_Append(list, item);
2399 Py_DECREF(item);
2400 if (status < 0)
2401 goto error;
2402 }
Tim Peters3d563502006-01-21 02:47:53 +00002403
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002404 i = e;
2405 n = n + 1;
2406
2407next:
2408 /* move on */
2409 if (state.ptr == state.start)
2410 state.start = (void*) ((char*) state.ptr + state.charsize);
2411 else
2412 state.start = state.ptr;
2413
2414 }
2415
2416 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002417 if (i < state.endpos) {
2418 item = PySequence_GetSlice(string, i, state.endpos);
2419 if (!item)
2420 goto error;
2421 status = PyList_Append(list, item);
2422 Py_DECREF(item);
2423 if (status < 0)
2424 goto error;
2425 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002426
2427 state_fini(&state);
2428
Guido van Rossum4e173842001-12-07 04:25:10 +00002429 Py_DECREF(filter);
2430
Fredrik Lundhdac58492001-10-21 21:48:30 +00002431 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002432 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002433
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002434 if (!item)
2435 return NULL;
2436
2437 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002438 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002439
2440 return item;
2441
2442error:
2443 Py_DECREF(list);
2444 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002445 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002446 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002447
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002448}
2449
2450static PyObject*
2451pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2452{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002453 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002454 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002455 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002456 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002457 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002458 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002459 return NULL;
2460
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002461 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002462}
2463
2464static PyObject*
2465pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2466{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002467 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002468 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002469 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002470 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002471 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002472 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002473 return NULL;
2474
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002475 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002476}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002477
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002478static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002479pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002480{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002481#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002482 PatternObject* copy;
2483 int offset;
2484
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002485 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2486 if (!copy)
2487 return NULL;
2488
2489 offset = offsetof(PatternObject, groups);
2490
2491 Py_XINCREF(self->groupindex);
2492 Py_XINCREF(self->indexgroup);
2493 Py_XINCREF(self->pattern);
2494
2495 memcpy((char*) copy + offset, (char*) self + offset,
2496 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002497 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002498
2499 return (PyObject*) copy;
2500#else
2501 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2502 return NULL;
2503#endif
2504}
2505
2506static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002507pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002508{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002509#ifdef USE_BUILTIN_COPY
2510 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002511
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002512 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002513 if (!copy)
2514 return NULL;
2515
2516 if (!deepcopy(&copy->groupindex, memo) ||
2517 !deepcopy(&copy->indexgroup, memo) ||
2518 !deepcopy(&copy->pattern, memo)) {
2519 Py_DECREF(copy);
2520 return NULL;
2521 }
2522
2523#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002524 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2525 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002526#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002527}
2528
/* Docstring text for the pattern methods match, search, split, findall,
   finditer and sub.  Each string opens with a call-signature line and
   continues with a short description; presumably these are referenced
   from the pattern method table elsewhere in the file. */
Raymond Hettinger94478742004-09-24 04:31:19 +00002529PyDoc_STRVAR(pattern_match_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002530"match(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002531 Matches zero or more characters at the beginning of the string");
2532
2533PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002534"search(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002535 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02002536 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002537
2538PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002539"split(string[, maxsplit = 0]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002540 Split string by the occurrences of pattern.");
2541
2542PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002543"findall(string[, pos[, endpos]]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002544 Return a list of all non-overlapping matches of pattern in string.");
2545
2546PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002547"finditer(string[, pos[, endpos]]) -> iterator.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002548 Return an iterator over all non-overlapping matches for the \n\
2549 RE pattern in string. For each match, the iterator returns a\n\
2550 match object.");
2551
2552PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002553"sub(repl, string[, count = 0]) -> newstring.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002554 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002555 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002556
2557PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002558"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002559 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2560 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002561 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002562
2563PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2564
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002565static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002566 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002567 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002568 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002569 pattern_search_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002570 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002571 pattern_sub_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002572 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002573 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002574 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002575 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002576 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002577 pattern_findall_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002578 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002579 pattern_finditer_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002580 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002581 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2582 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002583 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002584};
2585
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002586#define PAT_OFF(x) offsetof(PatternObject, x)
2587static PyMemberDef pattern_members[] = {
2588 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2589 {"flags", T_INT, PAT_OFF(flags), READONLY},
2590 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2591 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2592 {NULL} /* Sentinel */
2593};
Guido van Rossumb700df92000-03-31 14:59:30 +00002594
Neal Norwitz57c179c2006-03-22 07:18:02 +00002595static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002596 PyVarObject_HEAD_INIT(NULL, 0)
2597 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002598 sizeof(PatternObject), sizeof(SRE_CODE),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002599 (destructor)pattern_dealloc, /* tp_dealloc */
2600 0, /* tp_print */
2601 0, /* tp_getattr */
2602 0, /* tp_setattr */
2603 0, /* tp_reserved */
2604 0, /* tp_repr */
2605 0, /* tp_as_number */
2606 0, /* tp_as_sequence */
2607 0, /* tp_as_mapping */
2608 0, /* tp_hash */
2609 0, /* tp_call */
2610 0, /* tp_str */
2611 0, /* tp_getattro */
2612 0, /* tp_setattro */
2613 0, /* tp_as_buffer */
2614 Py_TPFLAGS_DEFAULT, /* tp_flags */
2615 pattern_doc, /* tp_doc */
2616 0, /* tp_traverse */
2617 0, /* tp_clear */
2618 0, /* tp_richcompare */
2619 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2620 0, /* tp_iter */
2621 0, /* tp_iternext */
2622 pattern_methods, /* tp_methods */
2623 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002624};
2625
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002626static int _validate(PatternObject *self); /* Forward */
2627
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002628static PyObject *
2629_compile(PyObject* self_, PyObject* args)
2630{
2631 /* "compile" pattern descriptor to pattern object */
2632
2633 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002634 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002635
2636 PyObject* pattern;
2637 int flags = 0;
2638 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002639 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002640 PyObject* groupindex = NULL;
2641 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002642
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002643 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002644 &PyList_Type, &code, &groups,
2645 &groupindex, &indexgroup))
2646 return NULL;
2647
2648 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002649 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002650 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2651 if (!self)
2652 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002653 self->weakreflist = NULL;
2654 self->pattern = NULL;
2655 self->groupindex = NULL;
2656 self->indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002657 self->view.buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002658
2659 self->codesize = n;
2660
2661 for (i = 0; i < n; i++) {
2662 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002663 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002664 self->code[i] = (SRE_CODE) value;
2665 if ((unsigned long) self->code[i] != value) {
2666 PyErr_SetString(PyExc_OverflowError,
2667 "regular expression code size limit exceeded");
2668 break;
2669 }
2670 }
2671
2672 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002673 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002674 return NULL;
2675 }
2676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002677 if (pattern == Py_None) {
2678 self->logical_charsize = -1;
2679 self->charsize = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01002680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 else {
2682 Py_ssize_t p_length;
2683 if (!getstring(pattern, &p_length, &self->logical_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002684 &self->charsize, &self->view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 Py_DECREF(self);
2686 return NULL;
2687 }
2688 }
Antoine Pitroufd036452008-08-19 17:56:33 +00002689
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002690 Py_INCREF(pattern);
2691 self->pattern = pattern;
2692
2693 self->flags = flags;
2694
2695 self->groups = groups;
2696
2697 Py_XINCREF(groupindex);
2698 self->groupindex = groupindex;
2699
2700 Py_XINCREF(indexgroup);
2701 self->indexgroup = indexgroup;
2702
2703 self->weakreflist = NULL;
2704
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002705 if (!_validate(self)) {
2706 Py_DECREF(self);
2707 return NULL;
2708 }
2709
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002710 return (PyObject*) self;
2711}
2712
/* -------------------------------------------------------------------- */
/* Code validation */

/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
   bytes wide (the latter if Python is compiled for "wide" unicode support).
*/
2738
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator; v is a parenthesized printf argument list */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns 0 from the enclosing validator function */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros read and update locals of the function they are used in:
   `code` (cursor), `end` (one past the last code word) and `op`, `arg` or
   `skip` respectively.  They FAIL when the cursor is already at `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Like GET_ARG for a skip word, but additionally FAIL when skip minus adj
   is larger than the number of code words left, counted from the skip word
   itself.  adj is parenthesized so that any expression may be passed. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (Py_uintptr_t)(end - code))    \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002779
2780static int
2781_validate_charset(SRE_CODE *code, SRE_CODE *end)
2782{
2783 /* Some variables are manipulated by the macros above */
2784 SRE_CODE op;
2785 SRE_CODE arg;
2786 SRE_CODE offset;
2787 int i;
2788
2789 while (code < end) {
2790 GET_OP;
2791 switch (op) {
2792
2793 case SRE_OP_NEGATE:
2794 break;
2795
2796 case SRE_OP_LITERAL:
2797 GET_ARG;
2798 break;
2799
2800 case SRE_OP_RANGE:
2801 GET_ARG;
2802 GET_ARG;
2803 break;
2804
2805 case SRE_OP_CHARSET:
2806 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
Victor Stinner1fa174a2013-08-28 02:06:21 +02002807 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002808 FAIL;
2809 code += offset;
2810 break;
2811
2812 case SRE_OP_BIGCHARSET:
2813 GET_ARG; /* Number of blocks */
2814 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02002815 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002816 FAIL;
2817 /* Make sure that each byte points to a valid block */
2818 for (i = 0; i < 256; i++) {
2819 if (((unsigned char *)code)[i] >= arg)
2820 FAIL;
2821 }
2822 code += offset;
2823 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
Victor Stinner1fa174a2013-08-28 02:06:21 +02002824 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002825 FAIL;
2826 code += offset;
2827 break;
2828
2829 case SRE_OP_CATEGORY:
2830 GET_ARG;
2831 switch (arg) {
2832 case SRE_CATEGORY_DIGIT:
2833 case SRE_CATEGORY_NOT_DIGIT:
2834 case SRE_CATEGORY_SPACE:
2835 case SRE_CATEGORY_NOT_SPACE:
2836 case SRE_CATEGORY_WORD:
2837 case SRE_CATEGORY_NOT_WORD:
2838 case SRE_CATEGORY_LINEBREAK:
2839 case SRE_CATEGORY_NOT_LINEBREAK:
2840 case SRE_CATEGORY_LOC_WORD:
2841 case SRE_CATEGORY_LOC_NOT_WORD:
2842 case SRE_CATEGORY_UNI_DIGIT:
2843 case SRE_CATEGORY_UNI_NOT_DIGIT:
2844 case SRE_CATEGORY_UNI_SPACE:
2845 case SRE_CATEGORY_UNI_NOT_SPACE:
2846 case SRE_CATEGORY_UNI_WORD:
2847 case SRE_CATEGORY_UNI_NOT_WORD:
2848 case SRE_CATEGORY_UNI_LINEBREAK:
2849 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2850 break;
2851 default:
2852 FAIL;
2853 }
2854 break;
2855
2856 default:
2857 FAIL;
2858
2859 }
2860 }
2861
2862 return 1;
2863}
2864
2865static int
2866_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2867{
2868 /* Some variables are manipulated by the macros above */
2869 SRE_CODE op;
2870 SRE_CODE arg;
2871 SRE_CODE skip;
2872
2873 VTRACE(("code=%p, end=%p\n", code, end));
2874
2875 if (code > end)
2876 FAIL;
2877
2878 while (code < end) {
2879 GET_OP;
2880 switch (op) {
2881
2882 case SRE_OP_MARK:
2883 /* We don't check whether marks are properly nested; the
2884 sre_match() code is robust even if they don't, and the worst
2885 you can get is nonsensical match results. */
2886 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02002887 if (arg > 2 * (size_t)groups + 1) {
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002888 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2889 FAIL;
2890 }
2891 break;
2892
2893 case SRE_OP_LITERAL:
2894 case SRE_OP_NOT_LITERAL:
2895 case SRE_OP_LITERAL_IGNORE:
2896 case SRE_OP_NOT_LITERAL_IGNORE:
2897 GET_ARG;
2898 /* The arg is just a character, nothing to check */
2899 break;
2900
2901 case SRE_OP_SUCCESS:
2902 case SRE_OP_FAILURE:
2903 /* Nothing to check; these normally end the matching process */
2904 break;
2905
2906 case SRE_OP_AT:
2907 GET_ARG;
2908 switch (arg) {
2909 case SRE_AT_BEGINNING:
2910 case SRE_AT_BEGINNING_STRING:
2911 case SRE_AT_BEGINNING_LINE:
2912 case SRE_AT_END:
2913 case SRE_AT_END_LINE:
2914 case SRE_AT_END_STRING:
2915 case SRE_AT_BOUNDARY:
2916 case SRE_AT_NON_BOUNDARY:
2917 case SRE_AT_LOC_BOUNDARY:
2918 case SRE_AT_LOC_NON_BOUNDARY:
2919 case SRE_AT_UNI_BOUNDARY:
2920 case SRE_AT_UNI_NON_BOUNDARY:
2921 break;
2922 default:
2923 FAIL;
2924 }
2925 break;
2926
2927 case SRE_OP_ANY:
2928 case SRE_OP_ANY_ALL:
2929 /* These have no operands */
2930 break;
2931
2932 case SRE_OP_IN:
2933 case SRE_OP_IN_IGNORE:
2934 GET_SKIP;
2935 /* Stop 1 before the end; we check the FAILURE below */
2936 if (!_validate_charset(code, code+skip-2))
2937 FAIL;
2938 if (code[skip-2] != SRE_OP_FAILURE)
2939 FAIL;
2940 code += skip-1;
2941 break;
2942
2943 case SRE_OP_INFO:
2944 {
2945 /* A minimal info field is
2946 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2947 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2948 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02002949 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002950 SRE_CODE *newcode;
2951 GET_SKIP;
2952 newcode = code+skip-1;
2953 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02002954 GET_ARG;
2955 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002956 /* Check that only valid flags are present */
2957 if ((flags & ~(SRE_INFO_PREFIX |
2958 SRE_INFO_LITERAL |
2959 SRE_INFO_CHARSET)) != 0)
2960 FAIL;
2961 /* PREFIX and CHARSET are mutually exclusive */
2962 if ((flags & SRE_INFO_PREFIX) &&
2963 (flags & SRE_INFO_CHARSET))
2964 FAIL;
2965 /* LITERAL implies PREFIX */
2966 if ((flags & SRE_INFO_LITERAL) &&
2967 !(flags & SRE_INFO_PREFIX))
2968 FAIL;
2969 /* Validate the prefix */
2970 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02002971 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002972 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02002973 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002974 /* Here comes the prefix string */
Victor Stinner1fa174a2013-08-28 02:06:21 +02002975 if (prefix_len > (Py_uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002976 FAIL;
2977 code += prefix_len;
2978 /* And here comes the overlap table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02002979 if (prefix_len > (Py_uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002980 FAIL;
2981 /* Each overlap value should be < prefix_len */
2982 for (i = 0; i < prefix_len; i++) {
2983 if (code[i] >= prefix_len)
2984 FAIL;
2985 }
2986 code += prefix_len;
2987 }
2988 /* Validate the charset */
2989 if (flags & SRE_INFO_CHARSET) {
2990 if (!_validate_charset(code, newcode-1))
2991 FAIL;
2992 if (newcode[-1] != SRE_OP_FAILURE)
2993 FAIL;
2994 code = newcode;
2995 }
2996 else if (code != newcode) {
2997 VTRACE(("code=%p, newcode=%p\n", code, newcode));
2998 FAIL;
2999 }
3000 }
3001 break;
3002
3003 case SRE_OP_BRANCH:
3004 {
3005 SRE_CODE *target = NULL;
3006 for (;;) {
3007 GET_SKIP;
3008 if (skip == 0)
3009 break;
3010 /* Stop 2 before the end; we check the JUMP below */
3011 if (!_validate_inner(code, code+skip-3, groups))
3012 FAIL;
3013 code += skip-3;
3014 /* Check that it ends with a JUMP, and that each JUMP
3015 has the same target */
3016 GET_OP;
3017 if (op != SRE_OP_JUMP)
3018 FAIL;
3019 GET_SKIP;
3020 if (target == NULL)
3021 target = code+skip-1;
3022 else if (code+skip-1 != target)
3023 FAIL;
3024 }
3025 }
3026 break;
3027
3028 case SRE_OP_REPEAT_ONE:
3029 case SRE_OP_MIN_REPEAT_ONE:
3030 {
3031 SRE_CODE min, max;
3032 GET_SKIP;
3033 GET_ARG; min = arg;
3034 GET_ARG; max = arg;
3035 if (min > max)
3036 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003037 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003038 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003039 if (!_validate_inner(code, code+skip-4, groups))
3040 FAIL;
3041 code += skip-4;
3042 GET_OP;
3043 if (op != SRE_OP_SUCCESS)
3044 FAIL;
3045 }
3046 break;
3047
3048 case SRE_OP_REPEAT:
3049 {
3050 SRE_CODE min, max;
3051 GET_SKIP;
3052 GET_ARG; min = arg;
3053 GET_ARG; max = arg;
3054 if (min > max)
3055 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003056 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003057 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003058 if (!_validate_inner(code, code+skip-3, groups))
3059 FAIL;
3060 code += skip-3;
3061 GET_OP;
3062 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3063 FAIL;
3064 }
3065 break;
3066
3067 case SRE_OP_GROUPREF:
3068 case SRE_OP_GROUPREF_IGNORE:
3069 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02003070 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003071 FAIL;
3072 break;
3073
3074 case SRE_OP_GROUPREF_EXISTS:
3075 /* The regex syntax for this is: '(?(group)then|else)', where
3076 'group' is either an integer group number or a group name,
3077 'then' and 'else' are sub-regexes, and 'else' is optional. */
3078 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02003079 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003080 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00003081 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003082 code--; /* The skip is relative to the first arg! */
3083 /* There are two possibilities here: if there is both a 'then'
3084 part and an 'else' part, the generated code looks like:
3085
3086 GROUPREF_EXISTS
3087 <group>
3088 <skipyes>
3089 ...then part...
3090 JUMP
3091 <skipno>
3092 (<skipyes> jumps here)
3093 ...else part...
3094 (<skipno> jumps here)
3095
3096 If there is only a 'then' part, it looks like:
3097
3098 GROUPREF_EXISTS
3099 <group>
3100 <skip>
3101 ...then part...
3102 (<skip> jumps here)
3103
3104 There is no direct way to decide which it is, and we don't want
3105 to allow arbitrary jumps anywhere in the code; so we just look
3106 for a JUMP opcode preceding our skip target.
3107 */
Victor Stinner1fa174a2013-08-28 02:06:21 +02003108 if (skip >= 3 && skip-3 < (Py_uintptr_t)(end - code) &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003109 code[skip-3] == SRE_OP_JUMP)
3110 {
3111 VTRACE(("both then and else parts present\n"));
3112 if (!_validate_inner(code+1, code+skip-3, groups))
3113 FAIL;
3114 code += skip-2; /* Position after JUMP, at <skipno> */
3115 GET_SKIP;
3116 if (!_validate_inner(code, code+skip-1, groups))
3117 FAIL;
3118 code += skip-1;
3119 }
3120 else {
3121 VTRACE(("only a then part present\n"));
3122 if (!_validate_inner(code+1, code+skip-1, groups))
3123 FAIL;
3124 code += skip-1;
3125 }
3126 break;
3127
3128 case SRE_OP_ASSERT:
3129 case SRE_OP_ASSERT_NOT:
3130 GET_SKIP;
3131 GET_ARG; /* 0 for lookahead, width for lookbehind */
3132 code--; /* Back up over arg to simplify math below */
3133 if (arg & 0x80000000)
3134 FAIL; /* Width too large */
3135 /* Stop 1 before the end; we check the SUCCESS below */
3136 if (!_validate_inner(code+1, code+skip-2, groups))
3137 FAIL;
3138 code += skip-2;
3139 GET_OP;
3140 if (op != SRE_OP_SUCCESS)
3141 FAIL;
3142 break;
3143
3144 default:
3145 FAIL;
3146
3147 }
3148 }
3149
3150 VTRACE(("okay\n"));
3151 return 1;
3152}
3153
3154static int
3155_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3156{
3157 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3158 FAIL;
3159 if (groups == 0) /* fix for simplejson */
3160 groups = 100; /* 100 groups should always be safe */
3161 return _validate_inner(code, end-1, groups);
3162}
3163
3164static int
3165_validate(PatternObject *self)
3166{
3167 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3168 {
3169 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3170 return 0;
3171 }
3172 else
3173 VTRACE(("Success!\n"));
3174 return 1;
3175}
3176
/* -------------------------------------------------------------------- */
/* match methods */
3179
3180static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003181match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003182{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003183 Py_XDECREF(self->regs);
3184 Py_XDECREF(self->string);
3185 Py_DECREF(self->pattern);
3186 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003187}
3188
3189static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003190match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003191{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003192 if (index < 0 || index >= self->groups) {
3193 /* raise IndexError if we were given a bad group number */
3194 PyErr_SetString(
3195 PyExc_IndexError,
3196 "no such group"
3197 );
3198 return NULL;
3199 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003200
Fredrik Lundh6f013982000-07-03 18:44:21 +00003201 index *= 2;
3202
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003203 if (self->string == Py_None || self->mark[index] < 0) {
3204 /* return default value if the string or group is undefined */
3205 Py_INCREF(def);
3206 return def;
3207 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003208
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003209 return PySequence_GetSlice(
3210 self->string, self->mark[index], self->mark[index+1]
3211 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003212}
3213
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003214static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003215match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003216{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003217 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003218
Guido van Rossumddefaf32007-01-14 03:31:43 +00003219 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003220 /* Default value */
3221 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00003222
Christian Heimes217cfd12007-12-02 14:31:20 +00003223 if (PyLong_Check(index))
3224 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003225
Fredrik Lundh6f013982000-07-03 18:44:21 +00003226 i = -1;
3227
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003228 if (self->pattern->groupindex) {
3229 index = PyObject_GetItem(self->pattern->groupindex, index);
3230 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003231 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003232 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003233 Py_DECREF(index);
3234 } else
3235 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003236 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003237
3238 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003239}
3240
3241static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003242match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003243{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003244 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003245}
3246
3247static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003248match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003249{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003250 /* delegate to Python code */
3251 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003252 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003253 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003254 );
3255}
3256
3257static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003258match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003259{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003260 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003261 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003262
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003263 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003264
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003265 switch (size) {
3266 case 0:
3267 result = match_getslice(self, Py_False, Py_None);
3268 break;
3269 case 1:
3270 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3271 break;
3272 default:
3273 /* fetch multiple items */
3274 result = PyTuple_New(size);
3275 if (!result)
3276 return NULL;
3277 for (i = 0; i < size; i++) {
3278 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003279 self, PyTuple_GET_ITEM(args, i), Py_None
3280 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003281 if (!item) {
3282 Py_DECREF(result);
3283 return NULL;
3284 }
3285 PyTuple_SET_ITEM(result, i, item);
3286 }
3287 break;
3288 }
3289 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003290}
3291
3292static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003293match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003294{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003295 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003296 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003297
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003298 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003299 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003300 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003301 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003302
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003303 result = PyTuple_New(self->groups-1);
3304 if (!result)
3305 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003306
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003307 for (index = 1; index < self->groups; index++) {
3308 PyObject* item;
3309 item = match_getslice_by_index(self, index, def);
3310 if (!item) {
3311 Py_DECREF(result);
3312 return NULL;
3313 }
3314 PyTuple_SET_ITEM(result, index-1, item);
3315 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003316
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003317 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003318}
3319
3320static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003321match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003322{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003323 PyObject* result;
3324 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003325 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003326
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003327 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003328 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003329 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003330 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003331
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003332 result = PyDict_New();
3333 if (!result || !self->pattern->groupindex)
3334 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003336 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003337 if (!keys)
3338 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003340 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003341 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003342 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003343 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003344 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003345 if (!key)
3346 goto failed;
3347 value = match_getslice(self, key, def);
3348 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003349 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003350 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003351 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003352 status = PyDict_SetItem(result, key, value);
3353 Py_DECREF(value);
3354 if (status < 0)
3355 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003356 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003357
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003358 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003359
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003360 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003361
3362failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003363 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003364 Py_DECREF(result);
3365 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003366}
3367
3368static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003369match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003370{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003371 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003372
Guido van Rossumddefaf32007-01-14 03:31:43 +00003373 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003374 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003375 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003376
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003377 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003378
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003379 if (index < 0 || index >= self->groups) {
3380 PyErr_SetString(
3381 PyExc_IndexError,
3382 "no such group"
3383 );
3384 return NULL;
3385 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003386
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003387 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003388 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003389}
3390
3391static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003392match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003393{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003394 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003395
Guido van Rossumddefaf32007-01-14 03:31:43 +00003396 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003397 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003398 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003399
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003400 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003401
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003402 if (index < 0 || index >= self->groups) {
3403 PyErr_SetString(
3404 PyExc_IndexError,
3405 "no such group"
3406 );
3407 return NULL;
3408 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003409
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003410 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003411 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003412}
3413
3414LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003415_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003416{
3417 PyObject* pair;
3418 PyObject* item;
3419
3420 pair = PyTuple_New(2);
3421 if (!pair)
3422 return NULL;
3423
Christian Heimes217cfd12007-12-02 14:31:20 +00003424 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003425 if (!item)
3426 goto error;
3427 PyTuple_SET_ITEM(pair, 0, item);
3428
Christian Heimes217cfd12007-12-02 14:31:20 +00003429 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003430 if (!item)
3431 goto error;
3432 PyTuple_SET_ITEM(pair, 1, item);
3433
3434 return pair;
3435
3436 error:
3437 Py_DECREF(pair);
3438 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003439}
3440
3441static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003442match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003443{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003444 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003445
Guido van Rossumddefaf32007-01-14 03:31:43 +00003446 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003447 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003448 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003449
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003450 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003451
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003452 if (index < 0 || index >= self->groups) {
3453 PyErr_SetString(
3454 PyExc_IndexError,
3455 "no such group"
3456 );
3457 return NULL;
3458 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003459
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003460 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003461 return _pair(self->mark[index*2], self->mark[index*2+1]);
3462}
3463
3464static PyObject*
3465match_regs(MatchObject* self)
3466{
3467 PyObject* regs;
3468 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003469 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003470
3471 regs = PyTuple_New(self->groups);
3472 if (!regs)
3473 return NULL;
3474
3475 for (index = 0; index < self->groups; index++) {
3476 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3477 if (!item) {
3478 Py_DECREF(regs);
3479 return NULL;
3480 }
3481 PyTuple_SET_ITEM(regs, index, item);
3482 }
3483
3484 Py_INCREF(regs);
3485 self->regs = regs;
3486
3487 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003488}
3489
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003490static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003491match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003492{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003493#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003494 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003495 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003496
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003497 slots = 2 * (self->pattern->groups+1);
3498
3499 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3500 if (!copy)
3501 return NULL;
3502
3503 /* this value a constant, but any compiler should be able to
3504 figure that out all by itself */
3505 offset = offsetof(MatchObject, string);
3506
3507 Py_XINCREF(self->pattern);
3508 Py_XINCREF(self->string);
3509 Py_XINCREF(self->regs);
3510
3511 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003512 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003513
3514 return (PyObject*) copy;
3515#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003516 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003517 return NULL;
3518#endif
3519}
3520
3521static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003522match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003523{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003524#ifdef USE_BUILTIN_COPY
3525 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003526
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003527 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003528 if (!copy)
3529 return NULL;
3530
3531 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3532 !deepcopy(&copy->string, memo) ||
3533 !deepcopy(&copy->regs, memo)) {
3534 Py_DECREF(copy);
3535 return NULL;
3536 }
3537
3538#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003539 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3540 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003541#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003542}
3543
/* Docstrings for the match type (match_doc, used as tp_doc of Match_Type)
   and for the entries of the match_methods table.  The text after each
   backslash-newline is part of the string literal, so the leading
   whitespace of continuation lines is significant. */
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003544PyDoc_STRVAR(match_doc,
3545"The result of re.match() and re.search().\n\
3546Match objects always have a boolean value of True.");
3547
3548PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003549"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003550 Return subgroup(s) of the match by indices or names.\n\
3551 For 0 returns the entire match.");
3552
3553PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003554"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003555 Return index of the start of the substring matched by group.");
3556
3557PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003558"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003559 Return index of the end of the substring matched by group.");
3560
3561PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003562"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003563 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3564
3565PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003566"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003567 Return a tuple containing all the subgroups of the match, from 1.\n\
3568 The default argument is used for groups\n\
3569 that did not participate in the match");
3570
3571PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003572"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003573 Return a dictionary containing all the named subgroups of the match,\n\
3574 keyed by the subgroup name. The default argument is used for groups\n\
3575 that did not participate in the match");
3576
3577PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003578"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003579 Return the string obtained by doing backslash substitution\n\
3580 on the string template, as done by the sub() method.");
3581
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003582static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003583 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3584 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3585 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3586 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3587 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3588 match_groups_doc},
3589 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3590 match_groupdict_doc},
3591 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003592 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3593 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003594 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003595};
3596
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003597static PyObject *
3598match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003599{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003600 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003601 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003602 Py_INCREF(Py_None);
3603 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003604}
3605
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003606static PyObject *
3607match_lastgroup_get(MatchObject *self)
3608{
3609 if (self->pattern->indexgroup && self->lastindex >= 0) {
3610 PyObject* result = PySequence_GetItem(
3611 self->pattern->indexgroup, self->lastindex
3612 );
3613 if (result)
3614 return result;
3615 PyErr_Clear();
3616 }
3617 Py_INCREF(Py_None);
3618 return Py_None;
3619}
3620
3621static PyObject *
3622match_regs_get(MatchObject *self)
3623{
3624 if (self->regs) {
3625 Py_INCREF(self->regs);
3626 return self->regs;
3627 } else
3628 return match_regs(self);
3629}
3630
3631static PyGetSetDef match_getset[] = {
3632 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3633 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3634 {"regs", (getter)match_regs_get, (setter)NULL},
3635 {NULL}
3636};
3637
3638#define MATCH_OFF(x) offsetof(MatchObject, x)
3639static PyMemberDef match_members[] = {
3640 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3641 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3642 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3643 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3644 {NULL}
3645};
3646
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3649
Neal Norwitz57c179c2006-03-22 07:18:02 +00003650static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003651 PyVarObject_HEAD_INIT(NULL,0)
3652 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003653 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003654 (destructor)match_dealloc, /* tp_dealloc */
3655 0, /* tp_print */
3656 0, /* tp_getattr */
3657 0, /* tp_setattr */
3658 0, /* tp_reserved */
3659 0, /* tp_repr */
3660 0, /* tp_as_number */
3661 0, /* tp_as_sequence */
3662 0, /* tp_as_mapping */
3663 0, /* tp_hash */
3664 0, /* tp_call */
3665 0, /* tp_str */
3666 0, /* tp_getattro */
3667 0, /* tp_setattro */
3668 0, /* tp_as_buffer */
3669 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003670 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003671 0, /* tp_traverse */
3672 0, /* tp_clear */
3673 0, /* tp_richcompare */
3674 0, /* tp_weaklistoffset */
3675 0, /* tp_iter */
3676 0, /* tp_iternext */
3677 match_methods, /* tp_methods */
3678 match_members, /* tp_members */
3679 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003680};
3681
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003682static PyObject*
3683pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3684{
3685 /* create match object (from state object) */
3686
3687 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003688 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003689 char* base;
3690 int n;
3691
3692 if (status > 0) {
3693
3694 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003695 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003696 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3697 2*(pattern->groups+1));
3698 if (!match)
3699 return NULL;
3700
3701 Py_INCREF(pattern);
3702 match->pattern = pattern;
3703
3704 Py_INCREF(state->string);
3705 match->string = state->string;
3706
3707 match->regs = NULL;
3708 match->groups = pattern->groups+1;
3709
3710 /* fill in group slices */
3711
3712 base = (char*) state->beginning;
3713 n = state->charsize;
3714
3715 match->mark[0] = ((char*) state->start - base) / n;
3716 match->mark[1] = ((char*) state->ptr - base) / n;
3717
3718 for (i = j = 0; i < pattern->groups; i++, j+=2)
3719 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3720 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3721 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3722 } else
3723 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3724
3725 match->pos = state->pos;
3726 match->endpos = state->endpos;
3727
3728 match->lastindex = state->lastindex;
3729
3730 return (PyObject*) match;
3731
3732 } else if (status == 0) {
3733
3734 /* no match */
3735 Py_INCREF(Py_None);
3736 return Py_None;
3737
3738 }
3739
3740 /* internal error */
3741 pattern_error(status);
3742 return NULL;
3743}
3744
3745
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003746/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003747/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003748
3749static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003750scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003751{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003752 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003753 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003754 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003755}
3756
3757static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003758scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003759{
3760 SRE_STATE* state = &self->state;
3761 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01003762 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003763
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003764 state_reset(state);
3765
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003766 state->ptr = state->start;
3767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768 if (state->logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003769 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003770 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003771 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003772 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003773 if (PyErr_Occurred())
3774 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003775
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003776 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003777 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003778
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003779 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003780 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003781 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003782 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003783
3784 return match;
3785}
3786
3787
3788static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003789scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003790{
3791 SRE_STATE* state = &self->state;
3792 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01003793 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003794
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003795 state_reset(state);
3796
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003797 state->ptr = state->start;
3798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799 if (state->logical_charsize == 1) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003800 status = sre_search(state, PatternObject_GetCode(self->pattern));
3801 } else {
3802 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3803 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003804 if (PyErr_Occurred())
3805 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003806
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003807 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003808 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003809
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003810 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003811 state->start = (void*) ((char*) state->ptr + state->charsize);
3812 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003813 state->start = state->ptr;
3814
3815 return match;
3816}
3817
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003818static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003819 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3820 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003821 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003822};
3823
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003824#define SCAN_OFF(x) offsetof(ScannerObject, x)
3825static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03003826 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003827 {NULL} /* Sentinel */
3828};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003829
Neal Norwitz57c179c2006-03-22 07:18:02 +00003830static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003831 PyVarObject_HEAD_INIT(NULL, 0)
3832 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003833 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003834 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003835 0, /* tp_print */
3836 0, /* tp_getattr */
3837 0, /* tp_setattr */
3838 0, /* tp_reserved */
3839 0, /* tp_repr */
3840 0, /* tp_as_number */
3841 0, /* tp_as_sequence */
3842 0, /* tp_as_mapping */
3843 0, /* tp_hash */
3844 0, /* tp_call */
3845 0, /* tp_str */
3846 0, /* tp_getattro */
3847 0, /* tp_setattro */
3848 0, /* tp_as_buffer */
3849 Py_TPFLAGS_DEFAULT, /* tp_flags */
3850 0, /* tp_doc */
3851 0, /* tp_traverse */
3852 0, /* tp_clear */
3853 0, /* tp_richcompare */
3854 0, /* tp_weaklistoffset */
3855 0, /* tp_iter */
3856 0, /* tp_iternext */
3857 scanner_methods, /* tp_methods */
3858 scanner_members, /* tp_members */
3859 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003860};
3861
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003862static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003863pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003864{
3865 /* create search state object */
3866
3867 ScannerObject* self;
3868
3869 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003870 Py_ssize_t start = 0;
3871 Py_ssize_t end = PY_SSIZE_T_MAX;
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003872 static char* kwlist[] = { "source", "pos", "endpos", NULL };
3873 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:scanner", kwlist,
3874 &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003875 return NULL;
3876
3877 /* create scanner object */
3878 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3879 if (!self)
3880 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003881 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003882
3883 string = state_init(&self->state, pattern, string, start, end);
3884 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003885 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003886 return NULL;
3887 }
3888
3889 Py_INCREF(pattern);
3890 self->pattern = (PyObject*) pattern;
3891
3892 return (PyObject*) self;
3893}
3894
Guido van Rossumb700df92000-03-31 14:59:30 +00003895static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003896 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003897 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003898 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003899 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003900};
3901
Martin v. Löwis1a214512008-06-11 05:26:20 +00003902static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003903 PyModuleDef_HEAD_INIT,
3904 "_" SRE_MODULE,
3905 NULL,
3906 -1,
3907 _functions,
3908 NULL,
3909 NULL,
3910 NULL,
3911 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00003912};
3913
3914PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003915{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003916 PyObject* m;
3917 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003918 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003919
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00003920 /* Patch object types */
3921 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3922 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00003923 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003924
Martin v. Löwis1a214512008-06-11 05:26:20 +00003925 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003926 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003927 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003928 d = PyModule_GetDict(m);
3929
Christian Heimes217cfd12007-12-02 14:31:20 +00003930 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003931 if (x) {
3932 PyDict_SetItemString(d, "MAGIC", x);
3933 Py_DECREF(x);
3934 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003935
Christian Heimes217cfd12007-12-02 14:31:20 +00003936 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003937 if (x) {
3938 PyDict_SetItemString(d, "CODESIZE", x);
3939 Py_DECREF(x);
3940 }
3941
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003942 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
3943 if (x) {
3944 PyDict_SetItemString(d, "MAXREPEAT", x);
3945 Py_DECREF(x);
3946 }
3947
Neal Norwitzfe537132007-08-26 03:55:15 +00003948 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003949 if (x) {
3950 PyDict_SetItemString(d, "copyright", x);
3951 Py_DECREF(x);
3952 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003953 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003954}
3955
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003956#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003957
3958/* vim:ts=4:sw=4:et
3959*/