blob: 41ed667a37e655bff6704f5533cb30d06a3655f5 [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl created (based on existing template matcher code)
 * 2000-03-06 fl first alpha, sort of
 * 2000-08-01 fl fixes for 1.6b1
 * 2000-08-07 fl use PyOS_CheckStack() if available
 * 2000-09-20 fl added expand method
 * 2001-03-20 fl lots of fixes for 2.1b2
 * 2001-04-15 fl export copyright as Python attribute, not global
 * 2001-04-28 fl added __copy__ methods (work in progress)
 * 2001-05-14 fl fixes for 1.5.2 compatibility
 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl added sub/subn primitive
 * 2001-10-24 fl added finditer primitive (for 2.2 only)
 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 * 2003-10-17 gn implemented non recursive scheme
 *
 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license. For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
36
37#ifndef SRE_RECURSIVE
38
/* version / copyright banner string for this build of the engine */
static char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
/* name of this module, minus the leading underscore */
#if !defined(SRE_MODULE)
#define SRE_MODULE "sre"
#endif

#define SRE_PY_MODULE "re"

/* defining this one enables tracing */
#undef VERBOSE

/* defining this enables unicode support (default under 1.6a1 and later) */
#define HAVE_UNICODE

/* -------------------------------------------------------------------- */
/* optional features */

/* enables fast searching */
#define USE_FAST_SEARCH

/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY
72
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000073/* -------------------------------------------------------------------- */
74
/* LOCAL(type) spells the declaration prefix for file-private helpers that
   return `type`; the exact spelling depends on the compiler */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif
85
/* error codes */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */

/* TRACE((fmt, ...)) takes a parenthesized printf argument list; it expands
   to a printf call when VERBOSE is defined and to nothing otherwise */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
98
/* -------------------------------------------------------------------- */
/* search engine state */

/* default character predicates (run sre_chars.py to regenerate tables) */

/* bit flags stored per character in sre_char_info[] */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII. create tables in init_sre() instead */

/* category flags for code points 0..127, sixteen entries per row */
static char sre_char_info[128] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /* 0x60 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};

/* lowercase mapping for code points 0..127: identity except that 65..90
   ('A'..'Z') map to 97..122 ('a'..'z') */
static char sre_char_lower[128] = {
    /* 0x00 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /* 0x10 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /* 0x20 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /* 0x30 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /* 0x40 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
               93, 94, 95,
    /* 0x60 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
               124, 125, 126, 127
};

/* each predicate yields the (nonzero) mask bit when the flag is set for ch,
   and 0 when it is clear or when ch is 128 or above */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000140
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000141static unsigned int sre_lower(unsigned int ch)
142{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000143 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000144}
145
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
/* values with any bit above the low eight set never reach the <ctype.h>
   functions; the predicate is simply 0 for them */
#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
154
/* Lowercase ch with the C library's tolower().  Values of 256 and above
   are returned unchanged without calling tolower(). */
static unsigned int sre_lower_locale(unsigned int ch)
{
    return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
}
159
/* unicode-specific character predicates */

/* thin wrappers over the Py_UNICODE_* classification macros; a "word"
   character is anything alphanumeric plus the underscore */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000167
/* Lowercase ch by delegating to Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER(ch);
}
172
Guido van Rossumb700df92000-03-31 14:59:30 +0000173LOCAL(int)
174sre_category(SRE_CODE category, unsigned int ch)
175{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000176 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000177
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000178 case SRE_CATEGORY_DIGIT:
179 return SRE_IS_DIGIT(ch);
180 case SRE_CATEGORY_NOT_DIGIT:
181 return !SRE_IS_DIGIT(ch);
182 case SRE_CATEGORY_SPACE:
183 return SRE_IS_SPACE(ch);
184 case SRE_CATEGORY_NOT_SPACE:
185 return !SRE_IS_SPACE(ch);
186 case SRE_CATEGORY_WORD:
187 return SRE_IS_WORD(ch);
188 case SRE_CATEGORY_NOT_WORD:
189 return !SRE_IS_WORD(ch);
190 case SRE_CATEGORY_LINEBREAK:
191 return SRE_IS_LINEBREAK(ch);
192 case SRE_CATEGORY_NOT_LINEBREAK:
193 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000195 case SRE_CATEGORY_LOC_WORD:
196 return SRE_LOC_IS_WORD(ch);
197 case SRE_CATEGORY_LOC_NOT_WORD:
198 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000199
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000200 case SRE_CATEGORY_UNI_DIGIT:
201 return SRE_UNI_IS_DIGIT(ch);
202 case SRE_CATEGORY_UNI_NOT_DIGIT:
203 return !SRE_UNI_IS_DIGIT(ch);
204 case SRE_CATEGORY_UNI_SPACE:
205 return SRE_UNI_IS_SPACE(ch);
206 case SRE_CATEGORY_UNI_NOT_SPACE:
207 return !SRE_UNI_IS_SPACE(ch);
208 case SRE_CATEGORY_UNI_WORD:
209 return SRE_UNI_IS_WORD(ch);
210 case SRE_CATEGORY_UNI_NOT_WORD:
211 return !SRE_UNI_IS_WORD(ch);
212 case SRE_CATEGORY_UNI_LINEBREAK:
213 return SRE_UNI_IS_LINEBREAK(ch);
214 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
215 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000216 }
217 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000218}
219
220/* helpers */
221
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000222static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000223data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000224{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000225 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000226 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000227 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000228 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000229 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000230}
231
232static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000233data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000234{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000235 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000236 minsize = state->data_stack_base+size;
237 cursize = state->data_stack_size;
238 if (cursize < minsize) {
239 void* stack;
240 cursize = minsize+minsize/4+1024;
241 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000243 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000244 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000245 return SRE_ERROR_MEMORY;
246 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000247 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000248 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000249 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000250 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000251}
252
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000253/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000254
255#define SRE_CHAR unsigned char
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200256#define SRE_CHARGET(state, buf, index) ((unsigned char*)buf)[index]
Guido van Rossumb700df92000-03-31 14:59:30 +0000257#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000258#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000259#define SRE_CHARSET sre_charset
260#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000261#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000262#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000263#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000264
Guido van Rossumb700df92000-03-31 14:59:30 +0000265#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000266#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000267#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000268
Guido van Rossumb700df92000-03-31 14:59:30 +0000269#undef SRE_SEARCH
270#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000271#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000272#undef SRE_INFO
273#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000274#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000275#undef SRE_AT
276#undef SRE_CHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200277#undef SRE_CHARGET
Guido van Rossumb700df92000-03-31 14:59:30 +0000278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279/* generate 8/16/32-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281#define SRE_CHAR void
282#define SRE_CHARGET(state, buf, index) \
283 ((state->charsize==1) ? ((Py_UCS1*)buf)[index] : \
284 (state->charsize==2) ? ((Py_UCS2*)buf)[index] : \
285 ((Py_UCS4*)buf)[index])
Guido van Rossumb700df92000-03-31 14:59:30 +0000286#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000287#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000288#define SRE_CHARSET sre_ucharset
289#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000290#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000291#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000292#define SRE_SEARCH sre_usearch
293
294#endif /* SRE_RECURSIVE */
295
296/* -------------------------------------------------------------------- */
297/* String matching engine */
298
299/* the following section is compiled twice, with different character
300 settings */
301
302LOCAL(int)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200303SRE_AT(SRE_STATE* state, char* ptr, SRE_CODE at)
Guido van Rossumb700df92000-03-31 14:59:30 +0000304{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000305 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000306
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000307 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000308
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000309 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000310
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000311 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000312 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000313 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000315 case SRE_AT_BEGINNING_LINE:
316 return ((void*) ptr == state->beginning ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200317 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, -1)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000319 case SRE_AT_END:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 return (((void*) (ptr+state->charsize) == state->end &&
321 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))) ||
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000322 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000323
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000324 case SRE_AT_END_LINE:
325 return ((void*) ptr == state->end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200326 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000327
Fredrik Lundh770617b2001-01-14 15:06:11 +0000328 case SRE_AT_END_STRING:
329 return ((void*) ptr == state->end);
330
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000331 case SRE_AT_BOUNDARY:
332 if (state->beginning == state->end)
333 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000334 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200335 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000336 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200337 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000338 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 case SRE_AT_NON_BOUNDARY:
341 if (state->beginning == state->end)
342 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000343 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200344 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000345 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000347 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000348
349 case SRE_AT_LOC_BOUNDARY:
350 if (state->beginning == state->end)
351 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000352 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200353 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000354 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200355 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000357
358 case SRE_AT_LOC_NON_BOUNDARY:
359 if (state->beginning == state->end)
360 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000363 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200364 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000366
367 case SRE_AT_UNI_BOUNDARY:
368 if (state->beginning == state->end)
369 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000370 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200371 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200373 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000375
376 case SRE_AT_UNI_NON_BOUNDARY:
377 if (state->beginning == state->end)
378 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200382 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000384
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000385 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000386
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000387 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000388}
389
390LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000391SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000392{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000393 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000394
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000395 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000396
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000397 for (;;) {
398 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000399
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000400 case SRE_OP_FAILURE:
401 return !ok;
402
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000403 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000404 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000405 if (ch == set[0])
406 return ok;
407 set++;
408 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000409
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000410 case SRE_OP_CATEGORY:
411 /* <CATEGORY> <code> */
412 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000413 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000414 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000415 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000416
Fredrik Lundh3562f112000-07-02 12:00:07 +0000417 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000418 if (sizeof(SRE_CODE) == 2) {
419 /* <CHARSET> <bitmap> (16 bits per code word) */
420 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
421 return ok;
422 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000423 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000424 else {
425 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith90555d02012-12-10 17:44:44 -0800426 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000427 return ok;
428 set += 8;
429 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000430 break;
431
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000432 case SRE_OP_RANGE:
433 /* <RANGE> <lower> <upper> */
434 if (set[0] <= ch && ch <= set[1])
435 return ok;
436 set += 2;
437 break;
438
439 case SRE_OP_NEGATE:
440 ok = !ok;
441 break;
442
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000443 case SRE_OP_BIGCHARSET:
444 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
445 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000446 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000447 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000448
449 if (sizeof(SRE_CODE) == 2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000451 set += 128;
452 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
453 return ok;
454 set += count*16;
455 }
456 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000457 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
458 * warnings when c's type supports only numbers < N+1 */
459 if (!(ch & ~65535))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200460 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000461 else
462 block = -1;
463 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000464 if (block >=0 &&
Gregory P. Smith90555d02012-12-10 17:44:44 -0800465 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000466 return ok;
467 set += count*8;
468 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000469 break;
470 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000471
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000472 default:
473 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000474 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000475 return 0;
476 }
477 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000478}
479
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000480LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000481
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000482LOCAL(Py_ssize_t)
483SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000484{
485 SRE_CODE chr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200486 char* ptr = (char *)state->ptr;
487 char* end = (char *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000488 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000489
490 /* adjust end */
Serhiy Storchakaa0eb8092013-02-16 16:54:33 +0200491 if (maxcount < (end - ptr) / state->charsize && maxcount != SRE_MAXREPEAT)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200492 end = ptr + maxcount*state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000493
494 switch (pattern[0]) {
495
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000496 case SRE_OP_IN:
497 /* repeated set */
498 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
Victor Stinner63ab8752011-11-22 03:31:20 +0100499 while (ptr < end &&
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200500 SRE_CHARSET(pattern + 2, SRE_CHARGET(state, ptr, 0)))
501 ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000502 break;
503
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000504 case SRE_OP_ANY:
505 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000506 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200507 while (ptr < end && !SRE_IS_LINEBREAK(SRE_CHARGET(state, ptr, 0)))
508 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000509 break;
510
511 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000512 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000514 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000515 ptr = end;
516 break;
517
518 case SRE_OP_LITERAL:
519 /* repeated literal */
520 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000521 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200522 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) == chr)
523 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000524 break;
525
526 case SRE_OP_LITERAL_IGNORE:
527 /* repeated literal */
528 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000529 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) == chr)
531 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000532 break;
533
534 case SRE_OP_NOT_LITERAL:
535 /* repeated non-literal */
536 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000537 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
539 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000540 break;
Tim Peters3d563502006-01-21 02:47:53 +0000541
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000542 case SRE_OP_NOT_LITERAL_IGNORE:
543 /* repeated non-literal */
544 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000545 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) != chr)
547 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000548 break;
549
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000550 default:
551 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000552 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553 while ((char*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000554 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000555 if (i < 0)
556 return i;
557 if (!i)
558 break;
559 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000560 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561 ((char*)state->ptr - ptr)/state->charsize));
562 return ((char*)state->ptr - ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000563 }
564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, (ptr - (char*) state->ptr)/state->charsize));
566 return (ptr - (char*) state->ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000567}
568
#if 0 /* not used in this release */
/* Check an SRE_OP_INFO block against the current position (state->ptr).
   Fails when fewer than pattern[3] characters remain (if pattern[3] is
   nonzero), or when a known literal prefix longer than one character does
   not match. */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    char* end = state->end;
    char* ptr = state->ptr;
    Py_ssize_t i;

    /* check minimal length */
    if (pattern[3] && (end - ptr)/state->charsize < pattern[3])
        return 0;

    /* check known prefix */
    if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
        /* <length> <skip> <prefix data> <overlap data> */
        for (i = 0; i < pattern[5]; i++)
            if ((SRE_CODE) SRE_CHARGET(state, ptr, i) != pattern[7 + i])
                return 0;
        return pattern[0] + 2 * pattern[6];
    }
    return pattern[0];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000596
/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (IOW, those
 * that will backtrack). In detail:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (apart from atypical cases like ASSERT_NOT), therefore there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: If the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark is pointing to.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 */
/* Record state->lastmark and state->lastindex in the current match
   context (ctx) so they can be put back after a failed sub-match. */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastindex = state->lastindex; \
        ctx->lastmark = state->lastmark; \
    } while (0)

/* Copy the values recorded in ctx back into state. */
#define LASTMARK_RESTORE() \
    do { \
        state->lastindex = ctx->lastindex; \
        state->lastmark = ctx->lastmark; \
    } while (0)
635
/* Exit helpers.  They expect a result variable `ret` and a label `exit`
   to be in scope where they are expanded.  The argument `i` is
   evaluated more than once, so pass an expression without side
   effects; it is parenthesized so that compound expressions such as
   `a & b` are compared as a whole. */

/* leave the enclosing function right away with status i */
#define RETURN_ERROR(i) do { return (i); } while(0)
/* store "no match" (0) / "match" (1) in ret and continue at exit */
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

/* i < 0: hand the error code back to the caller */
#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
/* i < 0: error; i > 0: success; i == 0: fall through */
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
/* i < 0: error; i == 0: failure; i > 0: fall through */
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
646
/* turn a macro argument into a string literal (type names in traces) */
#define SFY(x) #x

/* Reserve sizeof(type) bytes on top of the data stack and make ptr point
   at them; alloc_pos receives the byte offset of the reserved area.
   When the remaining capacity is too small the stack is grown first:
   a negative result from data_stack_grow() is returned from the
   enclosing function, and ctx is looked up again from ctx_pos (unless
   ctx_pos is -1) -- presumably because growing may move the stack. */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = state->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), alloc_pos, sizeof(type))); \
    if (state->data_stack_size - alloc_pos < sizeof(type)) { \
        int grown = data_stack_grow(state, sizeof(type)); \
        if (grown < 0) return grown; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    state->data_stack_base += sizeof(type); \
    ptr = (type*)(state->data_stack+alloc_pos); \
} while (0)
663
/* Make ptr point at the object of the given type stored at byte offset
   pos of the data stack.  No bounds check is performed.  pos is
   parenthesized so that any expression can be passed as the offset. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), pos)); \
    ptr = (type*)(state->data_stack+(pos)); \
} while (0)
669
/* Copy `size` bytes from `data` onto the top of the data stack and
   advance data_stack_base by `size`.  When the remaining capacity is
   too small the stack is grown first: a negative result from
   data_stack_grow() is returned from the enclosing function, and ctx
   is looked up again from ctx_pos (unless ctx_pos is -1).
   `size` is evaluated several times; it is parenthesized where operator
   precedence could otherwise change its meaning. */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           data, state->data_stack_base, size)); \
    if ((size) > state->data_stack_size - state->data_stack_base) { \
        int j = data_stack_grow(state, size); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy(state->data_stack+state->data_stack_base, data, size); \
    state->data_stack_base += (size); \
} while (0)
683
/* Copy the topmost `size` bytes of the data stack into `data`.  The
   bytes are removed from the stack (data_stack_base is decreased) only
   when `discard` is non-zero.  `size` is parenthesized so that an
   expression such as `a + b` is subtracted as a whole. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           data, state->data_stack_base-(size), size)); \
    memcpy(data, state->data_stack+state->data_stack_base-(size), size); \
    if (discard) \
        state->data_stack_base -= (size); \
} while (0)

/* Drop the topmost `size` bytes of the data stack without copying. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           state->data_stack_base-(size), size)); \
    state->data_stack_base -= (size); \
} while(0)
699
/* Shorthands for the DATA_STACK_* macros that always operate on the
   variable named `state` at the point of use. */

/* push / pop / drop the object x points to (sizeof(*(x)) bytes) */
#define DATA_PUSH(x)            DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x)             DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x)     DATA_STACK_POP_DISCARD(state, sizeof(*(x)))

/* reserve room for a new object of type t, leaving its address in p */
#define DATA_ALLOC(t,p)         DATA_STACK_ALLOC(state, t, p)

/* make p point at the object of type t stored at offset pos */
#define DATA_LOOKUP_AT(t,p,pos) DATA_STACK_LOOKUP_AT(state,t,p,pos)
710
/* Save / restore the first lastmark+1 entries of state->mark on the data
   stack.  All four macros do nothing when lastmark is not positive.
   The argument is parenthesized so that any expression can be passed. */

/* push state->mark[0..lastmark]; uses the local variable `i` */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
/* copy the saved marks back into state->mark and remove them */
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
/* copy the saved marks back but leave them on the data stack */
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
/* remove the saved marks without touching state->mark */
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
728
/* Identifiers for the DO_JUMP() call sites; the value is stored in the
   `jump` field of a match context.  JUMP_NONE marks the initial
   context.  NOTE(review): presumably used to resume at the matching
   label once the nested match finishes -- confirm at the dispatch. */
#define JUMP_NONE            0
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
743
/* Begin a nested match of nextpattern by jumping to the `entrance`
   label instead of calling the matcher again.  A fresh context is
   allocated on the data stack; it remembers the position of the current
   context (last_ctx_pos) and the call site (jumpvalue), then becomes the
   current context.  jumplabel marks the statement following the macro.
   NOTE(review): control presumably arrives at jumplabel when the nested
   match finishes -- the code that does so is outside this macro.
   This expands to several statements, so do not use it as the body of
   an unbraced if/else. */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->pattern = nextpattern; \
    nextctx->jump = jumpvalue; \
    nextctx->last_ctx_pos = ctx_pos; \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
754
755typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000756 Py_ssize_t last_ctx_pos;
757 Py_ssize_t jump;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758 char* ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000759 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000760 Py_ssize_t count;
761 Py_ssize_t lastmark;
762 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000763 union {
764 SRE_CODE chr;
765 SRE_REPEAT* rep;
766 } u;
767} SRE_MATCH_CONTEXT;
768
769/* check if string matches the given pattern. returns <0 for
770 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000771LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000772SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200774 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000775 Py_ssize_t alloc_pos, ctx_pos = -1;
776 Py_ssize_t i, ret = 0;
777 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000778 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000779
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000780 SRE_MATCH_CONTEXT* ctx;
781 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000782
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000783 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000784
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000785 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
786 ctx->last_ctx_pos = -1;
787 ctx->jump = JUMP_NONE;
788 ctx->pattern = pattern;
789 ctx_pos = alloc_pos;
790
791entrance:
792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793 ctx->ptr = (char *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000794
795 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000796 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000797 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200798 if (ctx->pattern[3] && (end - ctx->ptr)/state->charsize < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000799 TRACE(("reject (got %d chars, need %d)\n",
Serhiy Storchakac1b59d42012-12-29 23:38:48 +0200800 (end - ctx->ptr)/state->charsize, ctx->pattern[3]));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000801 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000802 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000803 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000804 }
805
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000806 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000807 ++sigcount;
808 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
809 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000810
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000811 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000812
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000813 case SRE_OP_MARK:
814 /* set mark */
815 /* <MARK> <gid> */
816 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
817 ctx->ptr, ctx->pattern[0]));
818 i = ctx->pattern[0];
819 if (i & 1)
820 state->lastindex = i/2 + 1;
821 if (i > state->lastmark) {
822 /* state->lastmark is the highest valid index in the
823 state->mark array. If it is increased by more than 1,
824 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000825 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000826 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000827 while (j < i)
828 state->mark[j++] = NULL;
829 state->lastmark = i;
830 }
831 state->mark[i] = ctx->ptr;
832 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000833 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000834
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000835 case SRE_OP_LITERAL:
836 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000837 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000838 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
839 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000841 RETURN_FAILURE;
842 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200843 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000844 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000845
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000846 case SRE_OP_NOT_LITERAL:
847 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000848 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000849 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
850 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200851 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) == ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000852 RETURN_FAILURE;
853 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000855 break;
856
857 case SRE_OP_SUCCESS:
858 /* end of pattern */
859 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
860 state->ptr = ctx->ptr;
861 RETURN_SUCCESS;
862
863 case SRE_OP_AT:
864 /* match at given position */
865 /* <AT> <code> */
866 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
867 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
868 RETURN_FAILURE;
869 ctx->pattern++;
870 break;
871
872 case SRE_OP_CATEGORY:
873 /* match at given category */
874 /* <CATEGORY> <code> */
875 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
876 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], SRE_CHARGET(state, ctx->ptr, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000878 RETURN_FAILURE;
879 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000881 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000882
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000883 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000884 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000885 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000886 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 if (ctx->ptr >= end || SRE_IS_LINEBREAK(SRE_CHARGET(state, ctx->ptr, 0)))
888 RETURN_FAILURE;
889 ctx->ptr += state->charsize;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000890 break;
891
892 case SRE_OP_ANY_ALL:
893 /* match anything */
894 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000895 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
896 if (ctx->ptr >= end)
897 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000899 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000900
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000901 case SRE_OP_IN:
902 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000903 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000904 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, SRE_CHARGET(state, ctx->ptr, 0)))
906 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000907 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000911 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000912 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
913 ctx->pattern, ctx->ptr, ctx->pattern[0]));
914 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000916 RETURN_FAILURE;
917 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200918 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000919 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000920
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000921 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000922 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
923 ctx->pattern, ctx->ptr, *ctx->pattern));
924 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200925 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) == state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000926 RETURN_FAILURE;
927 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000929 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000930
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000931 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000932 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
933 if (ctx->ptr >= end
934 || !SRE_CHARSET(ctx->pattern+1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935 (SRE_CODE)state->lower(SRE_CHARGET(state, ctx->ptr, 0))))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000936 RETURN_FAILURE;
937 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200938 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000939 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000940
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000941 case SRE_OP_JUMP:
942 case SRE_OP_INFO:
943 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000944 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000945 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
946 ctx->ptr, ctx->pattern[0]));
947 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000948 break;
949
950 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000951 /* alternation */
952 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000953 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000954 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000955 ctx->u.rep = state->repeat;
956 if (ctx->u.rep)
957 MARK_PUSH(ctx->lastmark);
958 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
959 if (ctx->pattern[1] == SRE_OP_LITERAL &&
960 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961 (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000962 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000963 if (ctx->pattern[1] == SRE_OP_IN &&
964 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0))))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000966 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000967 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000968 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000969 if (ret) {
970 if (ctx->u.rep)
971 MARK_POP_DISCARD(ctx->lastmark);
972 RETURN_ON_ERROR(ret);
973 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000974 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000975 if (ctx->u.rep)
976 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000977 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000978 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000979 if (ctx->u.rep)
980 MARK_POP_DISCARD(ctx->lastmark);
981 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +0000982
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000983 case SRE_OP_REPEAT_ONE:
984 /* match repeated sequence (maximizing regexp) */
985
986 /* this operator only works if the repeated item is
987 exactly one character wide, and we're not already
988 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +0000989 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000990
991 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
992
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000993 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
994 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000995
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +0300996 if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000997 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +0000998
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000999 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001000
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001001 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1002 RETURN_ON_ERROR(ret);
1003 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1004 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005 ctx->ptr += state->charsize * ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001006
1007 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001008 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001009 string. check if the rest of the pattern matches,
1010 and backtrack if not. */
1011
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001012 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001013 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001014
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001015 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001016 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001017 state->ptr = ctx->ptr;
1018 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001019 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001020
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001021 LASTMARK_SAVE();
1022
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001023 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001024 /* tail starts with a literal. skip positions where
1025 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001026 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001027 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001028 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Victor Stinner63ab8752011-11-22 03:31:20 +01001029 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 SRE_CHARGET(state, ctx->ptr, 0) != ctx->u.chr)) {
1031 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001032 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001033 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001034 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001037 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1038 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001039 if (ret) {
1040 RETURN_ON_ERROR(ret);
1041 RETURN_SUCCESS;
1042 }
Tim Peters3d563502006-01-21 02:47:53 +00001043
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001044 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001047 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001048 }
1049
1050 } else {
1051 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001052 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001053 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1055 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001056 if (ret) {
1057 RETURN_ON_ERROR(ret);
1058 RETURN_SUCCESS;
1059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001062 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001063 }
1064 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001065 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001066
Guido van Rossum41c99e72003-04-14 17:59:34 +00001067 case SRE_OP_MIN_REPEAT_ONE:
1068 /* match repeated sequence (minimizing regexp) */
1069
1070 /* this operator only works if the repeated item is
1071 exactly one character wide, and we're not already
1072 collecting backtracking points. for other cases,
1073 use the MIN_REPEAT operator */
1074
1075 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1076
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001077 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1078 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001079
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001080 if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001082
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001083 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001084
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001085 if (ctx->pattern[1] == 0)
1086 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001087 else {
1088 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001089 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1090 RETURN_ON_ERROR(ret);
1091 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001092 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001093 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001094 RETURN_FAILURE;
1095 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001096 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 ctx->ptr += state->charsize * ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001098 }
1099
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001100 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001101 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001102 state->ptr = ctx->ptr;
1103 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001104
1105 } else {
1106 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001107 LASTMARK_SAVE();
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001108 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001109 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001111 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1112 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001113 if (ret) {
1114 RETURN_ON_ERROR(ret);
1115 RETURN_SUCCESS;
1116 }
1117 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001118 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001119 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001120 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001121 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001122 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001123 assert(ret == 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001125 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001126 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001127 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001129 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001130
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001131 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001132 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001133 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001134 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001135 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1136 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001137
1138 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001139 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001140 if (!ctx->u.rep) {
1141 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001142 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001143 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001144 ctx->u.rep->count = -1;
1145 ctx->u.rep->pattern = ctx->pattern;
1146 ctx->u.rep->prev = state->repeat;
1147 ctx->u.rep->last_ptr = NULL;
1148 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001149
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001150 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001151 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001152 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001153 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001154
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001155 if (ret) {
1156 RETURN_ON_ERROR(ret);
1157 RETURN_SUCCESS;
1158 }
1159 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001160
1161 case SRE_OP_MAX_UNTIL:
1162 /* maximizing repeat */
1163 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1164
1165 /* FIXME: we probably need to deal with zero-width
1166 matches in here... */
1167
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001168 ctx->u.rep = state->repeat;
1169 if (!ctx->u.rep)
1170 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001171
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001172 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001173
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001174 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001175
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001176 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1177 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001178
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001179 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001180 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001181 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001182 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1183 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001184 if (ret) {
1185 RETURN_ON_ERROR(ret);
1186 RETURN_SUCCESS;
1187 }
1188 ctx->u.rep->count = ctx->count-1;
1189 state->ptr = ctx->ptr;
1190 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001191 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001192
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001193 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001194 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001196 /* we may have enough matches, but if we can
1197 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001198 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001199 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001200 MARK_PUSH(ctx->lastmark);
1201 /* zero-width match protection */
1202 DATA_PUSH(&ctx->u.rep->last_ptr);
1203 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1205 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 DATA_POP(&ctx->u.rep->last_ptr);
1207 if (ret) {
1208 MARK_POP_DISCARD(ctx->lastmark);
1209 RETURN_ON_ERROR(ret);
1210 RETURN_SUCCESS;
1211 }
1212 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001213 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001214 ctx->u.rep->count = ctx->count-1;
1215 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001216 }
1217
1218 /* cannot match more repeated items here. make sure the
1219 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001220 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001221 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001222 RETURN_ON_SUCCESS(ret);
1223 state->repeat = ctx->u.rep;
1224 state->ptr = ctx->ptr;
1225 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001226
1227 case SRE_OP_MIN_UNTIL:
1228 /* minimizing repeat */
1229 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1230
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001231 ctx->u.rep = state->repeat;
1232 if (!ctx->u.rep)
1233 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001234
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001235 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001236
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001237 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001238
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001239 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1240 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001241
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001242 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001243 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001244 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001245 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1246 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001247 if (ret) {
1248 RETURN_ON_ERROR(ret);
1249 RETURN_SUCCESS;
1250 }
1251 ctx->u.rep->count = ctx->count-1;
1252 state->ptr = ctx->ptr;
1253 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001254 }
1255
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001256 LASTMARK_SAVE();
1257
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001258 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001259 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001260 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001261 if (ret) {
1262 RETURN_ON_ERROR(ret);
1263 RETURN_SUCCESS;
1264 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001265
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001266 state->repeat = ctx->u.rep;
1267 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001269 LASTMARK_RESTORE();
1270
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001271 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
Serhiy Storchakafa468162013-02-16 21:23:53 +02001272 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1273 state->ptr == ctx->u.rep->last_ptr)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001274 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001275
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001276 ctx->u.rep->count = ctx->count;
Serhiy Storchakafa468162013-02-16 21:23:53 +02001277 /* zero-width match protection */
1278 DATA_PUSH(&ctx->u.rep->last_ptr);
1279 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001280 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1281 ctx->u.rep->pattern+3);
Serhiy Storchakafa468162013-02-16 21:23:53 +02001282 DATA_POP(&ctx->u.rep->last_ptr);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001283 if (ret) {
1284 RETURN_ON_ERROR(ret);
1285 RETURN_SUCCESS;
1286 }
1287 ctx->u.rep->count = ctx->count-1;
1288 state->ptr = ctx->ptr;
1289 RETURN_FAILURE;
1290
1291 case SRE_OP_GROUPREF:
1292 /* match backreference */
1293 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1294 ctx->ptr, ctx->pattern[0]));
1295 i = ctx->pattern[0];
1296 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001297 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001298 if (groupref >= state->lastmark) {
1299 RETURN_FAILURE;
1300 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301 char* p = (char*) state->mark[groupref];
1302 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001303 if (!p || !e || e < p)
1304 RETURN_FAILURE;
1305 while (p < e) {
Victor Stinner63ab8752011-11-22 03:31:20 +01001306 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 SRE_CHARGET(state, ctx->ptr, 0) != SRE_CHARGET(state, p, 0))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001308 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309 p += state->charsize;
1310 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001311 }
1312 }
1313 }
1314 ctx->pattern++;
1315 break;
1316
1317 case SRE_OP_GROUPREF_IGNORE:
1318 /* match backreference */
1319 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1320 ctx->ptr, ctx->pattern[0]));
1321 i = ctx->pattern[0];
1322 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001323 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001324 if (groupref >= state->lastmark) {
1325 RETURN_FAILURE;
1326 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 char* p = (char*) state->mark[groupref];
1328 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001329 if (!p || !e || e < p)
1330 RETURN_FAILURE;
1331 while (p < e) {
1332 if (ctx->ptr >= end ||
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001333 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) !=
1334 state->lower(SRE_CHARGET(state, p, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001335 RETURN_FAILURE;
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001336 p += state->charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001338 }
1339 }
1340 }
1341 ctx->pattern++;
1342 break;
1343
1344 case SRE_OP_GROUPREF_EXISTS:
1345 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1346 ctx->ptr, ctx->pattern[0]));
1347 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1348 i = ctx->pattern[0];
1349 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001350 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001351 if (groupref >= state->lastmark) {
1352 ctx->pattern += ctx->pattern[1];
1353 break;
1354 } else {
1355 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1356 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1357 if (!p || !e || e < p) {
1358 ctx->pattern += ctx->pattern[1];
1359 break;
1360 }
1361 }
1362 }
1363 ctx->pattern += 2;
1364 break;
1365
1366 case SRE_OP_ASSERT:
1367 /* assert subpattern */
1368 /* <ASSERT> <skip> <back> <pattern> */
1369 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1370 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001372 if (state->ptr < state->beginning)
1373 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001374 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001375 RETURN_ON_FAILURE(ret);
1376 ctx->pattern += ctx->pattern[0];
1377 break;
1378
1379 case SRE_OP_ASSERT_NOT:
1380 /* assert not subpattern */
1381 /* <ASSERT_NOT> <skip> <back> <pattern> */
1382 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1383 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001385 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001386 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001387 if (ret) {
1388 RETURN_ON_ERROR(ret);
1389 RETURN_FAILURE;
1390 }
1391 }
1392 ctx->pattern += ctx->pattern[0];
1393 break;
1394
1395 case SRE_OP_FAILURE:
1396 /* immediate failure */
1397 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1398 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001399
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001400 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001401 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1402 ctx->pattern[-1]));
1403 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001404 }
1405 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001406
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001407exit:
1408 ctx_pos = ctx->last_ctx_pos;
1409 jump = ctx->jump;
1410 DATA_POP_DISCARD(ctx);
1411 if (ctx_pos == -1)
1412 return ret;
1413 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1414
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001415 switch (jump) {
1416 case JUMP_MAX_UNTIL_2:
1417 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1418 goto jump_max_until_2;
1419 case JUMP_MAX_UNTIL_3:
1420 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1421 goto jump_max_until_3;
1422 case JUMP_MIN_UNTIL_2:
1423 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1424 goto jump_min_until_2;
1425 case JUMP_MIN_UNTIL_3:
1426 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1427 goto jump_min_until_3;
1428 case JUMP_BRANCH:
1429 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1430 goto jump_branch;
1431 case JUMP_MAX_UNTIL_1:
1432 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1433 goto jump_max_until_1;
1434 case JUMP_MIN_UNTIL_1:
1435 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1436 goto jump_min_until_1;
1437 case JUMP_REPEAT:
1438 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1439 goto jump_repeat;
1440 case JUMP_REPEAT_ONE_1:
1441 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1442 goto jump_repeat_one_1;
1443 case JUMP_REPEAT_ONE_2:
1444 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1445 goto jump_repeat_one_2;
1446 case JUMP_MIN_REPEAT_ONE:
1447 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1448 goto jump_min_repeat_one;
1449 case JUMP_ASSERT:
1450 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1451 goto jump_assert;
1452 case JUMP_ASSERT_NOT:
1453 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1454 goto jump_assert_not;
1455 case JUMP_NONE:
1456 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1457 break;
1458 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001459
1460 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001461}
1462
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001463LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001464SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 char* ptr = (char*)state->start;
1467 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001468 Py_ssize_t status = 0;
1469 Py_ssize_t prefix_len = 0;
1470 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001471 SRE_CODE* prefix = NULL;
1472 SRE_CODE* charset = NULL;
1473 SRE_CODE* overlap = NULL;
1474 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001475
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001476 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001477 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001478 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001479
1480 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001481
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001482 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001483 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001484 character in there, so literal search will work) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 end -= (pattern[3]-1) * state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001486 if (end <= ptr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 end = ptr + state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001488 }
1489
Fredrik Lundh3562f112000-07-02 12:00:07 +00001490 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001491 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001492 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001493 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001494 prefix_skip = pattern[6];
1495 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001496 overlap = prefix + prefix_len - 1;
1497 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001498 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001499 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001500 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001501
1502 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001503 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001504
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001505 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1506 TRACE(("charset = %p\n", charset));
1507
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001508#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001509 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001510 /* pattern starts with a known prefix. use the overlap
1511 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001512 Py_ssize_t i = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001513 end = (char *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001514 while (ptr < end) {
1515 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 if ((SRE_CODE) SRE_CHARGET(state, ptr, 0) != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001517 if (!i)
1518 break;
1519 else
1520 i = overlap[i];
1521 } else {
1522 if (++i == prefix_len) {
1523 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001524 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525 state->start = ptr - (prefix_len - 1) * state->charsize;
1526 state->ptr = ptr - (prefix_len - prefix_skip - 1) * state->charsize;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001527 if (flags & SRE_INFO_LITERAL)
1528 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001529 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001530 if (status != 0)
1531 return status;
1532 /* close but no cigar -- try again */
1533 i = overlap[i];
1534 }
1535 break;
1536 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001537 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 ptr += state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001539 }
1540 return 0;
1541 }
1542#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001543
Fredrik Lundh3562f112000-07-02 12:00:07 +00001544 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001545 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001546 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001547 SRE_CODE chr = pattern[1];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001549 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
1551 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001552 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001553 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001554 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001555 state->start = ptr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001556 ptr += state->charsize;
1557 state->ptr = ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001558 if (flags & SRE_INFO_LITERAL)
1559 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001560 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001561 if (status != 0)
1562 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001563 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001564 } else if (charset) {
1565 /* pattern starts with a character from a known set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001567 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 while (ptr < end && !SRE_CHARSET(charset, SRE_CHARGET(state, ptr, 0)))
1569 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001570 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001571 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001572 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001573 state->start = ptr;
1574 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001575 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001576 if (status != 0)
1577 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578 ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001579 }
1580 } else
1581 /* general case */
1582 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001583 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584 state->start = state->ptr = ptr;
1585 ptr += state->charsize;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001586 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001587 if (status != 0)
1588 break;
1589 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001590
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001592}
Tim Peters3d563502006-01-21 02:47:53 +00001593
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001594#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001595
1596/* -------------------------------------------------------------------- */
1597/* factories and destructors */
1598
1599/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001600static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001601static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +00001602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603static int
1604sre_literal_template(int charsize, char* ptr, Py_ssize_t len)
1605{
1606 /* check if given string is a literal template (i.e. no escapes) */
1607 struct {
1608 int charsize;
1609 } state = {
1610 charsize
1611 };
1612 while (len-- > 0) {
1613 if (SRE_CHARGET((&state), ptr, 0) == '\\')
1614 return 0;
1615 ptr += charsize;
1616 }
1617 return 1;
1618}
1619
Guido van Rossumb700df92000-03-31 14:59:30 +00001620static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001621sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001622{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001623 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001624}
1625
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001626static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001627sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001628{
1629 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001630 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001631 return NULL;
1632 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001633 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001634 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001635 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +00001636 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001637}
1638
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001639LOCAL(void)
1640state_reset(SRE_STATE* state)
1641{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001642 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001643 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001644
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001645 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001646 state->lastindex = -1;
1647
1648 state->repeat = NULL;
1649
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001650 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001651}
1652
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001653static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654getstring(PyObject* string, Py_ssize_t* p_length,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001655 int* p_logical_charsize, int* p_charsize,
1656 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +00001657{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001658 /* given a python object, return a data pointer, a length (in
1659 characters), and a character size. return NULL if the object
1660 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001661
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001662 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001663 Py_ssize_t size, bytes;
1664 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001665 void* ptr;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001666
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001667 /* Unicode objects do not support the buffer API. So, get the data
1668 directly instead. */
1669 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 if (PyUnicode_READY(string) == -1)
1671 return NULL;
1672 ptr = PyUnicode_DATA(string);
1673 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001674 *p_charsize = PyUnicode_KIND(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 *p_logical_charsize = 4;
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001676 return ptr;
1677 }
1678
Victor Stinner0058b862011-09-29 03:27:47 +02001679 /* get pointer to byte string buffer */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001680 view->len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001681 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001682 if (!buffer || !buffer->bf_getbuffer ||
Benjamin Petersone48944b2012-03-07 14:50:25 -06001683 (*buffer->bf_getbuffer)(string, view, PyBUF_SIMPLE) < 0) {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001684 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1685 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001686 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001687
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001688 /* determine buffer size */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001689 bytes = view->len;
1690 ptr = view->buf;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001691
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001692 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001693 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001694 goto err;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001695 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001696
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001697 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001698 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001699
Christian Heimes72b710a2008-05-26 13:28:38 +00001700 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001701 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001702 else {
1703 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001704 goto err;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001705 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001706
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001707 *p_length = size;
1708 *p_charsize = charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 *p_logical_charsize = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001710
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001711 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001712 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001713 "Buffer is NULL");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001714 goto err;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001715 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001716 return ptr;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001717 err:
1718 PyBuffer_Release(view);
1719 view->buf = NULL;
1720 return NULL;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001721}
1722
1723LOCAL(PyObject*)
1724state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001725 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001726{
1727 /* prepare state object */
1728
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001729 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 int logical_charsize, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001731 void* ptr;
1732
1733 memset(state, 0, sizeof(SRE_STATE));
1734
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001735 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001736 state->lastindex = -1;
1737
Benjamin Petersone48944b2012-03-07 14:50:25 -06001738 state->buffer.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001739 ptr = getstring(string, &length, &logical_charsize, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001740 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -06001741 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001742
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001743 if (logical_charsize == 1 && pattern->logical_charsize > 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001744 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001745 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001746 goto err;
1747 }
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001748 if (logical_charsize > 1 && pattern->logical_charsize == 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001749 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001750 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001751 goto err;
1752 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001753
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001754 /* adjust boundaries */
1755 if (start < 0)
1756 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001757 else if (start > length)
1758 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001759
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001760 if (end < 0)
1761 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001762 else if (end > length)
1763 end = length;
1764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 state->logical_charsize = logical_charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001766 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001767
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001768 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001769
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001770 state->start = (void*) ((char*) ptr + start * state->charsize);
1771 state->end = (void*) ((char*) ptr + end * state->charsize);
1772
1773 Py_INCREF(string);
1774 state->string = string;
1775 state->pos = start;
1776 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001777
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001778 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001779 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001780 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001781 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001782 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001783 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001784
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001785 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001786 err:
1787 if (state->buffer.buf)
1788 PyBuffer_Release(&state->buffer);
1789 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001790}
1791
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001792LOCAL(void)
1793state_fini(SRE_STATE* state)
1794{
Benjamin Petersone48944b2012-03-07 14:50:25 -06001795 if (state->buffer.buf)
1796 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001797 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001798 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001799}
1800
/* character index of `member` within the subject: the byte distance from
   (state)->beginning to `member`, divided by the character width */
#define STATE_OFFSET(state, member) \
    (((char*)(member) - (char*)((state)->beginning)) / ((state)->charsize))
1804
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001805LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001806state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001807{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001808 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001809
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001810 index = (index - 1) * 2;
1811
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001812 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001813 if (empty)
1814 /* want empty string */
1815 i = j = 0;
1816 else {
1817 Py_INCREF(Py_None);
1818 return Py_None;
1819 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001820 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001821 i = STATE_OFFSET(state, state->mark[index]);
1822 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001823 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001824
Fredrik Lundh58100642000-08-09 09:14:35 +00001825 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001826}
1827
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001828static void
1829pattern_error(int status)
1830{
1831 switch (status) {
1832 case SRE_ERROR_RECURSION_LIMIT:
1833 PyErr_SetString(
1834 PyExc_RuntimeError,
1835 "maximum recursion limit exceeded"
1836 );
1837 break;
1838 case SRE_ERROR_MEMORY:
1839 PyErr_NoMemory();
1840 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001841 case SRE_ERROR_INTERRUPTED:
1842 /* An exception has already been raised, so let it fly */
1843 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001844 default:
1845 /* other error codes indicate compiler/engine bugs */
1846 PyErr_SetString(
1847 PyExc_RuntimeError,
1848 "internal error in regular expression engine"
1849 );
1850 }
1851}
1852
Guido van Rossumb700df92000-03-31 14:59:30 +00001853static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001854pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001855{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001856 if (self->weakreflist != NULL)
1857 PyObject_ClearWeakRefs((PyObject *) self);
Benjamin Petersone48944b2012-03-07 14:50:25 -06001858 if (self->view.buf)
1859 PyBuffer_Release(&self->view);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001860 Py_XDECREF(self->pattern);
1861 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001862 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001863 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001864}
1865
1866static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001867pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001868{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001869 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01001870 Py_ssize_t status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001871
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001872 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001873 Py_ssize_t start = 0;
1874 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001875 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001876 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001877 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001878 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001879
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001880 string = state_init(&state, self, string, start, end);
1881 if (!string)
1882 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001883
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001884 state.ptr = state.start;
1885
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001886 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 if (state.logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001889 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001890 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001891 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001892 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001893
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001894 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001895 if (PyErr_Occurred())
1896 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001897
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001898 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001899
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001900 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001901}
1902
1903static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001904pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001905{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001906 SRE_STATE state;
1907 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001909 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001910 Py_ssize_t start = 0;
1911 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001912 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001913 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001914 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001915 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001916
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001917 string = state_init(&state, self, string, start, end);
1918 if (!string)
1919 return NULL;
1920
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001921 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001924 status = sre_search(&state, PatternObject_GetCode(self));
1925 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001926 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001927 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001928
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001929 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1930
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001931 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001932
Thomas Wouters89f507f2006-12-13 04:49:30 +00001933 if (PyErr_Occurred())
1934 return NULL;
1935
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001936 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001937}
1938
1939static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001940call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001941{
1942 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001943 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001944 PyObject* func;
1945 PyObject* result;
1946
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001947 if (!args)
1948 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001949 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001950 if (!name)
1951 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001952 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001953 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001954 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001955 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001956 func = PyObject_GetAttrString(mod, function);
1957 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001958 if (!func)
1959 return NULL;
1960 result = PyObject_CallObject(func, args);
1961 Py_DECREF(func);
1962 Py_DECREF(args);
1963 return result;
1964}
1965
#ifdef USE_BUILTIN_COPY
/* Replace *object with the result of copy.deepcopy(*object, memo).
   Returns 1 on success; on failure returns 0 and leaves *object as is. */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* call() copes with a NULL tuple and reports it as a failure */
    PyObject* packed = PyTuple_Pack(2, *object, memo);
    PyObject* duplicate = call("copy", "deepcopy", packed);

    if (duplicate == NULL)
        return 0;

    /* swap the old value for its deep copy */
    Py_DECREF(*object);
    *object = duplicate;

    return 1; /* success */
}
#endif
1985
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001986static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00001987join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001988{
1989 /* join list elements */
1990
1991 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001992 PyObject* function;
1993 PyObject* args;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001994 PyObject* result;
1995
Thomas Wouters1b7f8912007-09-19 03:06:30 +00001996 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001997 if (!joiner)
1998 return NULL;
1999
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002000 if (PyList_GET_SIZE(list) == 0) {
2001 Py_DECREF(list);
2002 return joiner;
2003 }
2004
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002005 function = PyObject_GetAttrString(joiner, "join");
2006 if (!function) {
2007 Py_DECREF(joiner);
2008 return NULL;
2009 }
2010 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002011 if (!args) {
2012 Py_DECREF(function);
2013 Py_DECREF(joiner);
2014 return NULL;
2015 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002016 PyTuple_SET_ITEM(args, 0, list);
2017 result = PyObject_CallObject(function, args);
2018 Py_DECREF(args); /* also removes list */
2019 Py_DECREF(function);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002020 Py_DECREF(joiner);
2021
2022 return result;
2023}
2024
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002025static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002026pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002027{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002028 SRE_STATE state;
2029 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002030 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002031 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002032
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002033 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002034 Py_ssize_t start = 0;
2035 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002036 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002037 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002038 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002039 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002040
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002041 string = state_init(&state, self, string, start, end);
2042 if (!string)
2043 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002044
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002045 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002046 if (!list) {
2047 state_fini(&state);
2048 return NULL;
2049 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002050
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002051 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002052
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002053 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002054
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002055 state_reset(&state);
2056
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002057 state.ptr = state.start;
2058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 status = sre_search(&state, PatternObject_GetCode(self));
2061 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002062 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002064
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002065 if (PyErr_Occurred())
2066 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002067
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002068 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002069 if (status == 0)
2070 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002071 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002072 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 }
Tim Peters3d563502006-01-21 02:47:53 +00002074
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002075 /* don't bother to build a match object */
2076 switch (self->groups) {
2077 case 0:
2078 b = STATE_OFFSET(&state, state.start);
2079 e = STATE_OFFSET(&state, state.ptr);
2080 item = PySequence_GetSlice(string, b, e);
2081 if (!item)
2082 goto error;
2083 break;
2084 case 1:
2085 item = state_getslice(&state, 1, string, 1);
2086 if (!item)
2087 goto error;
2088 break;
2089 default:
2090 item = PyTuple_New(self->groups);
2091 if (!item)
2092 goto error;
2093 for (i = 0; i < self->groups; i++) {
2094 PyObject* o = state_getslice(&state, i+1, string, 1);
2095 if (!o) {
2096 Py_DECREF(item);
2097 goto error;
2098 }
2099 PyTuple_SET_ITEM(item, i, o);
2100 }
2101 break;
2102 }
2103
2104 status = PyList_Append(list, item);
2105 Py_DECREF(item);
2106 if (status < 0)
2107 goto error;
2108
2109 if (state.ptr == state.start)
2110 state.start = (void*) ((char*) state.ptr + state.charsize);
2111 else
2112 state.start = state.ptr;
2113
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002114 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002115
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002116 state_fini(&state);
2117 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002118
2119error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002120 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002121 state_fini(&state);
2122 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002123
Guido van Rossumb700df92000-03-31 14:59:30 +00002124}
2125
Fredrik Lundh703ce812001-10-24 22:16:30 +00002126static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002127pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
Fredrik Lundh703ce812001-10-24 22:16:30 +00002128{
2129 PyObject* scanner;
2130 PyObject* search;
2131 PyObject* iterator;
2132
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002133 scanner = pattern_scanner(pattern, args, kw);
Fredrik Lundh703ce812001-10-24 22:16:30 +00002134 if (!scanner)
2135 return NULL;
2136
2137 search = PyObject_GetAttrString(scanner, "search");
2138 Py_DECREF(scanner);
2139 if (!search)
2140 return NULL;
2141
2142 iterator = PyCallIter_New(search, Py_None);
2143 Py_DECREF(search);
2144
2145 return iterator;
2146}
Fredrik Lundh703ce812001-10-24 22:16:30 +00002147
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002148static PyObject*
2149pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2150{
2151 SRE_STATE state;
2152 PyObject* list;
2153 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002154 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002155 Py_ssize_t n;
2156 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002157 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002158
2159 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002160 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002161 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002162 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002163 &string, &maxsplit))
2164 return NULL;
2165
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002166 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002167 if (!string)
2168 return NULL;
2169
2170 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002171 if (!list) {
2172 state_fini(&state);
2173 return NULL;
2174 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002175
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002176 n = 0;
2177 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002178
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002179 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002180
2181 state_reset(&state);
2182
2183 state.ptr = state.start;
2184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 if (state.logical_charsize == 1) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002186 status = sre_search(&state, PatternObject_GetCode(self));
2187 } else {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002188 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002189 }
2190
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002191 if (PyErr_Occurred())
2192 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002193
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002194 if (status <= 0) {
2195 if (status == 0)
2196 break;
2197 pattern_error(status);
2198 goto error;
2199 }
Tim Peters3d563502006-01-21 02:47:53 +00002200
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002201 if (state.start == state.ptr) {
2202 if (last == state.end)
2203 break;
2204 /* skip one character */
2205 state.start = (void*) ((char*) state.ptr + state.charsize);
2206 continue;
2207 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002208
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002209 /* get segment before this match */
2210 item = PySequence_GetSlice(
2211 string, STATE_OFFSET(&state, last),
2212 STATE_OFFSET(&state, state.start)
2213 );
2214 if (!item)
2215 goto error;
2216 status = PyList_Append(list, item);
2217 Py_DECREF(item);
2218 if (status < 0)
2219 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002220
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002221 /* add groups (if any) */
2222 for (i = 0; i < self->groups; i++) {
2223 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002224 if (!item)
2225 goto error;
2226 status = PyList_Append(list, item);
2227 Py_DECREF(item);
2228 if (status < 0)
2229 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002230 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002231
2232 n = n + 1;
2233
2234 last = state.start = state.ptr;
2235
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002236 }
2237
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002238 /* get segment following last match (even if empty) */
2239 item = PySequence_GetSlice(
2240 string, STATE_OFFSET(&state, last), state.endpos
2241 );
2242 if (!item)
2243 goto error;
2244 status = PyList_Append(list, item);
2245 Py_DECREF(item);
2246 if (status < 0)
2247 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002248
2249 state_fini(&state);
2250 return list;
2251
2252error:
2253 Py_DECREF(list);
2254 state_fini(&state);
2255 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002256
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002257}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002258
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002259static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002260pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002261 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002262{
2263 SRE_STATE state;
2264 PyObject* list;
2265 PyObject* item;
2266 PyObject* filter;
2267 PyObject* args;
2268 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002269 void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002270 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002271 Py_ssize_t n;
2272 Py_ssize_t i, b, e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 int logical_charsize, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002274 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002275 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002276
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002277 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002278 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002279 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002280 Py_INCREF(filter);
2281 filter_is_callable = 1;
2282 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002283 /* if not callable, check if it's a literal string */
2284 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002285 view.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002286 ptr = getstring(ptemplate, &n, &logical_charsize, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002288 if (ptr) {
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002289 literal = sre_literal_template(charsize, ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002290 } else {
2291 PyErr_Clear();
2292 literal = 0;
2293 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06002294 if (view.buf)
2295 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002296 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002297 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002298 Py_INCREF(filter);
2299 filter_is_callable = 0;
2300 } else {
2301 /* not a literal; hand it over to the template compiler */
2302 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002303 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002304 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002305 );
2306 if (!filter)
2307 return NULL;
2308 filter_is_callable = PyCallable_Check(filter);
2309 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002310 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002311
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002312 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002313 if (!string) {
2314 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002315 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002316 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002317
2318 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002319 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002320 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002321 state_fini(&state);
2322 return NULL;
2323 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002324
2325 n = i = 0;
2326
2327 while (!count || n < count) {
2328
2329 state_reset(&state);
2330
2331 state.ptr = state.start;
2332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333 if (state.logical_charsize == 1) {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002334 status = sre_search(&state, PatternObject_GetCode(self));
2335 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002336 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002337 }
2338
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002339 if (PyErr_Occurred())
2340 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002341
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002342 if (status <= 0) {
2343 if (status == 0)
2344 break;
2345 pattern_error(status);
2346 goto error;
2347 }
Tim Peters3d563502006-01-21 02:47:53 +00002348
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002349 b = STATE_OFFSET(&state, state.start);
2350 e = STATE_OFFSET(&state, state.ptr);
2351
2352 if (i < b) {
2353 /* get segment before this match */
2354 item = PySequence_GetSlice(string, i, b);
2355 if (!item)
2356 goto error;
2357 status = PyList_Append(list, item);
2358 Py_DECREF(item);
2359 if (status < 0)
2360 goto error;
2361
2362 } else if (i == b && i == e && n > 0)
2363 /* ignore empty match on latest position */
2364 goto next;
2365
2366 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002367 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002368 match = pattern_new_match(self, &state, 1);
2369 if (!match)
2370 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002371 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002372 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002373 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002374 goto error;
2375 }
2376 item = PyObject_CallObject(filter, args);
2377 Py_DECREF(args);
2378 Py_DECREF(match);
2379 if (!item)
2380 goto error;
2381 } else {
2382 /* filter is literal string */
2383 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002384 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002385 }
2386
2387 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002388 if (item != Py_None) {
2389 status = PyList_Append(list, item);
2390 Py_DECREF(item);
2391 if (status < 0)
2392 goto error;
2393 }
Tim Peters3d563502006-01-21 02:47:53 +00002394
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002395 i = e;
2396 n = n + 1;
2397
2398next:
2399 /* move on */
2400 if (state.ptr == state.start)
2401 state.start = (void*) ((char*) state.ptr + state.charsize);
2402 else
2403 state.start = state.ptr;
2404
2405 }
2406
2407 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002408 if (i < state.endpos) {
2409 item = PySequence_GetSlice(string, i, state.endpos);
2410 if (!item)
2411 goto error;
2412 status = PyList_Append(list, item);
2413 Py_DECREF(item);
2414 if (status < 0)
2415 goto error;
2416 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002417
2418 state_fini(&state);
2419
Guido van Rossum4e173842001-12-07 04:25:10 +00002420 Py_DECREF(filter);
2421
Fredrik Lundhdac58492001-10-21 21:48:30 +00002422 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002423 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002424
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002425 if (!item)
2426 return NULL;
2427
2428 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002429 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002430
2431 return item;
2432
2433error:
2434 Py_DECREF(list);
2435 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002436 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002437 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002438
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002439}
2440
2441static PyObject*
2442pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2443{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002444 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002445 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002446 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002447 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002448 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002449 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002450 return NULL;
2451
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002452 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002453}
2454
2455static PyObject*
2456pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2457{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002458 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002459 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002460 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002461 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002462 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002463 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002464 return NULL;
2465
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002466 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002467}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002468
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002469static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002470pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002471{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002472#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002473 PatternObject* copy;
2474 int offset;
2475
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002476 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2477 if (!copy)
2478 return NULL;
2479
2480 offset = offsetof(PatternObject, groups);
2481
2482 Py_XINCREF(self->groupindex);
2483 Py_XINCREF(self->indexgroup);
2484 Py_XINCREF(self->pattern);
2485
2486 memcpy((char*) copy + offset, (char*) self + offset,
2487 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002488 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002489
2490 return (PyObject*) copy;
2491#else
2492 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2493 return NULL;
2494#endif
2495}
2496
2497static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002498pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002499{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002500#ifdef USE_BUILTIN_COPY
2501 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002502
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002503 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002504 if (!copy)
2505 return NULL;
2506
2507 if (!deepcopy(&copy->groupindex, memo) ||
2508 !deepcopy(&copy->indexgroup, memo) ||
2509 !deepcopy(&copy->pattern, memo)) {
2510 Py_DECREF(copy);
2511 return NULL;
2512 }
2513
2514#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002515 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2516 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002517#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002518}
2519
/* Docstrings for the pattern object and its methods; the pattern_methods
   table refers to the per-method ones by name. */
Raymond Hettinger94478742004-09-24 04:31:19 +00002520PyDoc_STRVAR(pattern_match_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002521"match(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002522 Matches zero or more characters at the beginning of the string");
2523
2524PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002525"search(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002526 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02002527 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002528
2529PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002530"split(string[, maxsplit = 0]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002531 Split string by the occurrences of pattern.");
2532
2533PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002534"findall(string[, pos[, endpos]]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002535 Return a list of all non-overlapping matches of pattern in string.");
2536
2537PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002538"finditer(string[, pos[, endpos]]) -> iterator.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002539 Return an iterator over all non-overlapping matches for the \n\
2540 RE pattern in string. For each match, the iterator returns a\n\
2541 match object.");
2542
2543PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002544"sub(repl, string[, count = 0]) -> newstring.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002545 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002546 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002547
2548PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002549"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002550 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2551 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002552 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002553
2554PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2555
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002556static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002557 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002558 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002559 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002560 pattern_search_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002561 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002562 pattern_sub_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002563 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002564 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002565 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002566 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002567 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002568 pattern_findall_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002569 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002570 pattern_finditer_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002571 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002572 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2573 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002574 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002575};
2576
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002577#define PAT_OFF(x) offsetof(PatternObject, x)
2578static PyMemberDef pattern_members[] = {
2579 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2580 {"flags", T_INT, PAT_OFF(flags), READONLY},
2581 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2582 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2583 {NULL} /* Sentinel */
2584};
Guido van Rossumb700df92000-03-31 14:59:30 +00002585
Neal Norwitz57c179c2006-03-22 07:18:02 +00002586static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002587 PyVarObject_HEAD_INIT(NULL, 0)
2588 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002589 sizeof(PatternObject), sizeof(SRE_CODE),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002590 (destructor)pattern_dealloc, /* tp_dealloc */
2591 0, /* tp_print */
2592 0, /* tp_getattr */
2593 0, /* tp_setattr */
2594 0, /* tp_reserved */
2595 0, /* tp_repr */
2596 0, /* tp_as_number */
2597 0, /* tp_as_sequence */
2598 0, /* tp_as_mapping */
2599 0, /* tp_hash */
2600 0, /* tp_call */
2601 0, /* tp_str */
2602 0, /* tp_getattro */
2603 0, /* tp_setattro */
2604 0, /* tp_as_buffer */
2605 Py_TPFLAGS_DEFAULT, /* tp_flags */
2606 pattern_doc, /* tp_doc */
2607 0, /* tp_traverse */
2608 0, /* tp_clear */
2609 0, /* tp_richcompare */
2610 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2611 0, /* tp_iter */
2612 0, /* tp_iternext */
2613 pattern_methods, /* tp_methods */
2614 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002615};
2616
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002617static int _validate(PatternObject *self); /* Forward */
2618
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002619static PyObject *
2620_compile(PyObject* self_, PyObject* args)
2621{
2622 /* "compile" pattern descriptor to pattern object */
2623
2624 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002625 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002626
2627 PyObject* pattern;
2628 int flags = 0;
2629 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002630 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002631 PyObject* groupindex = NULL;
2632 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002633
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002634 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002635 &PyList_Type, &code, &groups,
2636 &groupindex, &indexgroup))
2637 return NULL;
2638
2639 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002640 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002641 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2642 if (!self)
2643 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002644 self->weakreflist = NULL;
2645 self->pattern = NULL;
2646 self->groupindex = NULL;
2647 self->indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002648 self->view.buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002649
2650 self->codesize = n;
2651
2652 for (i = 0; i < n; i++) {
2653 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002654 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002655 self->code[i] = (SRE_CODE) value;
2656 if ((unsigned long) self->code[i] != value) {
2657 PyErr_SetString(PyExc_OverflowError,
2658 "regular expression code size limit exceeded");
2659 break;
2660 }
2661 }
2662
2663 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002664 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002665 return NULL;
2666 }
2667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 if (pattern == Py_None) {
2669 self->logical_charsize = -1;
2670 self->charsize = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01002671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 else {
2673 Py_ssize_t p_length;
2674 if (!getstring(pattern, &p_length, &self->logical_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002675 &self->charsize, &self->view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 Py_DECREF(self);
2677 return NULL;
2678 }
2679 }
Antoine Pitroufd036452008-08-19 17:56:33 +00002680
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002681 Py_INCREF(pattern);
2682 self->pattern = pattern;
2683
2684 self->flags = flags;
2685
2686 self->groups = groups;
2687
2688 Py_XINCREF(groupindex);
2689 self->groupindex = groupindex;
2690
2691 Py_XINCREF(indexgroup);
2692 self->indexgroup = indexgroup;
2693
2694 self->weakreflist = NULL;
2695
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002696 if (!_validate(self)) {
2697 Py_DECREF(self);
2698 return NULL;
2699 }
2700
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002701 return (PyObject*) self;
2702}
2703
Guido van Rossumb700df92000-03-31 14:59:30 +00002704/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002705/* Code validation */
2706
2707/* To learn more about this code, have a look at the _compile() function in
2708 Lib/sre_compile.py. The validation functions below check the code array
2709 for conformance with the code patterns generated there.
2710
2711 The nice thing about the generated code is that it is position-independent:
2712 all jumps are relative jumps forward. Also, jumps don't cross each other:
2713 the target of a later jump is always earlier than the target of an earlier
2714 jump. IOW, this is okay:
2715
2716 J---------J-------T--------T
2717 \ \_____/ /
2718 \______________________/
2719
2720 but this is not:
2721
2722 J---------J-------T--------T
2723 \_________\_____/ /
2724 \____________/
2725
2726 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2727 bytes wide (the latter if Python is compiled for "wide" unicode support).
2728*/
2729
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete, parenthesized
   printf argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns 0 from the enclosing function */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on variables of the enclosing function: `code`
   (read cursor), `end` (one past the last word that may be read) and the
   destination `op`, `arg` or `skip`.  Each one FAILs when the cursor has
   already reached `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and FAIL if, after subtracting `adj`, it is larger than
   the number of words left between the skip word and `end`.  `adj` is
   parenthesized so that any expression can be passed safely. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > end-code)                      \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002770
2771static int
2772_validate_charset(SRE_CODE *code, SRE_CODE *end)
2773{
2774 /* Some variables are manipulated by the macros above */
2775 SRE_CODE op;
2776 SRE_CODE arg;
2777 SRE_CODE offset;
2778 int i;
2779
2780 while (code < end) {
2781 GET_OP;
2782 switch (op) {
2783
2784 case SRE_OP_NEGATE:
2785 break;
2786
2787 case SRE_OP_LITERAL:
2788 GET_ARG;
2789 break;
2790
2791 case SRE_OP_RANGE:
2792 GET_ARG;
2793 GET_ARG;
2794 break;
2795
2796 case SRE_OP_CHARSET:
2797 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002798 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002799 FAIL;
2800 code += offset;
2801 break;
2802
2803 case SRE_OP_BIGCHARSET:
2804 GET_ARG; /* Number of blocks */
2805 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002806 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002807 FAIL;
2808 /* Make sure that each byte points to a valid block */
2809 for (i = 0; i < 256; i++) {
2810 if (((unsigned char *)code)[i] >= arg)
2811 FAIL;
2812 }
2813 code += offset;
2814 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002815 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002816 FAIL;
2817 code += offset;
2818 break;
2819
2820 case SRE_OP_CATEGORY:
2821 GET_ARG;
2822 switch (arg) {
2823 case SRE_CATEGORY_DIGIT:
2824 case SRE_CATEGORY_NOT_DIGIT:
2825 case SRE_CATEGORY_SPACE:
2826 case SRE_CATEGORY_NOT_SPACE:
2827 case SRE_CATEGORY_WORD:
2828 case SRE_CATEGORY_NOT_WORD:
2829 case SRE_CATEGORY_LINEBREAK:
2830 case SRE_CATEGORY_NOT_LINEBREAK:
2831 case SRE_CATEGORY_LOC_WORD:
2832 case SRE_CATEGORY_LOC_NOT_WORD:
2833 case SRE_CATEGORY_UNI_DIGIT:
2834 case SRE_CATEGORY_UNI_NOT_DIGIT:
2835 case SRE_CATEGORY_UNI_SPACE:
2836 case SRE_CATEGORY_UNI_NOT_SPACE:
2837 case SRE_CATEGORY_UNI_WORD:
2838 case SRE_CATEGORY_UNI_NOT_WORD:
2839 case SRE_CATEGORY_UNI_LINEBREAK:
2840 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2841 break;
2842 default:
2843 FAIL;
2844 }
2845 break;
2846
2847 default:
2848 FAIL;
2849
2850 }
2851 }
2852
2853 return 1;
2854}
2855
/* Validate the opcode sequence in [code, end).

   Each case consumes the operands of one opcode.  Compound opcodes read a
   skip count, validate the enclosed sub-range (recursively, or with
   _validate_charset for IN and the INFO charset) and then check the opcode
   that must terminate it: FAILURE for IN, JUMP for each BRANCH alternative,
   SUCCESS for REPEAT_ONE/MIN_REPEAT_ONE and ASSERT/ASSERT_NOT, and
   MAX_UNTIL or MIN_UNTIL for REPEAT.

   groups bounds the operands that name a group: a MARK argument may not
   exceed 2*groups+1 and GROUPREF* arguments must be below groups.

   Returns 1 if the range is valid, 0 (through FAIL) otherwise. */
2856static int
2857_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2858{
2859 /* Some variables are manipulated by the macros above */
2860 SRE_CODE op;
2861 SRE_CODE arg;
2862 SRE_CODE skip;
2863
2864 VTRACE(("code=%p, end=%p\n", code, end));
2865
2866 if (code > end)
2867 FAIL;
2868
2869 while (code < end) {
2870 GET_OP;
2871 switch (op) {
2872
2873 case SRE_OP_MARK:
2874 /* We don't check whether marks are properly nested; the
2875 sre_match() code is robust even if they don't, and the worst
2876 you can get is nonsensical match results. */
2877 GET_ARG;
2878 if (arg > 2*groups+1) {
2879 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2880 FAIL;
2881 }
2882 break;
2883
2884 case SRE_OP_LITERAL:
2885 case SRE_OP_NOT_LITERAL:
2886 case SRE_OP_LITERAL_IGNORE:
2887 case SRE_OP_NOT_LITERAL_IGNORE:
2888 GET_ARG;
2889 /* The arg is just a character, nothing to check */
2890 break;
2891
2892 case SRE_OP_SUCCESS:
2893 case SRE_OP_FAILURE:
2894 /* Nothing to check; these normally end the matching process */
2895 break;
2896
2897 case SRE_OP_AT:
2898 GET_ARG;
2899 switch (arg) {
2900 case SRE_AT_BEGINNING:
2901 case SRE_AT_BEGINNING_STRING:
2902 case SRE_AT_BEGINNING_LINE:
2903 case SRE_AT_END:
2904 case SRE_AT_END_LINE:
2905 case SRE_AT_END_STRING:
2906 case SRE_AT_BOUNDARY:
2907 case SRE_AT_NON_BOUNDARY:
2908 case SRE_AT_LOC_BOUNDARY:
2909 case SRE_AT_LOC_NON_BOUNDARY:
2910 case SRE_AT_UNI_BOUNDARY:
2911 case SRE_AT_UNI_NON_BOUNDARY:
2912 break;
2913 default:
2914 FAIL;
2915 }
2916 break;
2917
2918 case SRE_OP_ANY:
2919 case SRE_OP_ANY_ALL:
2920 /* These have no operands */
2921 break;
2922
2923 case SRE_OP_IN:
2924 case SRE_OP_IN_IGNORE:
2925 GET_SKIP;
/* NOTE(review): GET_SKIP only enforces an upper bound on skip; a skip
   smaller than 2 does not appear to be rejected before code[skip-2] is
   read below -- confirm whether a minimum should be checked here (the
   same question applies to the other skip-N computations in this
   function). */
2926 /* Stop 1 before the end; we check the FAILURE below */
2927 if (!_validate_charset(code, code+skip-2))
2928 FAIL;
2929 if (code[skip-2] != SRE_OP_FAILURE)
2930 FAIL;
2931 code += skip-1;
2932 break;
2933
2934 case SRE_OP_INFO:
2935 {
2936 /* A minimal info field is
2937 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2938 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2939 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02002940 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002941 SRE_CODE *newcode;
2942 GET_SKIP;
/* newcode is where the skip word says the INFO block ends */
2943 newcode = code+skip-1;
2944 GET_ARG; flags = arg;
/* The next two words (min and max in the layout above) are read but
   not checked */
Ross Lagerwall88748d72012-03-06 21:48:57 +02002945 GET_ARG;
2946 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002947 /* Check that only valid flags are present */
2948 if ((flags & ~(SRE_INFO_PREFIX |
2949 SRE_INFO_LITERAL |
2950 SRE_INFO_CHARSET)) != 0)
2951 FAIL;
2952 /* PREFIX and CHARSET are mutually exclusive */
2953 if ((flags & SRE_INFO_PREFIX) &&
2954 (flags & SRE_INFO_CHARSET))
2955 FAIL;
2956 /* LITERAL implies PREFIX */
2957 if ((flags & SRE_INFO_LITERAL) &&
2958 !(flags & SRE_INFO_PREFIX))
2959 FAIL;
2960 /* Validate the prefix */
2961 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02002962 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002963 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02002964 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002965 /* Here comes the prefix string */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002966 if (prefix_len > newcode-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002967 FAIL;
2968 code += prefix_len;
2969 /* And here comes the overlap table */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002970 if (prefix_len > newcode-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002971 FAIL;
2972 /* Each overlap value should be < prefix_len */
2973 for (i = 0; i < prefix_len; i++) {
2974 if (code[i] >= prefix_len)
2975 FAIL;
2976 }
2977 code += prefix_len;
2978 }
2979 /* Validate the charset */
2980 if (flags & SRE_INFO_CHARSET) {
2981 if (!_validate_charset(code, newcode-1))
2982 FAIL;
2983 if (newcode[-1] != SRE_OP_FAILURE)
2984 FAIL;
2985 code = newcode;
2986 }
/* Without a charset, the fields read so far must end exactly at the
   position given by the skip word */
2987 else if (code != newcode) {
2988 VTRACE(("code=%p, newcode=%p\n", code, newcode));
2989 FAIL;
2990 }
2991 }
2992 break;
2993
2994 case SRE_OP_BRANCH:
2995 {
2996 SRE_CODE *target = NULL;
/* One iteration per alternative; a skip word of 0 ends the list */
2997 for (;;) {
2998 GET_SKIP;
2999 if (skip == 0)
3000 break;
3001 /* Stop 2 before the end; we check the JUMP below */
3002 if (!_validate_inner(code, code+skip-3, groups))
3003 FAIL;
3004 code += skip-3;
3005 /* Check that it ends with a JUMP, and that each JUMP
3006 has the same target */
3007 GET_OP;
3008 if (op != SRE_OP_JUMP)
3009 FAIL;
3010 GET_SKIP;
3011 if (target == NULL)
3012 target = code+skip-1;
3013 else if (code+skip-1 != target)
3014 FAIL;
3015 }
3016 }
3017 break;
3018
3019 case SRE_OP_REPEAT_ONE:
3020 case SRE_OP_MIN_REPEAT_ONE:
3021 {
3022 SRE_CODE min, max;
3023 GET_SKIP;
3024 GET_ARG; min = arg;
3025 GET_ARG; max = arg;
3026 if (min > max)
3027 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003028 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003029 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003030 if (!_validate_inner(code, code+skip-4, groups))
3031 FAIL;
3032 code += skip-4;
3033 GET_OP;
3034 if (op != SRE_OP_SUCCESS)
3035 FAIL;
3036 }
3037 break;
3038
3039 case SRE_OP_REPEAT:
3040 {
3041 SRE_CODE min, max;
3042 GET_SKIP;
3043 GET_ARG; min = arg;
3044 GET_ARG; max = arg;
3045 if (min > max)
3046 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003047 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003048 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003049 if (!_validate_inner(code, code+skip-3, groups))
3050 FAIL;
3051 code += skip-3;
3052 GET_OP;
3053 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3054 FAIL;
3055 }
3056 break;
3057
3058 case SRE_OP_GROUPREF:
3059 case SRE_OP_GROUPREF_IGNORE:
3060 GET_ARG;
3061 if (arg >= groups)
3062 FAIL;
3063 break;
3064
3065 case SRE_OP_GROUPREF_EXISTS:
3066 /* The regex syntax for this is: '(?(group)then|else)', where
3067 'group' is either an integer group number or a group name,
3068 'then' and 'else' are sub-regexes, and 'else' is optional. */
3069 GET_ARG;
3070 if (arg >= groups)
3071 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00003072 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003073 code--; /* The skip is relative to the first arg! */
3074 /* There are two possibilities here: if there is both a 'then'
3075 part and an 'else' part, the generated code looks like:
3076
3077 GROUPREF_EXISTS
3078 <group>
3079 <skipyes>
3080 ...then part...
3081 JUMP
3082 <skipno>
3083 (<skipyes> jumps here)
3084 ...else part...
3085 (<skipno> jumps here)
3086
3087 If there is only a 'then' part, it looks like:
3088
3089 GROUPREF_EXISTS
3090 <group>
3091 <skip>
3092 ...then part...
3093 (<skip> jumps here)
3094
3095 There is no direct way to decide which it is, and we don't want
3096 to allow arbitrary jumps anywhere in the code; so we just look
3097 for a JUMP opcode preceding our skip target.
3098 */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03003099 if (skip >= 3 && skip-3 < end-code &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003100 code[skip-3] == SRE_OP_JUMP)
3101 {
3102 VTRACE(("both then and else parts present\n"));
3103 if (!_validate_inner(code+1, code+skip-3, groups))
3104 FAIL;
3105 code += skip-2; /* Position after JUMP, at <skipno> */
3106 GET_SKIP;
3107 if (!_validate_inner(code, code+skip-1, groups))
3108 FAIL;
3109 code += skip-1;
3110 }
3111 else {
3112 VTRACE(("only a then part present\n"));
3113 if (!_validate_inner(code+1, code+skip-1, groups))
3114 FAIL;
3115 code += skip-1;
3116 }
3117 break;
3118
3119 case SRE_OP_ASSERT:
3120 case SRE_OP_ASSERT_NOT:
3121 GET_SKIP;
3122 GET_ARG; /* 0 for lookahead, width for lookbehind */
3123 code--; /* Back up over arg to simplify math below */
3124 if (arg & 0x80000000)
3125 FAIL; /* Width too large */
3126 /* Stop 1 before the end; we check the SUCCESS below */
3127 if (!_validate_inner(code+1, code+skip-2, groups))
3128 FAIL;
3129 code += skip-2;
3130 GET_OP;
3131 if (op != SRE_OP_SUCCESS)
3132 FAIL;
3133 break;
3134
3135 default:
3136 FAIL;
3137
3138 }
3139 }
3140
3141 VTRACE(("okay\n"));
3142 return 1;
3143}
3144
3145static int
3146_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3147{
3148 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3149 FAIL;
3150 if (groups == 0) /* fix for simplejson */
3151 groups = 100; /* 100 groups should always be safe */
3152 return _validate_inner(code, end-1, groups);
3153}
3154
3155static int
3156_validate(PatternObject *self)
3157{
3158 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3159 {
3160 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3161 return 0;
3162 }
3163 else
3164 VTRACE(("Success!\n"));
3165 return 1;
3166}
3167
3168/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003169/* match methods */
3170
3171static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003172match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003173{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003174 Py_XDECREF(self->regs);
3175 Py_XDECREF(self->string);
3176 Py_DECREF(self->pattern);
3177 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003178}
3179
3180static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003181match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003182{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003183 if (index < 0 || index >= self->groups) {
3184 /* raise IndexError if we were given a bad group number */
3185 PyErr_SetString(
3186 PyExc_IndexError,
3187 "no such group"
3188 );
3189 return NULL;
3190 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003191
Fredrik Lundh6f013982000-07-03 18:44:21 +00003192 index *= 2;
3193
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003194 if (self->string == Py_None || self->mark[index] < 0) {
3195 /* return default value if the string or group is undefined */
3196 Py_INCREF(def);
3197 return def;
3198 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003199
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003200 return PySequence_GetSlice(
3201 self->string, self->mark[index], self->mark[index+1]
3202 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003203}
3204
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003205static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003206match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003207{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003208 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003209
Guido van Rossumddefaf32007-01-14 03:31:43 +00003210 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003211 /* Default value */
3212 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00003213
Christian Heimes217cfd12007-12-02 14:31:20 +00003214 if (PyLong_Check(index))
3215 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003216
Fredrik Lundh6f013982000-07-03 18:44:21 +00003217 i = -1;
3218
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003219 if (self->pattern->groupindex) {
3220 index = PyObject_GetItem(self->pattern->groupindex, index);
3221 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003222 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003223 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003224 Py_DECREF(index);
3225 } else
3226 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003227 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003228
3229 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003230}
3231
3232static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003233match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003234{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003235 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003236}
3237
3238static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003239match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003240{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003241 /* delegate to Python code */
3242 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003243 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003244 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003245 );
3246}
3247
3248static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003249match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003250{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003251 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003252 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003253
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003254 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003255
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003256 switch (size) {
3257 case 0:
3258 result = match_getslice(self, Py_False, Py_None);
3259 break;
3260 case 1:
3261 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3262 break;
3263 default:
3264 /* fetch multiple items */
3265 result = PyTuple_New(size);
3266 if (!result)
3267 return NULL;
3268 for (i = 0; i < size; i++) {
3269 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003270 self, PyTuple_GET_ITEM(args, i), Py_None
3271 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003272 if (!item) {
3273 Py_DECREF(result);
3274 return NULL;
3275 }
3276 PyTuple_SET_ITEM(result, i, item);
3277 }
3278 break;
3279 }
3280 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003281}
3282
3283static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003284match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003285{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003286 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003287 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003288
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003289 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003290 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003291 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003292 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003293
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003294 result = PyTuple_New(self->groups-1);
3295 if (!result)
3296 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003297
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003298 for (index = 1; index < self->groups; index++) {
3299 PyObject* item;
3300 item = match_getslice_by_index(self, index, def);
3301 if (!item) {
3302 Py_DECREF(result);
3303 return NULL;
3304 }
3305 PyTuple_SET_ITEM(result, index-1, item);
3306 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003307
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003308 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003309}
3310
3311static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003312match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003313{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003314 PyObject* result;
3315 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003316 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003317
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003318 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003319 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003320 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003321 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003322
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003323 result = PyDict_New();
3324 if (!result || !self->pattern->groupindex)
3325 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003326
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003327 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003328 if (!keys)
3329 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003330
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003331 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003332 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003333 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003334 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003335 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003336 if (!key)
3337 goto failed;
3338 value = match_getslice(self, key, def);
3339 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003340 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003341 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003342 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003343 status = PyDict_SetItem(result, key, value);
3344 Py_DECREF(value);
3345 if (status < 0)
3346 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003347 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003349 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003350
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003351 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003352
3353failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003354 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003355 Py_DECREF(result);
3356 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003357}
3358
3359static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003360match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003361{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003362 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003363
Guido van Rossumddefaf32007-01-14 03:31:43 +00003364 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003365 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003366 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003367
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003368 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003369
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003370 if (index < 0 || index >= self->groups) {
3371 PyErr_SetString(
3372 PyExc_IndexError,
3373 "no such group"
3374 );
3375 return NULL;
3376 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003377
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003378 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003379 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003380}
3381
3382static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003383match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003384{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003385 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003386
Guido van Rossumddefaf32007-01-14 03:31:43 +00003387 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003388 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003389 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003390
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003391 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003392
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003393 if (index < 0 || index >= self->groups) {
3394 PyErr_SetString(
3395 PyExc_IndexError,
3396 "no such group"
3397 );
3398 return NULL;
3399 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003400
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003401 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003402 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003403}
3404
3405LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003406_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003407{
3408 PyObject* pair;
3409 PyObject* item;
3410
3411 pair = PyTuple_New(2);
3412 if (!pair)
3413 return NULL;
3414
Christian Heimes217cfd12007-12-02 14:31:20 +00003415 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003416 if (!item)
3417 goto error;
3418 PyTuple_SET_ITEM(pair, 0, item);
3419
Christian Heimes217cfd12007-12-02 14:31:20 +00003420 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003421 if (!item)
3422 goto error;
3423 PyTuple_SET_ITEM(pair, 1, item);
3424
3425 return pair;
3426
3427 error:
3428 Py_DECREF(pair);
3429 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003430}
3431
3432static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003433match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003434{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003435 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003436
Guido van Rossumddefaf32007-01-14 03:31:43 +00003437 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003438 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003439 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003440
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003441 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003442
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003443 if (index < 0 || index >= self->groups) {
3444 PyErr_SetString(
3445 PyExc_IndexError,
3446 "no such group"
3447 );
3448 return NULL;
3449 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003450
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003451 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003452 return _pair(self->mark[index*2], self->mark[index*2+1]);
3453}
3454
3455static PyObject*
3456match_regs(MatchObject* self)
3457{
3458 PyObject* regs;
3459 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003460 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003461
3462 regs = PyTuple_New(self->groups);
3463 if (!regs)
3464 return NULL;
3465
3466 for (index = 0; index < self->groups; index++) {
3467 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3468 if (!item) {
3469 Py_DECREF(regs);
3470 return NULL;
3471 }
3472 PyTuple_SET_ITEM(regs, index, item);
3473 }
3474
3475 Py_INCREF(regs);
3476 self->regs = regs;
3477
3478 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003479}
3480
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003481static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003482match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003483{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003484#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003485 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003486 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003487
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003488 slots = 2 * (self->pattern->groups+1);
3489
3490 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3491 if (!copy)
3492 return NULL;
3493
3494 /* this value a constant, but any compiler should be able to
3495 figure that out all by itself */
3496 offset = offsetof(MatchObject, string);
3497
3498 Py_XINCREF(self->pattern);
3499 Py_XINCREF(self->string);
3500 Py_XINCREF(self->regs);
3501
3502 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003503 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003504
3505 return (PyObject*) copy;
3506#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003507 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003508 return NULL;
3509#endif
3510}
3511
3512static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003513match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003514{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003515#ifdef USE_BUILTIN_COPY
3516 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003517
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003518 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003519 if (!copy)
3520 return NULL;
3521
3522 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3523 !deepcopy(&copy->string, memo) ||
3524 !deepcopy(&copy->regs, memo)) {
3525 Py_DECREF(copy);
3526 return NULL;
3527 }
3528
3529#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003530 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3531 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003532#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003533}
3534
/* docstrings for the match type (match_doc) and for the entries of the
   match_methods table (match_*_doc) */
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003535PyDoc_STRVAR(match_doc,
3536"The result of re.match() and re.search().\n\
3537Match objects always have a boolean value of True.");
3538
3539PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003540"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003541 Return subgroup(s) of the match by indices or names.\n\
3542 For 0 returns the entire match.");
3543
3544PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003545"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003546 Return index of the start of the substring matched by group.");
3547
3548PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003549"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003550 Return index of the end of the substring matched by group.");
3551
3552PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003553"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003554 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3555
3556PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003557"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003558 Return a tuple containing all the subgroups of the match, from 1.\n\
3559 The default argument is used for groups\n\
3560 that did not participate in the match");
3561
3562PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003563"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003564 Return a dictionary containing all the named subgroups of the match,\n\
3565 keyed by the subgroup name. The default argument is used for groups\n\
3566 that did not participate in the match");
3567
3568PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003569"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003570 Return the string obtained by doing backslash substitution\n\
3571 on the string template, as done by the sub() method.");
3572
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003573static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003574 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3575 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3576 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3577 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3578 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3579 match_groups_doc},
3580 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3581 match_groupdict_doc},
3582 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003583 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3584 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003585 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003586};
3587
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003588static PyObject *
3589match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003590{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003591 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003592 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003593 Py_INCREF(Py_None);
3594 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003595}
3596
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003597static PyObject *
3598match_lastgroup_get(MatchObject *self)
3599{
3600 if (self->pattern->indexgroup && self->lastindex >= 0) {
3601 PyObject* result = PySequence_GetItem(
3602 self->pattern->indexgroup, self->lastindex
3603 );
3604 if (result)
3605 return result;
3606 PyErr_Clear();
3607 }
3608 Py_INCREF(Py_None);
3609 return Py_None;
3610}
3611
3612static PyObject *
3613match_regs_get(MatchObject *self)
3614{
3615 if (self->regs) {
3616 Py_INCREF(self->regs);
3617 return self->regs;
3618 } else
3619 return match_regs(self);
3620}
3621
3622static PyGetSetDef match_getset[] = {
3623 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3624 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3625 {"regs", (getter)match_regs_get, (setter)NULL},
3626 {NULL}
3627};
3628
3629#define MATCH_OFF(x) offsetof(MatchObject, x)
3630static PyMemberDef match_members[] = {
3631 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3632 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3633 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3634 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3635 {NULL}
3636};
3637
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3640
Neal Norwitz57c179c2006-03-22 07:18:02 +00003641static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003642 PyVarObject_HEAD_INIT(NULL,0)
3643 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003644 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003645 (destructor)match_dealloc, /* tp_dealloc */
3646 0, /* tp_print */
3647 0, /* tp_getattr */
3648 0, /* tp_setattr */
3649 0, /* tp_reserved */
3650 0, /* tp_repr */
3651 0, /* tp_as_number */
3652 0, /* tp_as_sequence */
3653 0, /* tp_as_mapping */
3654 0, /* tp_hash */
3655 0, /* tp_call */
3656 0, /* tp_str */
3657 0, /* tp_getattro */
3658 0, /* tp_setattro */
3659 0, /* tp_as_buffer */
3660 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003661 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003662 0, /* tp_traverse */
3663 0, /* tp_clear */
3664 0, /* tp_richcompare */
3665 0, /* tp_weaklistoffset */
3666 0, /* tp_iter */
3667 0, /* tp_iternext */
3668 match_methods, /* tp_methods */
3669 match_members, /* tp_members */
3670 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003671};
3672
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003673static PyObject*
3674pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3675{
3676 /* create match object (from state object) */
3677
3678 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003679 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003680 char* base;
3681 int n;
3682
3683 if (status > 0) {
3684
3685 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003686 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003687 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3688 2*(pattern->groups+1));
3689 if (!match)
3690 return NULL;
3691
3692 Py_INCREF(pattern);
3693 match->pattern = pattern;
3694
3695 Py_INCREF(state->string);
3696 match->string = state->string;
3697
3698 match->regs = NULL;
3699 match->groups = pattern->groups+1;
3700
3701 /* fill in group slices */
3702
3703 base = (char*) state->beginning;
3704 n = state->charsize;
3705
3706 match->mark[0] = ((char*) state->start - base) / n;
3707 match->mark[1] = ((char*) state->ptr - base) / n;
3708
3709 for (i = j = 0; i < pattern->groups; i++, j+=2)
3710 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3711 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3712 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3713 } else
3714 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3715
3716 match->pos = state->pos;
3717 match->endpos = state->endpos;
3718
3719 match->lastindex = state->lastindex;
3720
3721 return (PyObject*) match;
3722
3723 } else if (status == 0) {
3724
3725 /* no match */
3726 Py_INCREF(Py_None);
3727 return Py_None;
3728
3729 }
3730
3731 /* internal error */
3732 pattern_error(status);
3733 return NULL;
3734}
3735
3736
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003737/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003738/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003739
3740static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003741scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003742{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003743 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003744 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003745 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003746}
3747
3748static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003749scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003750{
3751 SRE_STATE* state = &self->state;
3752 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01003753 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003754
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003755 state_reset(state);
3756
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003757 state->ptr = state->start;
3758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003759 if (state->logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003760 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003761 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003762 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003763 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003764 if (PyErr_Occurred())
3765 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003766
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003767 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003768 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003769
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003770 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003771 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003772 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003773 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003774
3775 return match;
3776}
3777
3778
3779static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003780scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003781{
3782 SRE_STATE* state = &self->state;
3783 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01003784 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003785
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003786 state_reset(state);
3787
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003788 state->ptr = state->start;
3789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790 if (state->logical_charsize == 1) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003791 status = sre_search(state, PatternObject_GetCode(self->pattern));
3792 } else {
3793 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3794 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003795 if (PyErr_Occurred())
3796 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003797
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003798 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003799 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003800
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003801 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003802 state->start = (void*) ((char*) state->ptr + state->charsize);
3803 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003804 state->start = state->ptr;
3805
3806 return match;
3807}
3808
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003809static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003810 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3811 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003812 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003813};
3814
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003815#define SCAN_OFF(x) offsetof(ScannerObject, x)
3816static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03003817 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003818 {NULL} /* Sentinel */
3819};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003820
Neal Norwitz57c179c2006-03-22 07:18:02 +00003821static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003822 PyVarObject_HEAD_INIT(NULL, 0)
3823 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003824 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003825 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003826 0, /* tp_print */
3827 0, /* tp_getattr */
3828 0, /* tp_setattr */
3829 0, /* tp_reserved */
3830 0, /* tp_repr */
3831 0, /* tp_as_number */
3832 0, /* tp_as_sequence */
3833 0, /* tp_as_mapping */
3834 0, /* tp_hash */
3835 0, /* tp_call */
3836 0, /* tp_str */
3837 0, /* tp_getattro */
3838 0, /* tp_setattro */
3839 0, /* tp_as_buffer */
3840 Py_TPFLAGS_DEFAULT, /* tp_flags */
3841 0, /* tp_doc */
3842 0, /* tp_traverse */
3843 0, /* tp_clear */
3844 0, /* tp_richcompare */
3845 0, /* tp_weaklistoffset */
3846 0, /* tp_iter */
3847 0, /* tp_iternext */
3848 scanner_methods, /* tp_methods */
3849 scanner_members, /* tp_members */
3850 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003851};
3852
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003853static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003854pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003855{
3856 /* create search state object */
3857
3858 ScannerObject* self;
3859
3860 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003861 Py_ssize_t start = 0;
3862 Py_ssize_t end = PY_SSIZE_T_MAX;
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003863 static char* kwlist[] = { "source", "pos", "endpos", NULL };
3864 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:scanner", kwlist,
3865 &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003866 return NULL;
3867
3868 /* create scanner object */
3869 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3870 if (!self)
3871 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003872 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003873
3874 string = state_init(&self->state, pattern, string, start, end);
3875 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003876 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003877 return NULL;
3878 }
3879
3880 Py_INCREF(pattern);
3881 self->pattern = (PyObject*) pattern;
3882
3883 return (PyObject*) self;
3884}
3885
Guido van Rossumb700df92000-03-31 14:59:30 +00003886static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003887 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003888 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003889 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003890 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003891};
3892
Martin v. Löwis1a214512008-06-11 05:26:20 +00003893static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003894 PyModuleDef_HEAD_INIT,
3895 "_" SRE_MODULE,
3896 NULL,
3897 -1,
3898 _functions,
3899 NULL,
3900 NULL,
3901 NULL,
3902 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00003903};
3904
3905PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003906{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003907 PyObject* m;
3908 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003909 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003910
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00003911 /* Patch object types */
3912 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3913 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00003914 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003915
Martin v. Löwis1a214512008-06-11 05:26:20 +00003916 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003917 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003918 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003919 d = PyModule_GetDict(m);
3920
Christian Heimes217cfd12007-12-02 14:31:20 +00003921 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003922 if (x) {
3923 PyDict_SetItemString(d, "MAGIC", x);
3924 Py_DECREF(x);
3925 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003926
Christian Heimes217cfd12007-12-02 14:31:20 +00003927 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003928 if (x) {
3929 PyDict_SetItemString(d, "CODESIZE", x);
3930 Py_DECREF(x);
3931 }
3932
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003933 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
3934 if (x) {
3935 PyDict_SetItemString(d, "MAXREPEAT", x);
3936 Py_DECREF(x);
3937 }
3938
Neal Norwitzfe537132007-08-26 03:55:15 +00003939 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003940 if (x) {
3941 PyDict_SetItemString(d, "copyright", x);
3942 Py_DECREF(x);
3943 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003944 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003945}
3946
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003947#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003948
3949/* vim:ts=4:sw=4:et
3950*/