blob: 5bcc387977aec5aec2e5c473871f5672071ee10a [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl  created (based on existing template matcher code)
 * 2000-03-06 fl  first alpha, sort of
 * 2000-08-01 fl  fixes for 1.6b1
 * 2000-08-07 fl  use PyOS_CheckStack() if available
 * 2000-09-20 fl  added expand method
 * 2001-03-20 fl  lots of fixes for 2.1b2
 * 2001-04-15 fl  export copyright as Python attribute, not global
 * 2001-04-28 fl  added __copy__ methods (work in progress)
 * 2001-05-14 fl  fixes for 1.5.2 compatibility
 * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl  added sub/subn primitive
 * 2001-10-24 fl  added finditer primitive (for 2.2 only)
 * 2001-12-07 fl  fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl  fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 * 2003-10-17 gn  implemented non recursive scheme
 *
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license.  For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000070/* enables copy/deepcopy handling (work in progress) */
71#undef USE_BUILTIN_COPY
72
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000073#if PY_VERSION_HEX < 0x01060000
74#define PyObject_DEL(op) PyMem_DEL((op))
75#endif
76
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000077/* -------------------------------------------------------------------- */
78
Fredrik Lundh80946112000-06-29 18:03:25 +000079#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000080#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000081#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000082/* fastest possible local call under MSVC */
83#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000084#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000085#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000086#else
87#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000088#endif
89
90/* error codes */
91#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000092#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000093#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000094#define SRE_ERROR_MEMORY -9 /* out of memory */
Christian Heimes2380ac72008-01-09 00:17:24 +000095#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000096
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000097#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +000098#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000099#else
100#define TRACE(v)
101#endif
102
/* -------------------------------------------------------------------- */
/* search engine state */

/* default character predicates (run sre_chars.py to regenerate tables) */

/* bit flags stored per character in sre_char_info[] */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* category flags for character codes 0..127 (an OR of the masks above) */
static const char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };

/* lower-case mapping for character codes 0..127: 'A'..'Z' (65..90) map
   to 'a'..'z' (97..122), every other code maps to itself */
static const char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127 };

/* Table-driven predicates.  Each yields the (non-zero) mask bit when the
   character has the property and 0 otherwise; codes >= 128 never match.
   Note that `ch` is evaluated twice and is used directly as an array
   index, so it must be side-effect free and non-negative. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

/* Lower-case `ch` using the table above; codes >= 128 are returned
   unchanged. */
static unsigned int sre_lower(unsigned int ch)
{
    return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
}
149
/* locale-specific character predicates */
/* These defer to the <ctype.h> classifiers, but only for codes 0..255;
   anything larger is reported as not matching. */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
158
/* Lower-case `ch` with the C library's tolower() when it is in the
   range 0..255; larger codes are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
}
163
/* unicode-specific character predicates */
/* thin wrappers around the Py_UNICODE_* classification macros; a "word"
   character is an alphanumeric one or an underscore */

#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000171
/* Lower-case `ch` via Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER(ch);
}
176
Guido van Rossumb700df92000-03-31 14:59:30 +0000177LOCAL(int)
178sre_category(SRE_CODE category, unsigned int ch)
179{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000180 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000181
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000182 case SRE_CATEGORY_DIGIT:
183 return SRE_IS_DIGIT(ch);
184 case SRE_CATEGORY_NOT_DIGIT:
185 return !SRE_IS_DIGIT(ch);
186 case SRE_CATEGORY_SPACE:
187 return SRE_IS_SPACE(ch);
188 case SRE_CATEGORY_NOT_SPACE:
189 return !SRE_IS_SPACE(ch);
190 case SRE_CATEGORY_WORD:
191 return SRE_IS_WORD(ch);
192 case SRE_CATEGORY_NOT_WORD:
193 return !SRE_IS_WORD(ch);
194 case SRE_CATEGORY_LINEBREAK:
195 return SRE_IS_LINEBREAK(ch);
196 case SRE_CATEGORY_NOT_LINEBREAK:
197 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000198
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000199 case SRE_CATEGORY_LOC_WORD:
200 return SRE_LOC_IS_WORD(ch);
201 case SRE_CATEGORY_LOC_NOT_WORD:
202 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000203
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000204 case SRE_CATEGORY_UNI_DIGIT:
205 return SRE_UNI_IS_DIGIT(ch);
206 case SRE_CATEGORY_UNI_NOT_DIGIT:
207 return !SRE_UNI_IS_DIGIT(ch);
208 case SRE_CATEGORY_UNI_SPACE:
209 return SRE_UNI_IS_SPACE(ch);
210 case SRE_CATEGORY_UNI_NOT_SPACE:
211 return !SRE_UNI_IS_SPACE(ch);
212 case SRE_CATEGORY_UNI_WORD:
213 return SRE_UNI_IS_WORD(ch);
214 case SRE_CATEGORY_UNI_NOT_WORD:
215 return !SRE_UNI_IS_WORD(ch);
216 case SRE_CATEGORY_UNI_LINEBREAK:
217 return SRE_UNI_IS_LINEBREAK(ch);
218 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
219 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000220 }
221 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000222}
223
224/* helpers */
225
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000226static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000227data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000228{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000229 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000231 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000232 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000233 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000234}
235
236static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000237data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000238{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000239 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000240 minsize = state->data_stack_base+size;
241 cursize = state->data_stack_size;
242 if (cursize < minsize) {
243 void* stack;
244 cursize = minsize+minsize/4+1024;
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300245 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000246 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000247 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000248 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000249 return SRE_ERROR_MEMORY;
250 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000251 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000253 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000254 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000255}
256
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000257/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000258
259#define SRE_CHAR unsigned char
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200260#define SRE_CHARGET(state, buf, index) ((unsigned char*)buf)[index]
Guido van Rossumb700df92000-03-31 14:59:30 +0000261#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000262#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000263#define SRE_CHARSET sre_charset
264#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000265#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000266#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000267#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000268
Guido van Rossumb700df92000-03-31 14:59:30 +0000269#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000270#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000271#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000272
Guido van Rossumb700df92000-03-31 14:59:30 +0000273#undef SRE_SEARCH
274#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000275#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000276#undef SRE_INFO
277#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000278#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000279#undef SRE_AT
280#undef SRE_CHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281#undef SRE_CHARGET
Guido van Rossumb700df92000-03-31 14:59:30 +0000282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200283/* generate 8/16/32-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200285#define SRE_CHAR void
286#define SRE_CHARGET(state, buf, index) \
287 ((state->charsize==1) ? ((Py_UCS1*)buf)[index] : \
288 (state->charsize==2) ? ((Py_UCS2*)buf)[index] : \
289 ((Py_UCS4*)buf)[index])
Guido van Rossumb700df92000-03-31 14:59:30 +0000290#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000291#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000292#define SRE_CHARSET sre_ucharset
293#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000294#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000295#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#define SRE_SEARCH sre_usearch
297
298#endif /* SRE_RECURSIVE */
299
300/* -------------------------------------------------------------------- */
301/* String matching engine */
302
303/* the following section is compiled twice, with different character
304 settings */
305
306LOCAL(int)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200307SRE_AT(SRE_STATE* state, char* ptr, SRE_CODE at)
Guido van Rossumb700df92000-03-31 14:59:30 +0000308{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000309 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000310
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000311 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000312
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000313 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000315 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000316 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000317 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000319 case SRE_AT_BEGINNING_LINE:
320 return ((void*) ptr == state->beginning ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, -1)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000322
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000323 case SRE_AT_END:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200324 return (((void*) (ptr+state->charsize) == state->end &&
325 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))) ||
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000326 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000327
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000328 case SRE_AT_END_LINE:
329 return ((void*) ptr == state->end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000331
Fredrik Lundh770617b2001-01-14 15:06:11 +0000332 case SRE_AT_END_STRING:
333 return ((void*) ptr == state->end);
334
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000335 case SRE_AT_BOUNDARY:
336 if (state->beginning == state->end)
337 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000338 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200339 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000340 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200341 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 case SRE_AT_NON_BOUNDARY:
345 if (state->beginning == state->end)
346 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000347 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200348 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000349 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000351 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000352
353 case SRE_AT_LOC_BOUNDARY:
354 if (state->beginning == state->end)
355 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200357 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200359 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000361
362 case SRE_AT_LOC_NON_BOUNDARY:
363 if (state->beginning == state->end)
364 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200368 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000369 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000370
371 case SRE_AT_UNI_BOUNDARY:
372 if (state->beginning == state->end)
373 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200375 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200377 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000378 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000379
380 case SRE_AT_UNI_NON_BOUNDARY:
381 if (state->beginning == state->end)
382 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200386 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000387 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000388
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000389 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000390
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000391 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000392}
393
394LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000395SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000396{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000397 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000398
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000399 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000400
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000401 for (;;) {
402 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000403
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000404 case SRE_OP_FAILURE:
405 return !ok;
406
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000407 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000408 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000409 if (ch == set[0])
410 return ok;
411 set++;
412 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000413
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000414 case SRE_OP_CATEGORY:
415 /* <CATEGORY> <code> */
416 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000417 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000418 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000419 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000420
Fredrik Lundh3562f112000-07-02 12:00:07 +0000421 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000422 if (sizeof(SRE_CODE) == 2) {
423 /* <CHARSET> <bitmap> (16 bits per code word) */
424 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
425 return ok;
426 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000427 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000428 else {
429 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith90555d02012-12-10 17:44:44 -0800430 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000431 return ok;
432 set += 8;
433 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000434 break;
435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000436 case SRE_OP_RANGE:
437 /* <RANGE> <lower> <upper> */
438 if (set[0] <= ch && ch <= set[1])
439 return ok;
440 set += 2;
441 break;
442
443 case SRE_OP_NEGATE:
444 ok = !ok;
445 break;
446
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000447 case SRE_OP_BIGCHARSET:
448 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
449 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000450 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000451 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000452
453 if (sizeof(SRE_CODE) == 2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000455 set += 128;
456 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
457 return ok;
458 set += count*16;
459 }
460 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000461 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
462 * warnings when c's type supports only numbers < N+1 */
463 if (!(ch & ~65535))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200464 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000465 else
466 block = -1;
467 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000468 if (block >=0 &&
Gregory P. Smith90555d02012-12-10 17:44:44 -0800469 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000470 return ok;
471 set += count*8;
472 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000473 break;
474 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000475
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000476 default:
477 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000478 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000479 return 0;
480 }
481 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000482}
483
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000484LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000485
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000486LOCAL(Py_ssize_t)
487SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000488{
489 SRE_CODE chr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200490 char* ptr = (char *)state->ptr;
491 char* end = (char *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000492 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000493
494 /* adjust end */
Serhiy Storchakaa0eb8092013-02-16 16:54:33 +0200495 if (maxcount < (end - ptr) / state->charsize && maxcount != SRE_MAXREPEAT)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200496 end = ptr + maxcount*state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000497
498 switch (pattern[0]) {
499
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000500 case SRE_OP_IN:
501 /* repeated set */
502 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
Victor Stinner63ab8752011-11-22 03:31:20 +0100503 while (ptr < end &&
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200504 SRE_CHARSET(pattern + 2, SRE_CHARGET(state, ptr, 0)))
505 ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000506 break;
507
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000508 case SRE_OP_ANY:
509 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000510 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200511 while (ptr < end && !SRE_IS_LINEBREAK(SRE_CHARGET(state, ptr, 0)))
512 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513 break;
514
515 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000516 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000517 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000518 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000519 ptr = end;
520 break;
521
522 case SRE_OP_LITERAL:
523 /* repeated literal */
524 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000525 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200526 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) == chr)
527 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000528 break;
529
530 case SRE_OP_LITERAL_IGNORE:
531 /* repeated literal */
532 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000533 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) == chr)
535 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000536 break;
537
538 case SRE_OP_NOT_LITERAL:
539 /* repeated non-literal */
540 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000541 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
543 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000544 break;
Tim Peters3d563502006-01-21 02:47:53 +0000545
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000546 case SRE_OP_NOT_LITERAL_IGNORE:
547 /* repeated non-literal */
548 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) != chr)
551 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000552 break;
553
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000554 default:
555 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000556 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557 while ((char*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000558 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000559 if (i < 0)
560 return i;
561 if (!i)
562 break;
563 }
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300564 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565 ((char*)state->ptr - ptr)/state->charsize));
566 return ((char*)state->ptr - ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000567 }
568
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300569 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
570 (ptr - (char*) state->ptr)/state->charsize));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200571 return (ptr - (char*) state->ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000572}
573
#if 0 /* not used in this release */
/* Check if an SRE_OP_INFO block matches at the current position.
 *
 * state:   matcher state; state->ptr is the current position, state->end
 *          the end of the subject, state->charsize the character width.
 * pattern: points just past the INFO opcode; the fields read here are
 *          pattern[0] (skip), pattern[2] (flags), pattern[3] (minimal
 *          length), pattern[5] (prefix length), pattern[6] (prefix skip)
 *          and pattern[7...] (prefix data).
 *
 * Returns the number of SRE_CODE objects to skip if successful, 0
 * if no match.
 */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    char* end = (char*) state->end;
    char* ptr = (char*) state->ptr;
    Py_ssize_t i;

    /* check minimal length */
    if (pattern[3] && (end - ptr)/state->charsize < pattern[3])
        return 0;

    /* check known prefix */
    if ((pattern[2] & SRE_INFO_PREFIX) && pattern[5] > 1) {
        /* <length> <skip> <prefix data> <overlap data> */

        /* defensive: never compare prefix characters beyond the end of
           the subject; a subject shorter than the prefix cannot match */
        if ((end - ptr)/state->charsize < (Py_ssize_t) pattern[5])
            return 0;

        for (i = 0; i < pattern[5]; i++) {
            if ((SRE_CODE) SRE_CHARGET(state, ptr, i) != pattern[7 + i])
                return 0;
        }
        return pattern[0] + 2 * pattern[6];
    }
    return pattern[0];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000601
/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (IOW, those
 * that will backtrack). In detail:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (besides atypical cases like ASSERT_NOT), therefore there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: if the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark is pointing to.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 */
/* LASTMARK_SAVE(): copy state->lastmark and state->lastindex into the
   current match context.  Relies on locals named `state` and `ctx`
   being in scope at the point of use. */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastmark = state->lastmark; \
        ctx->lastindex = state->lastindex; \
    } while (0)

/* LASTMARK_RESTORE(): copy the values stored by LASTMARK_SAVE() back
   into the state.  Same scope requirements as LASTMARK_SAVE(). */
#define LASTMARK_RESTORE() \
    do { \
        state->lastmark = ctx->lastmark; \
        state->lastindex = ctx->lastindex; \
    } while (0)
640
/* Exit helpers for the matcher.
 *
 * RETURN_ERROR(i) returns i directly from the enclosing function.
 * RETURN_FAILURE / RETURN_SUCCESS set the local `ret` to 0 / 1 and jump
 * to the local label `exit`; both `ret` and `exit` must exist in the
 * function that uses them.
 */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

/* Conditional variants.  The argument is parenthesized so that compound
   expressions compare as a whole; note that it is evaluated more than
   once, so it must not have side effects. */
#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
651
/* SFY(x): turn the macro argument into a string literal; used to name
   the allocated type in the DATA_STACK_* trace output. */
#define SFY(x) #x

/* DATA_STACK_ALLOC(state, type, ptr): reserve sizeof(type) bytes at the
 * top of the state's data stack and make ptr point at them.
 *
 * Uses the locals `alloc_pos`, `ctx` and `ctx_pos` of the enclosing
 * function: alloc_pos receives the offset of the new object.  If the
 * stack has to grow, a negative result from data_stack_grow() is
 * returned from the enclosing function, and `ctx` is looked up again
 * from `ctx_pos` (when ctx_pos != -1) before the new pointer is formed.
 */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           SFY(type), alloc_pos, sizeof(type))); \
    if (sizeof(type) > (state)->data_stack_size - alloc_pos) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    (ptr) = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)

/* DATA_STACK_LOOKUP_AT(state, type, ptr, pos): make ptr point at the
   object of the given type stored at byte offset pos in the data stack. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), (pos))); \
    (ptr) = (type*)((state)->data_stack+(pos)); \
} while (0)

/* DATA_STACK_PUSH(state, data, size): copy size bytes from data onto the
 * top of the data stack, growing the stack first when needed.
 *
 * As with DATA_STACK_ALLOC, a failed grow returns from the enclosing
 * function and a successful grow re-derives `ctx` from `ctx_pos`.
 * `size` is evaluated several times, including after the grow, so it
 * must be free of side effects and must not depend on `ctx`.
 */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           (data), (state)->data_stack_base, (size))); \
    if ((size) > (state)->data_stack_size - (state)->data_stack_base) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)

/* DATA_STACK_POP(state, data, size, discard): copy the topmost size bytes
   of the data stack into data; the bytes are removed from the stack only
   when discard is non-zero. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           (data), (state)->data_stack_base-(size), (size))); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* DATA_STACK_POP_DISCARD(state, size): drop the topmost size bytes of the
   data stack without copying them anywhere. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           (state)->data_stack_base-(size), (size))); \
    (state)->data_stack_base -= (size); \
} while(0)

/* Shorthands that operate on the local `state` and size the transfer
   from the pointed-to object. */
#define DATA_PUSH(x) \
    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
    DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
    DATA_STACK_LOOKUP_AT(state,t,p,pos)
719
/* Save / restore the state->mark array on the data stack.
 *
 * Every macro is a no-op unless lastmark > 0; otherwise it transfers
 * (lastmark+1) pointers.  They rely on the locals `state` and (for
 * MARK_PUSH) `i` of the enclosing function.
 */

/* MARK_PUSH: push marks 0..lastmark onto the data stack. */
#define MARK_PUSH(lastmark) \
    do if ((lastmark) > 0) { \
        i = (lastmark); /* ctx->lastmark may change if reallocated */ \
        DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
    } while (0)
/* MARK_POP: copy the saved marks back into state->mark and remove them
   from the data stack. */
#define MARK_POP(lastmark) \
    do if ((lastmark) > 0) { \
        DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 1); \
    } while (0)
/* MARK_POP_KEEP: copy the saved marks back into state->mark but leave
   them on the data stack. */
#define MARK_POP_KEEP(lastmark) \
    do if ((lastmark) > 0) { \
        DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 0); \
    } while (0)
/* MARK_POP_DISCARD: remove the saved marks from the data stack without
   touching state->mark. */
#define MARK_POP_DISCARD(lastmark) \
    do if ((lastmark) > 0) { \
        DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
    } while (0)
737
/* Jump codes stored in SRE_MATCH_CONTEXT.jump by DO_JUMP().  JUMP_NONE
   is the value given to the outermost context; every other code names
   the DO_JUMP() call site that created the context. */
#define JUMP_NONE            0
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
752
/* DO_JUMP(jumpvalue, jumplabel, nextpattern): start matching nextpattern
 * in a fresh context without a C-level recursive call.
 *
 * A new SRE_MATCH_CONTEXT is allocated on the data stack; it records the
 * position of the current context (last_ctx_pos), the jump code and the
 * pattern to run, then becomes the current context and control goes to
 * the `entrance` label.  jumplabel is defined immediately after the goto.
 * NOTE(review): presumably the code at `exit` (outside this macro) uses
 * the stored jump code to come back to jumplabel -- confirm there.
 *
 * The expansion is several statements plus a label, so it must only be
 * used inside a braced block and be followed by a semicolon.  It relies
 * on the locals `ctx`, `nextctx`, `ctx_pos` and `alloc_pos`.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = (jumpvalue); \
    nextctx->pattern = (nextpattern); \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */

764typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000765 Py_ssize_t last_ctx_pos;
766 Py_ssize_t jump;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200767 char* ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000768 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000769 Py_ssize_t count;
770 Py_ssize_t lastmark;
771 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000772 union {
773 SRE_CODE chr;
774 SRE_REPEAT* rep;
775 } u;
776} SRE_MATCH_CONTEXT;
777
778/* check if string matches the given pattern. returns <0 for
779 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000780LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000781SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200783 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000784 Py_ssize_t alloc_pos, ctx_pos = -1;
785 Py_ssize_t i, ret = 0;
786 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000787 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000788
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000789 SRE_MATCH_CONTEXT* ctx;
790 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000791
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000792 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000793
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000794 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
795 ctx->last_ctx_pos = -1;
796 ctx->jump = JUMP_NONE;
797 ctx->pattern = pattern;
798 ctx_pos = alloc_pos;
799
800entrance:
801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 ctx->ptr = (char *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000803
804 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000805 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000806 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807 if (ctx->pattern[3] && (end - ctx->ptr)/state->charsize < ctx->pattern[3]) {
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300808 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
809 "need %" PY_FORMAT_SIZE_T "d)\n",
810 (end - ctx->ptr)/state->charsize,
811 (Py_ssize_t) ctx->pattern[3]));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000812 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000813 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000814 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000815 }
816
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000817 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000818 ++sigcount;
819 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
820 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000821
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000822 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000823
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000824 case SRE_OP_MARK:
825 /* set mark */
826 /* <MARK> <gid> */
827 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
828 ctx->ptr, ctx->pattern[0]));
829 i = ctx->pattern[0];
830 if (i & 1)
831 state->lastindex = i/2 + 1;
832 if (i > state->lastmark) {
833 /* state->lastmark is the highest valid index in the
834 state->mark array. If it is increased by more than 1,
835 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000836 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000837 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000838 while (j < i)
839 state->mark[j++] = NULL;
840 state->lastmark = i;
841 }
842 state->mark[i] = ctx->ptr;
843 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000844 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000845
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000846 case SRE_OP_LITERAL:
847 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000848 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000849 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
850 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200851 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000852 RETURN_FAILURE;
853 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000855 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000856
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000857 case SRE_OP_NOT_LITERAL:
858 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000859 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000860 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
861 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200862 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) == ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000863 RETURN_FAILURE;
864 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000866 break;
867
868 case SRE_OP_SUCCESS:
869 /* end of pattern */
870 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
871 state->ptr = ctx->ptr;
872 RETURN_SUCCESS;
873
874 case SRE_OP_AT:
875 /* match at given position */
876 /* <AT> <code> */
877 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
878 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
879 RETURN_FAILURE;
880 ctx->pattern++;
881 break;
882
883 case SRE_OP_CATEGORY:
884 /* match at given category */
885 /* <CATEGORY> <code> */
886 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
887 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], SRE_CHARGET(state, ctx->ptr, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000889 RETURN_FAILURE;
890 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200891 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000892 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000893
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000894 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000895 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000896 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000897 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 if (ctx->ptr >= end || SRE_IS_LINEBREAK(SRE_CHARGET(state, ctx->ptr, 0)))
899 RETURN_FAILURE;
900 ctx->ptr += state->charsize;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000901 break;
902
903 case SRE_OP_ANY_ALL:
904 /* match anything */
905 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000906 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
907 if (ctx->ptr >= end)
908 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200909 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000910 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000911
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000912 case SRE_OP_IN:
913 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000914 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000915 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200916 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, SRE_CHARGET(state, ctx->ptr, 0)))
917 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000918 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200919 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000920 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000921
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000922 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000923 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
924 ctx->pattern, ctx->ptr, ctx->pattern[0]));
925 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200926 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000927 RETURN_FAILURE;
928 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200929 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000930 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000931
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000932 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000933 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
934 ctx->pattern, ctx->ptr, *ctx->pattern));
935 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) == state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000937 RETURN_FAILURE;
938 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200939 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000940 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000941
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000942 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000943 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
944 if (ctx->ptr >= end
945 || !SRE_CHARSET(ctx->pattern+1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200946 (SRE_CODE)state->lower(SRE_CHARGET(state, ctx->ptr, 0))))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000947 RETURN_FAILURE;
948 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000950 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000951
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000952 case SRE_OP_JUMP:
953 case SRE_OP_INFO:
954 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000955 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000956 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
957 ctx->ptr, ctx->pattern[0]));
958 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000959 break;
960
961 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000962 /* alternation */
963 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000964 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000965 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000966 ctx->u.rep = state->repeat;
967 if (ctx->u.rep)
968 MARK_PUSH(ctx->lastmark);
969 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
970 if (ctx->pattern[1] == SRE_OP_LITERAL &&
971 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000973 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000974 if (ctx->pattern[1] == SRE_OP_IN &&
975 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0))))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000977 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000978 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000979 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000980 if (ret) {
981 if (ctx->u.rep)
982 MARK_POP_DISCARD(ctx->lastmark);
983 RETURN_ON_ERROR(ret);
984 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000985 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000986 if (ctx->u.rep)
987 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000988 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000989 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000990 if (ctx->u.rep)
991 MARK_POP_DISCARD(ctx->lastmark);
992 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +0000993
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000994 case SRE_OP_REPEAT_ONE:
995 /* match repeated sequence (maximizing regexp) */
996
997 /* this operator only works if the repeated item is
998 exactly one character wide, and we're not already
999 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001000 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001001
1002 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1003
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001004 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1005 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001006
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001007 if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001008 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001009
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001010 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001011
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001012 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1013 RETURN_ON_ERROR(ret);
1014 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1015 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001016 ctx->ptr += state->charsize * ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001017
1018 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001020 string. check if the rest of the pattern matches,
1021 and backtrack if not. */
1022
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001023 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001025
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001026 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001027 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001028 state->ptr = ctx->ptr;
1029 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001030 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001032 LASTMARK_SAVE();
1033
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035 /* tail starts with a literal. skip positions where
1036 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001037 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001038 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001039 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Victor Stinner63ab8752011-11-22 03:31:20 +01001040 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 SRE_CHARGET(state, ctx->ptr, 0) != ctx->u.chr)) {
1042 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001045 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001046 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001047 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001048 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1049 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001050 if (ret) {
1051 RETURN_ON_ERROR(ret);
1052 RETURN_SUCCESS;
1053 }
Tim Peters3d563502006-01-21 02:47:53 +00001054
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001055 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001058 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001059 }
1060
1061 } else {
1062 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001063 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001065 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1066 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001067 if (ret) {
1068 RETURN_ON_ERROR(ret);
1069 RETURN_SUCCESS;
1070 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001071 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001072 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001073 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001074 }
1075 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001076 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001077
Guido van Rossum41c99e72003-04-14 17:59:34 +00001078 case SRE_OP_MIN_REPEAT_ONE:
1079 /* match repeated sequence (minimizing regexp) */
1080
1081 /* this operator only works if the repeated item is
1082 exactly one character wide, and we're not already
1083 collecting backtracking points. for other cases,
1084 use the MIN_REPEAT operator */
1085
1086 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1087
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001088 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1089 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001090
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001091 if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001092 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001093
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001094 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001095
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001096 if (ctx->pattern[1] == 0)
1097 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001098 else {
1099 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001100 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1101 RETURN_ON_ERROR(ret);
1102 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001103 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001104 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001105 RETURN_FAILURE;
1106 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001107 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001108 ctx->ptr += state->charsize * ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109 }
1110
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001111 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001112 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001113 state->ptr = ctx->ptr;
1114 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001115
1116 } else {
1117 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001118 LASTMARK_SAVE();
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001119 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001120 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001121 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001122 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1123 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 if (ret) {
1125 RETURN_ON_ERROR(ret);
1126 RETURN_SUCCESS;
1127 }
1128 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001129 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001130 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001131 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001132 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001133 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001134 assert(ret == 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001137 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001138 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001139 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001141
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001142 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001143 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001144 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001145 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1147 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001148
1149 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001150 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001151 if (!ctx->u.rep) {
1152 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001153 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001154 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001155 ctx->u.rep->count = -1;
1156 ctx->u.rep->pattern = ctx->pattern;
1157 ctx->u.rep->prev = state->repeat;
1158 ctx->u.rep->last_ptr = NULL;
1159 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001160
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001161 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001162 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001163 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001164 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001165
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001166 if (ret) {
1167 RETURN_ON_ERROR(ret);
1168 RETURN_SUCCESS;
1169 }
1170 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001171
1172 case SRE_OP_MAX_UNTIL:
1173 /* maximizing repeat */
1174 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1175
1176 /* FIXME: we probably need to deal with zero-width
1177 matches in here... */
1178
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001179 ctx->u.rep = state->repeat;
1180 if (!ctx->u.rep)
1181 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001182
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001183 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001184
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001185 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001186
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001187 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001188 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001189
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001190 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001191 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001192 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001193 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1194 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 if (ret) {
1196 RETURN_ON_ERROR(ret);
1197 RETURN_SUCCESS;
1198 }
1199 ctx->u.rep->count = ctx->count-1;
1200 state->ptr = ctx->ptr;
1201 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001202 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001204 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001205 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001207 /* we may have enough matches, but if we can
1208 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001210 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 MARK_PUSH(ctx->lastmark);
1212 /* zero-width match protection */
1213 DATA_PUSH(&ctx->u.rep->last_ptr);
1214 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001215 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1216 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001217 DATA_POP(&ctx->u.rep->last_ptr);
1218 if (ret) {
1219 MARK_POP_DISCARD(ctx->lastmark);
1220 RETURN_ON_ERROR(ret);
1221 RETURN_SUCCESS;
1222 }
1223 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001224 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 ctx->u.rep->count = ctx->count-1;
1226 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001227 }
1228
1229 /* cannot match more repeated items here. make sure the
1230 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001231 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001232 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001233 RETURN_ON_SUCCESS(ret);
1234 state->repeat = ctx->u.rep;
1235 state->ptr = ctx->ptr;
1236 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001237
1238 case SRE_OP_MIN_UNTIL:
1239 /* minimizing repeat */
1240 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1241
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001242 ctx->u.rep = state->repeat;
1243 if (!ctx->u.rep)
1244 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001245
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001246 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001247
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001249
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001250 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001251 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001252
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001253 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001254 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001255 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001256 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1257 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001258 if (ret) {
1259 RETURN_ON_ERROR(ret);
1260 RETURN_SUCCESS;
1261 }
1262 ctx->u.rep->count = ctx->count-1;
1263 state->ptr = ctx->ptr;
1264 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001265 }
1266
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001267 LASTMARK_SAVE();
1268
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001269 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001270 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001271 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001272 if (ret) {
1273 RETURN_ON_ERROR(ret);
1274 RETURN_SUCCESS;
1275 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001276
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001277 state->repeat = ctx->u.rep;
1278 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001279
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001280 LASTMARK_RESTORE();
1281
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001282 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
Serhiy Storchakafa468162013-02-16 21:23:53 +02001283 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1284 state->ptr == ctx->u.rep->last_ptr)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001285 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001286
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 ctx->u.rep->count = ctx->count;
Serhiy Storchakafa468162013-02-16 21:23:53 +02001288 /* zero-width match protection */
1289 DATA_PUSH(&ctx->u.rep->last_ptr);
1290 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001291 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1292 ctx->u.rep->pattern+3);
Serhiy Storchakafa468162013-02-16 21:23:53 +02001293 DATA_POP(&ctx->u.rep->last_ptr);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001294 if (ret) {
1295 RETURN_ON_ERROR(ret);
1296 RETURN_SUCCESS;
1297 }
1298 ctx->u.rep->count = ctx->count-1;
1299 state->ptr = ctx->ptr;
1300 RETURN_FAILURE;
1301
1302 case SRE_OP_GROUPREF:
1303 /* match backreference */
1304 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1305 ctx->ptr, ctx->pattern[0]));
1306 i = ctx->pattern[0];
1307 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001308 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001309 if (groupref >= state->lastmark) {
1310 RETURN_FAILURE;
1311 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 char* p = (char*) state->mark[groupref];
1313 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001314 if (!p || !e || e < p)
1315 RETURN_FAILURE;
1316 while (p < e) {
Victor Stinner63ab8752011-11-22 03:31:20 +01001317 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 SRE_CHARGET(state, ctx->ptr, 0) != SRE_CHARGET(state, p, 0))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001319 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 p += state->charsize;
1321 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001322 }
1323 }
1324 }
1325 ctx->pattern++;
1326 break;
1327
1328 case SRE_OP_GROUPREF_IGNORE:
1329 /* match backreference */
1330 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1331 ctx->ptr, ctx->pattern[0]));
1332 i = ctx->pattern[0];
1333 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001334 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001335 if (groupref >= state->lastmark) {
1336 RETURN_FAILURE;
1337 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 char* p = (char*) state->mark[groupref];
1339 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001340 if (!p || !e || e < p)
1341 RETURN_FAILURE;
1342 while (p < e) {
1343 if (ctx->ptr >= end ||
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001344 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) !=
1345 state->lower(SRE_CHARGET(state, p, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001346 RETURN_FAILURE;
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001347 p += state->charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001349 }
1350 }
1351 }
1352 ctx->pattern++;
1353 break;
1354
1355 case SRE_OP_GROUPREF_EXISTS:
1356 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1357 ctx->ptr, ctx->pattern[0]));
1358 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1359 i = ctx->pattern[0];
1360 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001361 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001362 if (groupref >= state->lastmark) {
1363 ctx->pattern += ctx->pattern[1];
1364 break;
1365 } else {
1366 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1367 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1368 if (!p || !e || e < p) {
1369 ctx->pattern += ctx->pattern[1];
1370 break;
1371 }
1372 }
1373 }
1374 ctx->pattern += 2;
1375 break;
1376
1377 case SRE_OP_ASSERT:
1378 /* assert subpattern */
1379 /* <ASSERT> <skip> <back> <pattern> */
1380 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1381 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001383 if (state->ptr < state->beginning)
1384 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001385 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001386 RETURN_ON_FAILURE(ret);
1387 ctx->pattern += ctx->pattern[0];
1388 break;
1389
1390 case SRE_OP_ASSERT_NOT:
1391 /* assert not subpattern */
1392 /* <ASSERT_NOT> <skip> <back> <pattern> */
1393 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1394 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001396 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001397 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001398 if (ret) {
1399 RETURN_ON_ERROR(ret);
1400 RETURN_FAILURE;
1401 }
1402 }
1403 ctx->pattern += ctx->pattern[0];
1404 break;
1405
1406 case SRE_OP_FAILURE:
1407 /* immediate failure */
1408 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1409 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001410
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001411 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001412 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1413 ctx->pattern[-1]));
1414 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001415 }
1416 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001417
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001418exit:
1419 ctx_pos = ctx->last_ctx_pos;
1420 jump = ctx->jump;
1421 DATA_POP_DISCARD(ctx);
1422 if (ctx_pos == -1)
1423 return ret;
1424 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1425
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001426 switch (jump) {
1427 case JUMP_MAX_UNTIL_2:
1428 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1429 goto jump_max_until_2;
1430 case JUMP_MAX_UNTIL_3:
1431 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1432 goto jump_max_until_3;
1433 case JUMP_MIN_UNTIL_2:
1434 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1435 goto jump_min_until_2;
1436 case JUMP_MIN_UNTIL_3:
1437 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1438 goto jump_min_until_3;
1439 case JUMP_BRANCH:
1440 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1441 goto jump_branch;
1442 case JUMP_MAX_UNTIL_1:
1443 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1444 goto jump_max_until_1;
1445 case JUMP_MIN_UNTIL_1:
1446 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1447 goto jump_min_until_1;
1448 case JUMP_REPEAT:
1449 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1450 goto jump_repeat;
1451 case JUMP_REPEAT_ONE_1:
1452 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1453 goto jump_repeat_one_1;
1454 case JUMP_REPEAT_ONE_2:
1455 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1456 goto jump_repeat_one_2;
1457 case JUMP_MIN_REPEAT_ONE:
1458 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1459 goto jump_min_repeat_one;
1460 case JUMP_ASSERT:
1461 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1462 goto jump_assert;
1463 case JUMP_ASSERT_NOT:
1464 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1465 goto jump_assert_not;
1466 case JUMP_NONE:
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001467 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1468 ctx->ptr, ret));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001469 break;
1470 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001471
1472 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001473}
1474
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001475LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001476SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 char* ptr = (char*)state->start;
1479 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001480 Py_ssize_t status = 0;
1481 Py_ssize_t prefix_len = 0;
1482 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001483 SRE_CODE* prefix = NULL;
1484 SRE_CODE* charset = NULL;
1485 SRE_CODE* overlap = NULL;
1486 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001487
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001488 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001489 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001490 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001491
1492 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001493
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001494 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001495 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001496 character in there, so literal search will work) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 end -= (pattern[3]-1) * state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001498 if (end <= ptr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 end = ptr + state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001500 }
1501
Fredrik Lundh3562f112000-07-02 12:00:07 +00001502 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001503 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001504 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001505 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001506 prefix_skip = pattern[6];
1507 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001508 overlap = prefix + prefix_len - 1;
1509 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001510 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001511 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001512 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001513
1514 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001515 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001516
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001517 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1518 prefix, prefix_len, prefix_skip));
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001519 TRACE(("charset = %p\n", charset));
1520
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001521#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001522 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001523 /* pattern starts with a known prefix. use the overlap
1524 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001525 Py_ssize_t i = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 end = (char *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001527 while (ptr < end) {
1528 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 if ((SRE_CODE) SRE_CHARGET(state, ptr, 0) != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001530 if (!i)
1531 break;
1532 else
1533 i = overlap[i];
1534 } else {
1535 if (++i == prefix_len) {
1536 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001537 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 state->start = ptr - (prefix_len - 1) * state->charsize;
1539 state->ptr = ptr - (prefix_len - prefix_skip - 1) * state->charsize;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001540 if (flags & SRE_INFO_LITERAL)
1541 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001542 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001543 if (status != 0)
1544 return status;
1545 /* close but no cigar -- try again */
1546 i = overlap[i];
1547 }
1548 break;
1549 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551 ptr += state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001552 }
1553 return 0;
1554 }
1555#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001556
Fredrik Lundh3562f112000-07-02 12:00:07 +00001557 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001558 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001559 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001560 SRE_CODE chr = pattern[1];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001561 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001562 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
1564 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001565 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001566 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001567 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001568 state->start = ptr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 ptr += state->charsize;
1570 state->ptr = ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001571 if (flags & SRE_INFO_LITERAL)
1572 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001573 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001574 if (status != 0)
1575 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001576 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001577 } else if (charset) {
1578 /* pattern starts with a character from a known set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 while (ptr < end && !SRE_CHARSET(charset, SRE_CHARGET(state, ptr, 0)))
1582 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001583 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001585 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 state->start = ptr;
1587 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001588 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001589 if (status != 0)
1590 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591 ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001592 }
1593 } else
1594 /* general case */
1595 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001596 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597 state->start = state->ptr = ptr;
1598 ptr += state->charsize;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001599 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001600 if (status != 0)
1601 break;
1602 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001603
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001604 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001605}
Tim Peters3d563502006-01-21 02:47:53 +00001606
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001607#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001608
1609/* -------------------------------------------------------------------- */
1610/* factories and destructors */
1611
1612/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001613static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001614static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +00001615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616static int
1617sre_literal_template(int charsize, char* ptr, Py_ssize_t len)
1618{
1619 /* check if given string is a literal template (i.e. no escapes) */
1620 struct {
1621 int charsize;
1622 } state = {
1623 charsize
1624 };
1625 while (len-- > 0) {
1626 if (SRE_CHARGET((&state), ptr, 0) == '\\')
1627 return 0;
1628 ptr += charsize;
1629 }
1630 return 1;
1631}
1632
Guido van Rossumb700df92000-03-31 14:59:30 +00001633static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001634sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001635{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001636 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001637}
1638
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001639static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001640sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001641{
1642 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001643 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001644 return NULL;
1645 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001646 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001647 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001648 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +00001649 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001650}
1651
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001652LOCAL(void)
1653state_reset(SRE_STATE* state)
1654{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001656 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001657
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001658 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001659 state->lastindex = -1;
1660
1661 state->repeat = NULL;
1662
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001663 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001664}
1665
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001666static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667getstring(PyObject* string, Py_ssize_t* p_length,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001668 int* p_logical_charsize, int* p_charsize,
1669 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +00001670{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001671 /* given a python object, return a data pointer, a length (in
1672 characters), and a character size. return NULL if the object
1673 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001674
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001675 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001676 Py_ssize_t size, bytes;
1677 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001678 void* ptr;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001679
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001680 /* Unicode objects do not support the buffer API. So, get the data
1681 directly instead. */
1682 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 if (PyUnicode_READY(string) == -1)
1684 return NULL;
1685 ptr = PyUnicode_DATA(string);
1686 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001687 *p_charsize = PyUnicode_KIND(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 *p_logical_charsize = 4;
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001689 return ptr;
1690 }
1691
Victor Stinner0058b862011-09-29 03:27:47 +02001692 /* get pointer to byte string buffer */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001693 view->len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001694 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001695 if (!buffer || !buffer->bf_getbuffer ||
Benjamin Petersone48944b2012-03-07 14:50:25 -06001696 (*buffer->bf_getbuffer)(string, view, PyBUF_SIMPLE) < 0) {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001697 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1698 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001699 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001700
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001701 /* determine buffer size */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001702 bytes = view->len;
1703 ptr = view->buf;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001704
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001705 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001706 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001707 goto err;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001708 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001709
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001710 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001711 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001712
Christian Heimes72b710a2008-05-26 13:28:38 +00001713 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001714 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001715 else {
1716 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001717 goto err;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001718 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001719
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001720 *p_length = size;
1721 *p_charsize = charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 *p_logical_charsize = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001723
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001724 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001725 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001726 "Buffer is NULL");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001727 goto err;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001728 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001729 return ptr;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001730 err:
1731 PyBuffer_Release(view);
1732 view->buf = NULL;
1733 return NULL;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001734}
1735
1736LOCAL(PyObject*)
1737state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001738 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001739{
1740 /* prepare state object */
1741
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001742 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 int logical_charsize, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001744 void* ptr;
1745
1746 memset(state, 0, sizeof(SRE_STATE));
1747
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001748 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001749 state->lastindex = -1;
1750
Benjamin Petersone48944b2012-03-07 14:50:25 -06001751 state->buffer.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001752 ptr = getstring(string, &length, &logical_charsize, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001753 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -06001754 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001755
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001756 if (logical_charsize == 1 && pattern->logical_charsize > 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001757 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001758 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001759 goto err;
1760 }
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001761 if (logical_charsize > 1 && pattern->logical_charsize == 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001762 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001763 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001764 goto err;
1765 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001766
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001767 /* adjust boundaries */
1768 if (start < 0)
1769 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001770 else if (start > length)
1771 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001772
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001773 if (end < 0)
1774 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001775 else if (end > length)
1776 end = length;
1777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 state->logical_charsize = logical_charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001779 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001780
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001781 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001782
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001783 state->start = (void*) ((char*) ptr + start * state->charsize);
1784 state->end = (void*) ((char*) ptr + end * state->charsize);
1785
1786 Py_INCREF(string);
1787 state->string = string;
1788 state->pos = start;
1789 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001790
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001791 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001792 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001793 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001794 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001795 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001796 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001797
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001798 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001799 err:
1800 if (state->buffer.buf)
1801 PyBuffer_Release(&state->buffer);
1802 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001803}
1804
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001805LOCAL(void)
1806state_fini(SRE_STATE* state)
1807{
Benjamin Petersone48944b2012-03-07 14:50:25 -06001808 if (state->buffer.buf)
1809 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001810 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001811 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001812}
1813
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001814/* calculate offset from start of string */
1815#define STATE_OFFSET(state, member)\
1816 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1817
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001818LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001819state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001820{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001821 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001822
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001823 index = (index - 1) * 2;
1824
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001825 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001826 if (empty)
1827 /* want empty string */
1828 i = j = 0;
1829 else {
1830 Py_INCREF(Py_None);
1831 return Py_None;
1832 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001833 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001834 i = STATE_OFFSET(state, state->mark[index]);
1835 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001836 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001837
Fredrik Lundh58100642000-08-09 09:14:35 +00001838 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001839}
1840
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001841static void
1842pattern_error(int status)
1843{
1844 switch (status) {
1845 case SRE_ERROR_RECURSION_LIMIT:
1846 PyErr_SetString(
1847 PyExc_RuntimeError,
1848 "maximum recursion limit exceeded"
1849 );
1850 break;
1851 case SRE_ERROR_MEMORY:
1852 PyErr_NoMemory();
1853 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001854 case SRE_ERROR_INTERRUPTED:
1855 /* An exception has already been raised, so let it fly */
1856 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001857 default:
1858 /* other error codes indicate compiler/engine bugs */
1859 PyErr_SetString(
1860 PyExc_RuntimeError,
1861 "internal error in regular expression engine"
1862 );
1863 }
1864}
1865
Guido van Rossumb700df92000-03-31 14:59:30 +00001866static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001867pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001868{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001869 if (self->weakreflist != NULL)
1870 PyObject_ClearWeakRefs((PyObject *) self);
Benjamin Petersone48944b2012-03-07 14:50:25 -06001871 if (self->view.buf)
1872 PyBuffer_Release(&self->view);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001873 Py_XDECREF(self->pattern);
1874 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001875 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001876 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001877}
1878
1879static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001880pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001881{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001882 SRE_STATE state;
1883 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001884
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001885 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001886 Py_ssize_t start = 0;
1887 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001888 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001889 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001890 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001891 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001892
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001893 string = state_init(&state, self, string, start, end);
1894 if (!string)
1895 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001896
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001897 state.ptr = state.start;
1898
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001899 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 if (state.logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001902 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001903 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001904 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001905 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001906
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001907 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001908 if (PyErr_Occurred())
1909 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001911 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001912
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001913 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001914}
1915
1916static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001917pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001918{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001919 SRE_STATE state;
1920 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001921
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001922 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001923 Py_ssize_t start = 0;
1924 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001925 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001926 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001927 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001928 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001929
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001930 string = state_init(&state, self, string, start, end);
1931 if (!string)
1932 return NULL;
1933
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001934 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001937 status = sre_search(&state, PatternObject_GetCode(self));
1938 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001939 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001940 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001941
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001942 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1943
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001944 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001945
Thomas Wouters89f507f2006-12-13 04:49:30 +00001946 if (PyErr_Occurred())
1947 return NULL;
1948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001949 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001950}
1951
1952static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001953call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001954{
1955 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001956 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957 PyObject* func;
1958 PyObject* result;
1959
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001960 if (!args)
1961 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001962 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001963 if (!name)
1964 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001965 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001966 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001967 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001968 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001969 func = PyObject_GetAttrString(mod, function);
1970 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001971 if (!func)
1972 return NULL;
1973 result = PyObject_CallObject(func, args);
1974 Py_DECREF(func);
1975 Py_DECREF(args);
1976 return result;
1977}
1978
#ifdef USE_BUILTIN_COPY
/* Replace *object in place with copy.deepcopy(*object, memo).

   On success the old reference is released and 1 is returned.  On
   failure *object is left untouched and 0 is returned. */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* call_args = PyTuple_Pack(2, *object, memo);
    PyObject* replacement = call("copy", "deepcopy", call_args);

    if (replacement == NULL) {
        return 0;
    }

    Py_DECREF(*object);
    *object = replacement;
    return 1; /* success */
}
#endif
1998
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001999static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002000join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002001{
2002 /* join list elements */
2003
2004 PyObject* joiner;
2005#if PY_VERSION_HEX >= 0x01060000
2006 PyObject* function;
2007 PyObject* args;
2008#endif
2009 PyObject* result;
2010
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002011 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002012 if (!joiner)
2013 return NULL;
2014
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002015 if (PyList_GET_SIZE(list) == 0) {
2016 Py_DECREF(list);
2017 return joiner;
2018 }
2019
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002020#if PY_VERSION_HEX >= 0x01060000
2021 function = PyObject_GetAttrString(joiner, "join");
2022 if (!function) {
2023 Py_DECREF(joiner);
2024 return NULL;
2025 }
2026 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002027 if (!args) {
2028 Py_DECREF(function);
2029 Py_DECREF(joiner);
2030 return NULL;
2031 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002032 PyTuple_SET_ITEM(args, 0, list);
2033 result = PyObject_CallObject(function, args);
2034 Py_DECREF(args); /* also removes list */
2035 Py_DECREF(function);
2036#else
2037 result = call(
2038 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002039 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002040 );
2041#endif
2042 Py_DECREF(joiner);
2043
2044 return result;
2045}
2046
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002047static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002048pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002049{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 SRE_STATE state;
2051 PyObject* list;
2052 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002053 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002054
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002056 Py_ssize_t start = 0;
2057 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002058 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002059 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002060 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002061 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002062
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 string = state_init(&state, self, string, start, end);
2064 if (!string)
2065 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002066
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002067 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002068 if (!list) {
2069 state_fini(&state);
2070 return NULL;
2071 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002072
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002074
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002075 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002076
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002077 state_reset(&state);
2078
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002079 state.ptr = state.start;
2080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002082 status = sre_search(&state, PatternObject_GetCode(self));
2083 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002084 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002085 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002086
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002087 if (PyErr_Occurred())
2088 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002089
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002090 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002091 if (status == 0)
2092 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002093 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002094 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002095 }
Tim Peters3d563502006-01-21 02:47:53 +00002096
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002097 /* don't bother to build a match object */
2098 switch (self->groups) {
2099 case 0:
2100 b = STATE_OFFSET(&state, state.start);
2101 e = STATE_OFFSET(&state, state.ptr);
2102 item = PySequence_GetSlice(string, b, e);
2103 if (!item)
2104 goto error;
2105 break;
2106 case 1:
2107 item = state_getslice(&state, 1, string, 1);
2108 if (!item)
2109 goto error;
2110 break;
2111 default:
2112 item = PyTuple_New(self->groups);
2113 if (!item)
2114 goto error;
2115 for (i = 0; i < self->groups; i++) {
2116 PyObject* o = state_getslice(&state, i+1, string, 1);
2117 if (!o) {
2118 Py_DECREF(item);
2119 goto error;
2120 }
2121 PyTuple_SET_ITEM(item, i, o);
2122 }
2123 break;
2124 }
2125
2126 status = PyList_Append(list, item);
2127 Py_DECREF(item);
2128 if (status < 0)
2129 goto error;
2130
2131 if (state.ptr == state.start)
2132 state.start = (void*) ((char*) state.ptr + state.charsize);
2133 else
2134 state.start = state.ptr;
2135
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002136 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002137
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002138 state_fini(&state);
2139 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002140
2141error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002142 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002143 state_fini(&state);
2144 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002145
Guido van Rossumb700df92000-03-31 14:59:30 +00002146}
2147
#if PY_VERSION_HEX >= 0x02020000
/* pattern.finditer(...)

   Build a scanner for the given arguments and wrap its bound "search"
   method in a callable-iterator that stops when the method returns
   None.  Returns a new reference, or NULL with an exception set. */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
{
    PyObject* iterator = NULL;
    PyObject* search;
    PyObject* scanner = pattern_scanner(pattern, args, kw);

    if (scanner != NULL) {
        search = PyObject_GetAttrString(scanner, "search");
        Py_DECREF(scanner);

        if (search != NULL) {
            iterator = PyCallIter_New(search, Py_None);
            Py_DECREF(search);
        }
    }

    return iterator;
}
#endif
2171
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002172static PyObject*
2173pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2174{
2175 SRE_STATE state;
2176 PyObject* list;
2177 PyObject* item;
2178 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002179 Py_ssize_t n;
2180 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002181 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002182
2183 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002184 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002185 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002186 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002187 &string, &maxsplit))
2188 return NULL;
2189
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002190 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002191 if (!string)
2192 return NULL;
2193
2194 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002195 if (!list) {
2196 state_fini(&state);
2197 return NULL;
2198 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002199
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002200 n = 0;
2201 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002202
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002203 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002204
2205 state_reset(&state);
2206
2207 state.ptr = state.start;
2208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 if (state.logical_charsize == 1) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002210 status = sre_search(&state, PatternObject_GetCode(self));
2211 } else {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002212 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002213 }
2214
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002215 if (PyErr_Occurred())
2216 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002217
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002218 if (status <= 0) {
2219 if (status == 0)
2220 break;
2221 pattern_error(status);
2222 goto error;
2223 }
Tim Peters3d563502006-01-21 02:47:53 +00002224
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002225 if (state.start == state.ptr) {
2226 if (last == state.end)
2227 break;
2228 /* skip one character */
2229 state.start = (void*) ((char*) state.ptr + state.charsize);
2230 continue;
2231 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002232
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002233 /* get segment before this match */
2234 item = PySequence_GetSlice(
2235 string, STATE_OFFSET(&state, last),
2236 STATE_OFFSET(&state, state.start)
2237 );
2238 if (!item)
2239 goto error;
2240 status = PyList_Append(list, item);
2241 Py_DECREF(item);
2242 if (status < 0)
2243 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002244
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002245 /* add groups (if any) */
2246 for (i = 0; i < self->groups; i++) {
2247 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002248 if (!item)
2249 goto error;
2250 status = PyList_Append(list, item);
2251 Py_DECREF(item);
2252 if (status < 0)
2253 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002254 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002255
2256 n = n + 1;
2257
2258 last = state.start = state.ptr;
2259
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002260 }
2261
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002262 /* get segment following last match (even if empty) */
2263 item = PySequence_GetSlice(
2264 string, STATE_OFFSET(&state, last), state.endpos
2265 );
2266 if (!item)
2267 goto error;
2268 status = PyList_Append(list, item);
2269 Py_DECREF(item);
2270 if (status < 0)
2271 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002272
2273 state_fini(&state);
2274 return list;
2275
2276error:
2277 Py_DECREF(list);
2278 state_fini(&state);
2279 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002280
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002281}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002282
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002283static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002284pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002285 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002286{
2287 SRE_STATE state;
2288 PyObject* list;
2289 PyObject* item;
2290 PyObject* filter;
2291 PyObject* args;
2292 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002293 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002294 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002295 Py_ssize_t n;
2296 Py_ssize_t i, b, e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297 int logical_charsize, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002298 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002299 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002300
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002301 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002302 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002303 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002304 Py_INCREF(filter);
2305 filter_is_callable = 1;
2306 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002307 /* if not callable, check if it's a literal string */
2308 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002309 view.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002310 ptr = getstring(ptemplate, &n, &logical_charsize, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002312 if (ptr) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 literal = sre_literal_template(b, ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002314 } else {
2315 PyErr_Clear();
2316 literal = 0;
2317 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06002318 if (view.buf)
2319 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002320 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002321 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002322 Py_INCREF(filter);
2323 filter_is_callable = 0;
2324 } else {
2325 /* not a literal; hand it over to the template compiler */
2326 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002327 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002328 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002329 );
2330 if (!filter)
2331 return NULL;
2332 filter_is_callable = PyCallable_Check(filter);
2333 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002334 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002335
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002336 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002337 if (!string) {
2338 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002339 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002340 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002341
2342 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002343 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002344 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002345 state_fini(&state);
2346 return NULL;
2347 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002348
2349 n = i = 0;
2350
2351 while (!count || n < count) {
2352
2353 state_reset(&state);
2354
2355 state.ptr = state.start;
2356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357 if (state.logical_charsize == 1) {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002358 status = sre_search(&state, PatternObject_GetCode(self));
2359 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002360 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002361 }
2362
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002363 if (PyErr_Occurred())
2364 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002365
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002366 if (status <= 0) {
2367 if (status == 0)
2368 break;
2369 pattern_error(status);
2370 goto error;
2371 }
Tim Peters3d563502006-01-21 02:47:53 +00002372
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002373 b = STATE_OFFSET(&state, state.start);
2374 e = STATE_OFFSET(&state, state.ptr);
2375
2376 if (i < b) {
2377 /* get segment before this match */
2378 item = PySequence_GetSlice(string, i, b);
2379 if (!item)
2380 goto error;
2381 status = PyList_Append(list, item);
2382 Py_DECREF(item);
2383 if (status < 0)
2384 goto error;
2385
2386 } else if (i == b && i == e && n > 0)
2387 /* ignore empty match on latest position */
2388 goto next;
2389
2390 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002391 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002392 match = pattern_new_match(self, &state, 1);
2393 if (!match)
2394 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002395 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002396 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002397 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002398 goto error;
2399 }
2400 item = PyObject_CallObject(filter, args);
2401 Py_DECREF(args);
2402 Py_DECREF(match);
2403 if (!item)
2404 goto error;
2405 } else {
2406 /* filter is literal string */
2407 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002408 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002409 }
2410
2411 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002412 if (item != Py_None) {
2413 status = PyList_Append(list, item);
2414 Py_DECREF(item);
2415 if (status < 0)
2416 goto error;
2417 }
Tim Peters3d563502006-01-21 02:47:53 +00002418
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002419 i = e;
2420 n = n + 1;
2421
2422next:
2423 /* move on */
2424 if (state.ptr == state.start)
2425 state.start = (void*) ((char*) state.ptr + state.charsize);
2426 else
2427 state.start = state.ptr;
2428
2429 }
2430
2431 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002432 if (i < state.endpos) {
2433 item = PySequence_GetSlice(string, i, state.endpos);
2434 if (!item)
2435 goto error;
2436 status = PyList_Append(list, item);
2437 Py_DECREF(item);
2438 if (status < 0)
2439 goto error;
2440 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002441
2442 state_fini(&state);
2443
Guido van Rossum4e173842001-12-07 04:25:10 +00002444 Py_DECREF(filter);
2445
Fredrik Lundhdac58492001-10-21 21:48:30 +00002446 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002447 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002448
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002449 if (!item)
2450 return NULL;
2451
2452 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002453 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002454
2455 return item;
2456
2457error:
2458 Py_DECREF(list);
2459 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002460 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002461 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002462
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002463}
2464
2465static PyObject*
2466pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2467{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002468 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002469 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002470 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002471 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002472 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002473 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002474 return NULL;
2475
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002476 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002477}
2478
2479static PyObject*
2480pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2481{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002482 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002483 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002484 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002485 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002486 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002487 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002488 return NULL;
2489
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002490 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002491}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002492
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002493static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002494pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002495{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002496#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002497 PatternObject* copy;
2498 int offset;
2499
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002500 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2501 if (!copy)
2502 return NULL;
2503
2504 offset = offsetof(PatternObject, groups);
2505
2506 Py_XINCREF(self->groupindex);
2507 Py_XINCREF(self->indexgroup);
2508 Py_XINCREF(self->pattern);
2509
2510 memcpy((char*) copy + offset, (char*) self + offset,
2511 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002512 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002513
2514 return (PyObject*) copy;
2515#else
2516 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2517 return NULL;
2518#endif
2519}
2520
2521static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002522pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002523{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002524#ifdef USE_BUILTIN_COPY
2525 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002526
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002527 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002528 if (!copy)
2529 return NULL;
2530
2531 if (!deepcopy(&copy->groupindex, memo) ||
2532 !deepcopy(&copy->indexgroup, memo) ||
2533 !deepcopy(&copy->pattern, memo)) {
2534 Py_DECREF(copy);
2535 return NULL;
2536 }
2537
2538#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002539 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2540 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002541#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002542}
2543
/* Docstrings for the pattern object: one per method entry in the
   pattern_methods table, plus pattern_doc for the type itself.  The text on
   each continuation line (including its leading whitespace) is part of the
   string that is exposed at runtime. */
Raymond Hettinger94478742004-09-24 04:31:19 +00002544PyDoc_STRVAR(pattern_match_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002545"match(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002546 Matches zero or more characters at the beginning of the string");
2547
2548PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002549"search(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002550 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02002551 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002552
2553PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002554"split(string[, maxsplit = 0]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002555 Split string by the occurrences of pattern.");
2556
2557PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002558"findall(string[, pos[, endpos]]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002559 Return a list of all non-overlapping matches of pattern in string.");
2560
2561PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002562"finditer(string[, pos[, endpos]]) -> iterator.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002563 Return an iterator over all non-overlapping matches for the \n\
2564 RE pattern in string. For each match, the iterator returns a\n\
2565 match object.");
2566
2567PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002568"sub(repl, string[, count = 0]) -> newstring.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002569 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002570 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002571
2572PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002573"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002574 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2575 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002576 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002577
2578PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2579
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002580static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002581 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002582 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002583 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002584 pattern_search_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002585 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002586 pattern_sub_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002587 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002588 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002589 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002590 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002591 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002592 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002593#if PY_VERSION_HEX >= 0x02020000
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002594 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002595 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002596#endif
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002597 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002598 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2599 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002600 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002601};
2602
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002603#define PAT_OFF(x) offsetof(PatternObject, x)
2604static PyMemberDef pattern_members[] = {
2605 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2606 {"flags", T_INT, PAT_OFF(flags), READONLY},
2607 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2608 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2609 {NULL} /* Sentinel */
2610};
Guido van Rossumb700df92000-03-31 14:59:30 +00002611
Neal Norwitz57c179c2006-03-22 07:18:02 +00002612static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002613 PyVarObject_HEAD_INIT(NULL, 0)
2614 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002615 sizeof(PatternObject), sizeof(SRE_CODE),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002616 (destructor)pattern_dealloc, /* tp_dealloc */
2617 0, /* tp_print */
2618 0, /* tp_getattr */
2619 0, /* tp_setattr */
2620 0, /* tp_reserved */
2621 0, /* tp_repr */
2622 0, /* tp_as_number */
2623 0, /* tp_as_sequence */
2624 0, /* tp_as_mapping */
2625 0, /* tp_hash */
2626 0, /* tp_call */
2627 0, /* tp_str */
2628 0, /* tp_getattro */
2629 0, /* tp_setattro */
2630 0, /* tp_as_buffer */
2631 Py_TPFLAGS_DEFAULT, /* tp_flags */
2632 pattern_doc, /* tp_doc */
2633 0, /* tp_traverse */
2634 0, /* tp_clear */
2635 0, /* tp_richcompare */
2636 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2637 0, /* tp_iter */
2638 0, /* tp_iternext */
2639 pattern_methods, /* tp_methods */
2640 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002641};
2642
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002643static int _validate(PatternObject *self); /* Forward */
2644
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002645static PyObject *
2646_compile(PyObject* self_, PyObject* args)
2647{
2648 /* "compile" pattern descriptor to pattern object */
2649
2650 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002651 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002652
2653 PyObject* pattern;
2654 int flags = 0;
2655 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002656 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002657 PyObject* groupindex = NULL;
2658 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002659
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002660 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002661 &PyList_Type, &code, &groups,
2662 &groupindex, &indexgroup))
2663 return NULL;
2664
2665 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002666 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002667 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2668 if (!self)
2669 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002670 self->weakreflist = NULL;
2671 self->pattern = NULL;
2672 self->groupindex = NULL;
2673 self->indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002674 self->view.buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002675
2676 self->codesize = n;
2677
2678 for (i = 0; i < n; i++) {
2679 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002680 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002681 self->code[i] = (SRE_CODE) value;
2682 if ((unsigned long) self->code[i] != value) {
2683 PyErr_SetString(PyExc_OverflowError,
2684 "regular expression code size limit exceeded");
2685 break;
2686 }
2687 }
2688
2689 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002690 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002691 return NULL;
2692 }
2693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 if (pattern == Py_None) {
2695 self->logical_charsize = -1;
2696 self->charsize = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01002697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 else {
2699 Py_ssize_t p_length;
2700 if (!getstring(pattern, &p_length, &self->logical_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002701 &self->charsize, &self->view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 Py_DECREF(self);
2703 return NULL;
2704 }
2705 }
Antoine Pitroufd036452008-08-19 17:56:33 +00002706
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002707 Py_INCREF(pattern);
2708 self->pattern = pattern;
2709
2710 self->flags = flags;
2711
2712 self->groups = groups;
2713
2714 Py_XINCREF(groupindex);
2715 self->groupindex = groupindex;
2716
2717 Py_XINCREF(indexgroup);
2718 self->indexgroup = indexgroup;
2719
2720 self->weakreflist = NULL;
2721
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002722 if (!_validate(self)) {
2723 Py_DECREF(self);
2724 return NULL;
2725 }
2726
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002727 return (PyObject*) self;
2728}
2729
Guido van Rossumb700df92000-03-31 14:59:30 +00002730/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002731/* Code validation */
2732
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
   bytes wide (the latter if Python is compiled for "wide" unicode support).
*/
2755
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete, parenthesized
   printf argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the enclosing function return 0 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on variables of the enclosing function: `code`
   (current position, advanced by one item), `end` (one past the last item
   that may be read) and the destination `op`, `arg` or `skip`.  Each of
   them FAILs instead of reading at or beyond `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Like GET_ARG, but additionally FAILs when the skip count, reduced by
   `adj`, exceeds the number of items left between the skip word and `end`.
   `adj` is parenthesized so that any expression can be passed safely. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > end-code)                      \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002796
2797static int
2798_validate_charset(SRE_CODE *code, SRE_CODE *end)
2799{
2800 /* Some variables are manipulated by the macros above */
2801 SRE_CODE op;
2802 SRE_CODE arg;
2803 SRE_CODE offset;
2804 int i;
2805
2806 while (code < end) {
2807 GET_OP;
2808 switch (op) {
2809
2810 case SRE_OP_NEGATE:
2811 break;
2812
2813 case SRE_OP_LITERAL:
2814 GET_ARG;
2815 break;
2816
2817 case SRE_OP_RANGE:
2818 GET_ARG;
2819 GET_ARG;
2820 break;
2821
2822 case SRE_OP_CHARSET:
2823 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002824 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002825 FAIL;
2826 code += offset;
2827 break;
2828
2829 case SRE_OP_BIGCHARSET:
2830 GET_ARG; /* Number of blocks */
2831 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002832 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002833 FAIL;
2834 /* Make sure that each byte points to a valid block */
2835 for (i = 0; i < 256; i++) {
2836 if (((unsigned char *)code)[i] >= arg)
2837 FAIL;
2838 }
2839 code += offset;
2840 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002841 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002842 FAIL;
2843 code += offset;
2844 break;
2845
2846 case SRE_OP_CATEGORY:
2847 GET_ARG;
2848 switch (arg) {
2849 case SRE_CATEGORY_DIGIT:
2850 case SRE_CATEGORY_NOT_DIGIT:
2851 case SRE_CATEGORY_SPACE:
2852 case SRE_CATEGORY_NOT_SPACE:
2853 case SRE_CATEGORY_WORD:
2854 case SRE_CATEGORY_NOT_WORD:
2855 case SRE_CATEGORY_LINEBREAK:
2856 case SRE_CATEGORY_NOT_LINEBREAK:
2857 case SRE_CATEGORY_LOC_WORD:
2858 case SRE_CATEGORY_LOC_NOT_WORD:
2859 case SRE_CATEGORY_UNI_DIGIT:
2860 case SRE_CATEGORY_UNI_NOT_DIGIT:
2861 case SRE_CATEGORY_UNI_SPACE:
2862 case SRE_CATEGORY_UNI_NOT_SPACE:
2863 case SRE_CATEGORY_UNI_WORD:
2864 case SRE_CATEGORY_UNI_NOT_WORD:
2865 case SRE_CATEGORY_UNI_LINEBREAK:
2866 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2867 break;
2868 default:
2869 FAIL;
2870 }
2871 break;
2872
2873 default:
2874 FAIL;
2875
2876 }
2877 }
2878
2879 return 1;
2880}
2881
2882static int
2883_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2884{
2885 /* Some variables are manipulated by the macros above */
2886 SRE_CODE op;
2887 SRE_CODE arg;
2888 SRE_CODE skip;
2889
2890 VTRACE(("code=%p, end=%p\n", code, end));
2891
2892 if (code > end)
2893 FAIL;
2894
2895 while (code < end) {
2896 GET_OP;
2897 switch (op) {
2898
2899 case SRE_OP_MARK:
2900 /* We don't check whether marks are properly nested; the
2901 sre_match() code is robust even if they don't, and the worst
2902 you can get is nonsensical match results. */
2903 GET_ARG;
2904 if (arg > 2*groups+1) {
2905 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2906 FAIL;
2907 }
2908 break;
2909
2910 case SRE_OP_LITERAL:
2911 case SRE_OP_NOT_LITERAL:
2912 case SRE_OP_LITERAL_IGNORE:
2913 case SRE_OP_NOT_LITERAL_IGNORE:
2914 GET_ARG;
2915 /* The arg is just a character, nothing to check */
2916 break;
2917
2918 case SRE_OP_SUCCESS:
2919 case SRE_OP_FAILURE:
2920 /* Nothing to check; these normally end the matching process */
2921 break;
2922
2923 case SRE_OP_AT:
2924 GET_ARG;
2925 switch (arg) {
2926 case SRE_AT_BEGINNING:
2927 case SRE_AT_BEGINNING_STRING:
2928 case SRE_AT_BEGINNING_LINE:
2929 case SRE_AT_END:
2930 case SRE_AT_END_LINE:
2931 case SRE_AT_END_STRING:
2932 case SRE_AT_BOUNDARY:
2933 case SRE_AT_NON_BOUNDARY:
2934 case SRE_AT_LOC_BOUNDARY:
2935 case SRE_AT_LOC_NON_BOUNDARY:
2936 case SRE_AT_UNI_BOUNDARY:
2937 case SRE_AT_UNI_NON_BOUNDARY:
2938 break;
2939 default:
2940 FAIL;
2941 }
2942 break;
2943
2944 case SRE_OP_ANY:
2945 case SRE_OP_ANY_ALL:
2946 /* These have no operands */
2947 break;
2948
2949 case SRE_OP_IN:
2950 case SRE_OP_IN_IGNORE:
2951 GET_SKIP;
2952 /* Stop 1 before the end; we check the FAILURE below */
2953 if (!_validate_charset(code, code+skip-2))
2954 FAIL;
2955 if (code[skip-2] != SRE_OP_FAILURE)
2956 FAIL;
2957 code += skip-1;
2958 break;
2959
2960 case SRE_OP_INFO:
2961 {
2962 /* A minimal info field is
2963 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2964 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2965 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02002966 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002967 SRE_CODE *newcode;
2968 GET_SKIP;
2969 newcode = code+skip-1;
2970 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02002971 GET_ARG;
2972 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002973 /* Check that only valid flags are present */
2974 if ((flags & ~(SRE_INFO_PREFIX |
2975 SRE_INFO_LITERAL |
2976 SRE_INFO_CHARSET)) != 0)
2977 FAIL;
2978 /* PREFIX and CHARSET are mutually exclusive */
2979 if ((flags & SRE_INFO_PREFIX) &&
2980 (flags & SRE_INFO_CHARSET))
2981 FAIL;
2982 /* LITERAL implies PREFIX */
2983 if ((flags & SRE_INFO_LITERAL) &&
2984 !(flags & SRE_INFO_PREFIX))
2985 FAIL;
2986 /* Validate the prefix */
2987 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02002988 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002989 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02002990 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002991 /* Here comes the prefix string */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002992 if (prefix_len > newcode-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002993 FAIL;
2994 code += prefix_len;
2995 /* And here comes the overlap table */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002996 if (prefix_len > newcode-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002997 FAIL;
2998 /* Each overlap value should be < prefix_len */
2999 for (i = 0; i < prefix_len; i++) {
3000 if (code[i] >= prefix_len)
3001 FAIL;
3002 }
3003 code += prefix_len;
3004 }
3005 /* Validate the charset */
3006 if (flags & SRE_INFO_CHARSET) {
3007 if (!_validate_charset(code, newcode-1))
3008 FAIL;
3009 if (newcode[-1] != SRE_OP_FAILURE)
3010 FAIL;
3011 code = newcode;
3012 }
3013 else if (code != newcode) {
3014 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3015 FAIL;
3016 }
3017 }
3018 break;
3019
3020 case SRE_OP_BRANCH:
3021 {
3022 SRE_CODE *target = NULL;
3023 for (;;) {
3024 GET_SKIP;
3025 if (skip == 0)
3026 break;
3027 /* Stop 2 before the end; we check the JUMP below */
3028 if (!_validate_inner(code, code+skip-3, groups))
3029 FAIL;
3030 code += skip-3;
3031 /* Check that it ends with a JUMP, and that each JUMP
3032 has the same target */
3033 GET_OP;
3034 if (op != SRE_OP_JUMP)
3035 FAIL;
3036 GET_SKIP;
3037 if (target == NULL)
3038 target = code+skip-1;
3039 else if (code+skip-1 != target)
3040 FAIL;
3041 }
3042 }
3043 break;
3044
3045 case SRE_OP_REPEAT_ONE:
3046 case SRE_OP_MIN_REPEAT_ONE:
3047 {
3048 SRE_CODE min, max;
3049 GET_SKIP;
3050 GET_ARG; min = arg;
3051 GET_ARG; max = arg;
3052 if (min > max)
3053 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003054 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003055 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003056 if (!_validate_inner(code, code+skip-4, groups))
3057 FAIL;
3058 code += skip-4;
3059 GET_OP;
3060 if (op != SRE_OP_SUCCESS)
3061 FAIL;
3062 }
3063 break;
3064
3065 case SRE_OP_REPEAT:
3066 {
3067 SRE_CODE min, max;
3068 GET_SKIP;
3069 GET_ARG; min = arg;
3070 GET_ARG; max = arg;
3071 if (min > max)
3072 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003073 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003074 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003075 if (!_validate_inner(code, code+skip-3, groups))
3076 FAIL;
3077 code += skip-3;
3078 GET_OP;
3079 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3080 FAIL;
3081 }
3082 break;
3083
3084 case SRE_OP_GROUPREF:
3085 case SRE_OP_GROUPREF_IGNORE:
3086 GET_ARG;
3087 if (arg >= groups)
3088 FAIL;
3089 break;
3090
3091 case SRE_OP_GROUPREF_EXISTS:
3092 /* The regex syntax for this is: '(?(group)then|else)', where
3093 'group' is either an integer group number or a group name,
3094 'then' and 'else' are sub-regexes, and 'else' is optional. */
3095 GET_ARG;
3096 if (arg >= groups)
3097 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00003098 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003099 code--; /* The skip is relative to the first arg! */
3100 /* There are two possibilities here: if there is both a 'then'
3101 part and an 'else' part, the generated code looks like:
3102
3103 GROUPREF_EXISTS
3104 <group>
3105 <skipyes>
3106 ...then part...
3107 JUMP
3108 <skipno>
3109 (<skipyes> jumps here)
3110 ...else part...
3111 (<skipno> jumps here)
3112
3113 If there is only a 'then' part, it looks like:
3114
3115 GROUPREF_EXISTS
3116 <group>
3117 <skip>
3118 ...then part...
3119 (<skip> jumps here)
3120
3121 There is no direct way to decide which it is, and we don't want
3122 to allow arbitrary jumps anywhere in the code; so we just look
3123 for a JUMP opcode preceding our skip target.
3124 */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03003125 if (skip >= 3 && skip-3 < end-code &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003126 code[skip-3] == SRE_OP_JUMP)
3127 {
3128 VTRACE(("both then and else parts present\n"));
3129 if (!_validate_inner(code+1, code+skip-3, groups))
3130 FAIL;
3131 code += skip-2; /* Position after JUMP, at <skipno> */
3132 GET_SKIP;
3133 if (!_validate_inner(code, code+skip-1, groups))
3134 FAIL;
3135 code += skip-1;
3136 }
3137 else {
3138 VTRACE(("only a then part present\n"));
3139 if (!_validate_inner(code+1, code+skip-1, groups))
3140 FAIL;
3141 code += skip-1;
3142 }
3143 break;
3144
3145 case SRE_OP_ASSERT:
3146 case SRE_OP_ASSERT_NOT:
3147 GET_SKIP;
3148 GET_ARG; /* 0 for lookahead, width for lookbehind */
3149 code--; /* Back up over arg to simplify math below */
3150 if (arg & 0x80000000)
3151 FAIL; /* Width too large */
3152 /* Stop 1 before the end; we check the SUCCESS below */
3153 if (!_validate_inner(code+1, code+skip-2, groups))
3154 FAIL;
3155 code += skip-2;
3156 GET_OP;
3157 if (op != SRE_OP_SUCCESS)
3158 FAIL;
3159 break;
3160
3161 default:
3162 FAIL;
3163
3164 }
3165 }
3166
3167 VTRACE(("okay\n"));
3168 return 1;
3169}
3170
3171static int
3172_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3173{
3174 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3175 FAIL;
3176 if (groups == 0) /* fix for simplejson */
3177 groups = 100; /* 100 groups should always be safe */
3178 return _validate_inner(code, end-1, groups);
3179}
3180
3181static int
3182_validate(PatternObject *self)
3183{
3184 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3185 {
3186 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3187 return 0;
3188 }
3189 else
3190 VTRACE(("Success!\n"));
3191 return 1;
3192}
3193
3194/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003195/* match methods */
3196
3197static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003198match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003199{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003200 Py_XDECREF(self->regs);
3201 Py_XDECREF(self->string);
3202 Py_DECREF(self->pattern);
3203 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003204}
3205
3206static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003207match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003208{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003209 if (index < 0 || index >= self->groups) {
3210 /* raise IndexError if we were given a bad group number */
3211 PyErr_SetString(
3212 PyExc_IndexError,
3213 "no such group"
3214 );
3215 return NULL;
3216 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003217
Fredrik Lundh6f013982000-07-03 18:44:21 +00003218 index *= 2;
3219
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003220 if (self->string == Py_None || self->mark[index] < 0) {
3221 /* return default value if the string or group is undefined */
3222 Py_INCREF(def);
3223 return def;
3224 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003225
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003226 return PySequence_GetSlice(
3227 self->string, self->mark[index], self->mark[index+1]
3228 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003229}
3230
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003231static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003232match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003233{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003234 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003235
Guido van Rossumddefaf32007-01-14 03:31:43 +00003236 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003237 /* Default value */
3238 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00003239
Christian Heimes217cfd12007-12-02 14:31:20 +00003240 if (PyLong_Check(index))
3241 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003242
Fredrik Lundh6f013982000-07-03 18:44:21 +00003243 i = -1;
3244
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003245 if (self->pattern->groupindex) {
3246 index = PyObject_GetItem(self->pattern->groupindex, index);
3247 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003248 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003249 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003250 Py_DECREF(index);
3251 } else
3252 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003253 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003254
3255 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003256}
3257
3258static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003259match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003260{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003261 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003262}
3263
3264static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003265match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003266{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003267 /* delegate to Python code */
3268 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003269 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003270 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003271 );
3272}
3273
3274static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003275match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003276{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003277 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003278 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003279
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003280 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003281
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003282 switch (size) {
3283 case 0:
3284 result = match_getslice(self, Py_False, Py_None);
3285 break;
3286 case 1:
3287 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3288 break;
3289 default:
3290 /* fetch multiple items */
3291 result = PyTuple_New(size);
3292 if (!result)
3293 return NULL;
3294 for (i = 0; i < size; i++) {
3295 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003296 self, PyTuple_GET_ITEM(args, i), Py_None
3297 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003298 if (!item) {
3299 Py_DECREF(result);
3300 return NULL;
3301 }
3302 PyTuple_SET_ITEM(result, i, item);
3303 }
3304 break;
3305 }
3306 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003307}
3308
3309static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003310match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003311{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003312 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003313 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003315 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003316 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003317 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003318 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003319
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003320 result = PyTuple_New(self->groups-1);
3321 if (!result)
3322 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003323
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003324 for (index = 1; index < self->groups; index++) {
3325 PyObject* item;
3326 item = match_getslice_by_index(self, index, def);
3327 if (!item) {
3328 Py_DECREF(result);
3329 return NULL;
3330 }
3331 PyTuple_SET_ITEM(result, index-1, item);
3332 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003333
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003334 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003335}
3336
3337static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003338match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003339{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003340 PyObject* result;
3341 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003342 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003344 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003345 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003346 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003347 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003349 result = PyDict_New();
3350 if (!result || !self->pattern->groupindex)
3351 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003353 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003354 if (!keys)
3355 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003356
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003357 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003358 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003359 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003360 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003361 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003362 if (!key)
3363 goto failed;
3364 value = match_getslice(self, key, def);
3365 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003366 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003367 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003368 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003369 status = PyDict_SetItem(result, key, value);
3370 Py_DECREF(value);
3371 if (status < 0)
3372 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003373 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003374
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003375 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003376
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003377 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003378
3379failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003380 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003381 Py_DECREF(result);
3382 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003383}
3384
3385static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003386match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003387{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003388 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003389
Guido van Rossumddefaf32007-01-14 03:31:43 +00003390 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003391 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003392 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003393
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003394 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003395
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003396 if (index < 0 || index >= self->groups) {
3397 PyErr_SetString(
3398 PyExc_IndexError,
3399 "no such group"
3400 );
3401 return NULL;
3402 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003403
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003404 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003405 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003406}
3407
3408static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003409match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003410{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003411 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003412
Guido van Rossumddefaf32007-01-14 03:31:43 +00003413 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003414 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003415 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003416
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003417 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003418
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003419 if (index < 0 || index >= self->groups) {
3420 PyErr_SetString(
3421 PyExc_IndexError,
3422 "no such group"
3423 );
3424 return NULL;
3425 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003426
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003427 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003428 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003429}
3430
3431LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003432_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003433{
3434 PyObject* pair;
3435 PyObject* item;
3436
3437 pair = PyTuple_New(2);
3438 if (!pair)
3439 return NULL;
3440
Christian Heimes217cfd12007-12-02 14:31:20 +00003441 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003442 if (!item)
3443 goto error;
3444 PyTuple_SET_ITEM(pair, 0, item);
3445
Christian Heimes217cfd12007-12-02 14:31:20 +00003446 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003447 if (!item)
3448 goto error;
3449 PyTuple_SET_ITEM(pair, 1, item);
3450
3451 return pair;
3452
3453 error:
3454 Py_DECREF(pair);
3455 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003456}
3457
3458static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003459match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003460{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003461 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003462
Guido van Rossumddefaf32007-01-14 03:31:43 +00003463 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003464 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003465 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003466
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003467 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003468
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003469 if (index < 0 || index >= self->groups) {
3470 PyErr_SetString(
3471 PyExc_IndexError,
3472 "no such group"
3473 );
3474 return NULL;
3475 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003476
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003477 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003478 return _pair(self->mark[index*2], self->mark[index*2+1]);
3479}
3480
3481static PyObject*
3482match_regs(MatchObject* self)
3483{
3484 PyObject* regs;
3485 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003486 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003487
3488 regs = PyTuple_New(self->groups);
3489 if (!regs)
3490 return NULL;
3491
3492 for (index = 0; index < self->groups; index++) {
3493 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3494 if (!item) {
3495 Py_DECREF(regs);
3496 return NULL;
3497 }
3498 PyTuple_SET_ITEM(regs, index, item);
3499 }
3500
3501 Py_INCREF(regs);
3502 self->regs = regs;
3503
3504 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003505}
3506
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003507static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003508match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003509{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003510#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003511 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003512 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003513
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003514 slots = 2 * (self->pattern->groups+1);
3515
3516 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3517 if (!copy)
3518 return NULL;
3519
3520 /* this value a constant, but any compiler should be able to
3521 figure that out all by itself */
3522 offset = offsetof(MatchObject, string);
3523
3524 Py_XINCREF(self->pattern);
3525 Py_XINCREF(self->string);
3526 Py_XINCREF(self->regs);
3527
3528 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003529 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003530
3531 return (PyObject*) copy;
3532#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003533 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003534 return NULL;
3535#endif
3536}
3537
3538static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003539match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003540{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003541#ifdef USE_BUILTIN_COPY
3542 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003543
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003544 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003545 if (!copy)
3546 return NULL;
3547
3548 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3549 !deepcopy(&copy->string, memo) ||
3550 !deepcopy(&copy->regs, memo)) {
3551 Py_DECREF(copy);
3552 return NULL;
3553 }
3554
3555#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003556 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3557 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003558#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003559}
3560
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003561PyDoc_STRVAR(match_doc,
3562"The result of re.match() and re.search().\n\
3563Match objects always have a boolean value of True.");
3564
3565PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003566"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003567 Return subgroup(s) of the match by indices or names.\n\
3568 For 0 returns the entire match.");
3569
3570PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003571"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003572 Return index of the start of the substring matched by group.");
3573
3574PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003575"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003576 Return index of the end of the substring matched by group.");
3577
3578PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003579"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003580 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3581
3582PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003583"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003584 Return a tuple containing all the subgroups of the match, from 1.\n\
3585 The default argument is used for groups\n\
3586 that did not participate in the match");
3587
3588PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003589"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003590 Return a dictionary containing all the named subgroups of the match,\n\
3591 keyed by the subgroup name. The default argument is used for groups\n\
3592 that did not participate in the match");
3593
3594PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003595"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003596 Return the string obtained by doing backslash substitution\n\
3597 on the string template, as done by the sub() method.");
3598
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003599static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003600 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3601 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3602 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3603 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3604 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3605 match_groups_doc},
3606 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3607 match_groupdict_doc},
3608 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003609 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3610 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003611 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003612};
3613
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003614static PyObject *
3615match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003616{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003617 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003618 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003619 Py_INCREF(Py_None);
3620 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003621}
3622
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003623static PyObject *
3624match_lastgroup_get(MatchObject *self)
3625{
3626 if (self->pattern->indexgroup && self->lastindex >= 0) {
3627 PyObject* result = PySequence_GetItem(
3628 self->pattern->indexgroup, self->lastindex
3629 );
3630 if (result)
3631 return result;
3632 PyErr_Clear();
3633 }
3634 Py_INCREF(Py_None);
3635 return Py_None;
3636}
3637
3638static PyObject *
3639match_regs_get(MatchObject *self)
3640{
3641 if (self->regs) {
3642 Py_INCREF(self->regs);
3643 return self->regs;
3644 } else
3645 return match_regs(self);
3646}
3647
3648static PyGetSetDef match_getset[] = {
3649 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3650 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3651 {"regs", (getter)match_regs_get, (setter)NULL},
3652 {NULL}
3653};
3654
3655#define MATCH_OFF(x) offsetof(MatchObject, x)
3656static PyMemberDef match_members[] = {
3657 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3658 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3659 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3660 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3661 {NULL}
3662};
3663
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3666
Neal Norwitz57c179c2006-03-22 07:18:02 +00003667static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003668 PyVarObject_HEAD_INIT(NULL,0)
3669 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003670 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003671 (destructor)match_dealloc, /* tp_dealloc */
3672 0, /* tp_print */
3673 0, /* tp_getattr */
3674 0, /* tp_setattr */
3675 0, /* tp_reserved */
3676 0, /* tp_repr */
3677 0, /* tp_as_number */
3678 0, /* tp_as_sequence */
3679 0, /* tp_as_mapping */
3680 0, /* tp_hash */
3681 0, /* tp_call */
3682 0, /* tp_str */
3683 0, /* tp_getattro */
3684 0, /* tp_setattro */
3685 0, /* tp_as_buffer */
3686 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003687 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003688 0, /* tp_traverse */
3689 0, /* tp_clear */
3690 0, /* tp_richcompare */
3691 0, /* tp_weaklistoffset */
3692 0, /* tp_iter */
3693 0, /* tp_iternext */
3694 match_methods, /* tp_methods */
3695 match_members, /* tp_members */
3696 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003697};
3698
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003699static PyObject*
3700pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3701{
3702 /* create match object (from state object) */
3703
3704 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003705 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003706 char* base;
3707 int n;
3708
3709 if (status > 0) {
3710
3711 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003712 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003713 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3714 2*(pattern->groups+1));
3715 if (!match)
3716 return NULL;
3717
3718 Py_INCREF(pattern);
3719 match->pattern = pattern;
3720
3721 Py_INCREF(state->string);
3722 match->string = state->string;
3723
3724 match->regs = NULL;
3725 match->groups = pattern->groups+1;
3726
3727 /* fill in group slices */
3728
3729 base = (char*) state->beginning;
3730 n = state->charsize;
3731
3732 match->mark[0] = ((char*) state->start - base) / n;
3733 match->mark[1] = ((char*) state->ptr - base) / n;
3734
3735 for (i = j = 0; i < pattern->groups; i++, j+=2)
3736 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3737 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3738 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3739 } else
3740 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3741
3742 match->pos = state->pos;
3743 match->endpos = state->endpos;
3744
3745 match->lastindex = state->lastindex;
3746
3747 return (PyObject*) match;
3748
3749 } else if (status == 0) {
3750
3751 /* no match */
3752 Py_INCREF(Py_None);
3753 return Py_None;
3754
3755 }
3756
3757 /* internal error */
3758 pattern_error(status);
3759 return NULL;
3760}
3761
3762
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003763/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003764/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003765
3766static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003767scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003768{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003769 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003770 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003771 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003772}
3773
3774static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003775scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003776{
3777 SRE_STATE* state = &self->state;
3778 PyObject* match;
3779 int status;
3780
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003781 state_reset(state);
3782
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003783 state->ptr = state->start;
3784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 if (state->logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003786 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003787 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003788 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003789 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003790 if (PyErr_Occurred())
3791 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003792
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003793 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003794 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003795
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003796 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003797 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003798 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003799 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003800
3801 return match;
3802}
3803
3804
3805static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003806scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003807{
3808 SRE_STATE* state = &self->state;
3809 PyObject* match;
3810 int status;
3811
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003812 state_reset(state);
3813
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003814 state->ptr = state->start;
3815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816 if (state->logical_charsize == 1) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003817 status = sre_search(state, PatternObject_GetCode(self->pattern));
3818 } else {
3819 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3820 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003821 if (PyErr_Occurred())
3822 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003823
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003824 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003825 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003826
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003827 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003828 state->start = (void*) ((char*) state->ptr + state->charsize);
3829 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003830 state->start = state->ptr;
3831
3832 return match;
3833}
3834
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003835static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003836 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3837 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003838 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003839};
3840
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003841#define SCAN_OFF(x) offsetof(ScannerObject, x)
3842static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03003843 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003844 {NULL} /* Sentinel */
3845};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003846
Neal Norwitz57c179c2006-03-22 07:18:02 +00003847static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003848 PyVarObject_HEAD_INIT(NULL, 0)
3849 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003850 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003851 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003852 0, /* tp_print */
3853 0, /* tp_getattr */
3854 0, /* tp_setattr */
3855 0, /* tp_reserved */
3856 0, /* tp_repr */
3857 0, /* tp_as_number */
3858 0, /* tp_as_sequence */
3859 0, /* tp_as_mapping */
3860 0, /* tp_hash */
3861 0, /* tp_call */
3862 0, /* tp_str */
3863 0, /* tp_getattro */
3864 0, /* tp_setattro */
3865 0, /* tp_as_buffer */
3866 Py_TPFLAGS_DEFAULT, /* tp_flags */
3867 0, /* tp_doc */
3868 0, /* tp_traverse */
3869 0, /* tp_clear */
3870 0, /* tp_richcompare */
3871 0, /* tp_weaklistoffset */
3872 0, /* tp_iter */
3873 0, /* tp_iternext */
3874 scanner_methods, /* tp_methods */
3875 scanner_members, /* tp_members */
3876 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003877};
3878
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003879static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003880pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003881{
3882 /* create search state object */
3883
3884 ScannerObject* self;
3885
3886 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003887 Py_ssize_t start = 0;
3888 Py_ssize_t end = PY_SSIZE_T_MAX;
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003889 static char* kwlist[] = { "source", "pos", "endpos", NULL };
3890 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:scanner", kwlist,
3891 &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003892 return NULL;
3893
3894 /* create scanner object */
3895 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3896 if (!self)
3897 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003898 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003899
3900 string = state_init(&self->state, pattern, string, start, end);
3901 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003902 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003903 return NULL;
3904 }
3905
3906 Py_INCREF(pattern);
3907 self->pattern = (PyObject*) pattern;
3908
3909 return (PyObject*) self;
3910}
3911
Guido van Rossumb700df92000-03-31 14:59:30 +00003912static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003913 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003914 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003915 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003916 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003917};
3918
Martin v. Löwis1a214512008-06-11 05:26:20 +00003919static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003920 PyModuleDef_HEAD_INIT,
3921 "_" SRE_MODULE,
3922 NULL,
3923 -1,
3924 _functions,
3925 NULL,
3926 NULL,
3927 NULL,
3928 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00003929};
3930
3931PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003932{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003933 PyObject* m;
3934 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003935 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003936
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00003937 /* Patch object types */
3938 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3939 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00003940 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003941
Martin v. Löwis1a214512008-06-11 05:26:20 +00003942 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003943 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003944 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003945 d = PyModule_GetDict(m);
3946
Christian Heimes217cfd12007-12-02 14:31:20 +00003947 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003948 if (x) {
3949 PyDict_SetItemString(d, "MAGIC", x);
3950 Py_DECREF(x);
3951 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003952
Christian Heimes217cfd12007-12-02 14:31:20 +00003953 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003954 if (x) {
3955 PyDict_SetItemString(d, "CODESIZE", x);
3956 Py_DECREF(x);
3957 }
3958
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003959 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
3960 if (x) {
3961 PyDict_SetItemString(d, "MAXREPEAT", x);
3962 Py_DECREF(x);
3963 }
3964
Neal Norwitzfe537132007-08-26 03:55:15 +00003965 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003966 if (x) {
3967 PyDict_SetItemString(d, "copyright", x);
3968 Py_DECREF(x);
3969 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003970 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003971}
3972
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003973#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003974
3975/* vim:ts=4:sw=4:et
3976*/