/* blob: 1c76d24dbd31a09518a5c71d73af8f448a07438c [file] [log] [blame] */
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl created (based on existing template matcher code)
 * 2000-03-06 fl first alpha, sort of
 * 2000-08-01 fl fixes for 1.6b1
 * 2000-08-07 fl use PyOS_CheckStack() if available
 * 2000-09-20 fl added expand method
 * 2001-03-20 fl lots of fixes for 2.1b2
 * 2001-04-15 fl export copyright as Python attribute, not global
 * 2001-04-28 fl added __copy__ methods (work in progress)
 * 2001-05-14 fl fixes for 1.5.2 compatibility
 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl added sub/subn primitive
 * 2001-10-24 fl added finditer primitive (for 2.2 only)
 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 * 2003-10-17 gn implemented non recursive scheme
 *
 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license. For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
36
37#ifndef SRE_RECURSIVE
38
/* Copyright banner for the engine.  The history note in the file header
   says it is exported as a Python attribute rather than a C global; it is
   never modified, so it is declared const. */
static const char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
/* name of this module, minus the leading underscore; the build may
   predefine SRE_MODULE to override the default */
#ifndef SRE_MODULE
#define SRE_MODULE "sre"
#endif

/* NOTE(review): presumably the name of the Python-level front-end module
   -- confirm against the places where SRE_PY_MODULE is used */
#define SRE_PY_MODULE "re"
57
/* defining this one enables tracing (see the TRACE macro); it is
   explicitly undefined here, so tracing is off */
#undef VERBOSE

/* defining this enables unicode support (default under 1.6a1 and later) */
#define HAVE_UNICODE

/* -------------------------------------------------------------------- */
/* optional features */

/* enables fast searching */
#define USE_FAST_SEARCH

/* enables copy/deepcopy handling (work in progress); explicitly
   undefined here, so the feature is off */
#undef USE_BUILTIN_COPY

/* when PY_VERSION_HEX is below 0x01060000, map PyObject_DEL onto
   PyMem_DEL */
#if PY_VERSION_HEX < 0x01060000
#define PyObject_DEL(op) PyMem_DEL((op))
#endif
76
/* -------------------------------------------------------------------- */

/* LOCAL(type) introduces a file-local function returning `type`.
   Every variant expands to a `static` function; the MSVC and USE_INLINE
   variants additionally request inlining, and the MSVC variant selects
   the __fastcall calling convention. */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif
89
/* error codes: all negative.  The values are parenthesized so the
   macros stay a single operand wherever they are expanded. */
#define SRE_ERROR_ILLEGAL (-1) /* illegal opcode */
#define SRE_ERROR_STATE (-2) /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT (-3) /* runaway recursion */
#define SRE_ERROR_MEMORY (-9) /* out of memory */
#define SRE_ERROR_INTERRUPTED (-10) /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000096
/* TRACE((fmt, args...)) -- note the double parentheses: the single macro
   argument is a complete printf argument list.  With VERBOSE defined it
   expands to a printf call; otherwise it expands to nothing. */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
102
/* -------------------------------------------------------------------- */
/* search engine state */

/* default character predicates (run sre_chars.py to regenerate tables) */

/* bit flags stored per character in sre_char_info[] */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* Per-character class flags for codes 0..127 (16 entries per row).
   Read-only lookup data, hence const. */
static const char sre_char_info[128] = {
    /*   0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /*  16 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  32 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  48 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /*  64 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /*  96 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};

/* Lower-case mapping for codes 0..127: 65..90 ('A'..'Z') map to 97..122,
   every other code maps to itself. */
static const char sre_char_lower[128] = {
    /*   0 */   0,   1,   2,   3,   4,   5,   6,   7,
                8,   9,  10,  11,  12,  13,  14,  15,
    /*  16 */  16,  17,  18,  19,  20,  21,  22,  23,
               24,  25,  26,  27,  28,  29,  30,  31,
    /*  32 */  32,  33,  34,  35,  36,  37,  38,  39,
               40,  41,  42,  43,  44,  45,  46,  47,
    /*  48 */  48,  49,  50,  51,  52,  53,  54,  55,
               56,  57,  58,  59,  60,  61,  62,  63,
    /*  64 */  64,  97,  98,  99, 100, 101, 102, 103,
              104, 105, 106, 107, 108, 109, 110, 111,
    /*  80 */ 112, 113, 114, 115, 116, 117, 118, 119,
              120, 121, 122,  91,  92,  93,  94,  95,
    /*  96 */  96,  97,  98,  99, 100, 101, 102, 103,
              104, 105, 106, 107, 108, 109, 110, 111,
    /* 112 */ 112, 113, 114, 115, 116, 117, 118, 119,
              120, 121, 122, 123, 124, 125, 126, 127
};

/* Each predicate yields the (non-zero) mask bit when the class applies and
   0 otherwise; codes >= 128 never match.  Only the upper bound is checked
   before indexing, so `ch` must not be negative.  `ch` is evaluated more
   than once. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000144
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000145static unsigned int sre_lower(unsigned int ch)
146{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000147 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000148}
149
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
/* Codes with any bit above the low 8 set never match; the rest are
   classified by the <ctype.h> functions.  `ch` is evaluated more than
   once. */
#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
/* a word character is an alphanumeric or the underscore; yields 0 or 1 */
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
158
/* Locale-aware lower-casing: codes below 256 go through tolower(),
   all other codes are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
}
163
/* unicode-specific character predicates */

/* thin aliases for the Py_UNICODE_* classification macros */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
/* a word character is an alphanumeric or the underscore; yields 0 or 1 */
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000171
/* Unicode lower-casing: delegates to Py_UNICODE_TOLOWER for every code. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER(ch);
}
176
Guido van Rossumb700df92000-03-31 14:59:30 +0000177LOCAL(int)
178sre_category(SRE_CODE category, unsigned int ch)
179{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000180 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000181
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000182 case SRE_CATEGORY_DIGIT:
183 return SRE_IS_DIGIT(ch);
184 case SRE_CATEGORY_NOT_DIGIT:
185 return !SRE_IS_DIGIT(ch);
186 case SRE_CATEGORY_SPACE:
187 return SRE_IS_SPACE(ch);
188 case SRE_CATEGORY_NOT_SPACE:
189 return !SRE_IS_SPACE(ch);
190 case SRE_CATEGORY_WORD:
191 return SRE_IS_WORD(ch);
192 case SRE_CATEGORY_NOT_WORD:
193 return !SRE_IS_WORD(ch);
194 case SRE_CATEGORY_LINEBREAK:
195 return SRE_IS_LINEBREAK(ch);
196 case SRE_CATEGORY_NOT_LINEBREAK:
197 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000198
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000199 case SRE_CATEGORY_LOC_WORD:
200 return SRE_LOC_IS_WORD(ch);
201 case SRE_CATEGORY_LOC_NOT_WORD:
202 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000203
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000204 case SRE_CATEGORY_UNI_DIGIT:
205 return SRE_UNI_IS_DIGIT(ch);
206 case SRE_CATEGORY_UNI_NOT_DIGIT:
207 return !SRE_UNI_IS_DIGIT(ch);
208 case SRE_CATEGORY_UNI_SPACE:
209 return SRE_UNI_IS_SPACE(ch);
210 case SRE_CATEGORY_UNI_NOT_SPACE:
211 return !SRE_UNI_IS_SPACE(ch);
212 case SRE_CATEGORY_UNI_WORD:
213 return SRE_UNI_IS_WORD(ch);
214 case SRE_CATEGORY_UNI_NOT_WORD:
215 return !SRE_UNI_IS_WORD(ch);
216 case SRE_CATEGORY_UNI_LINEBREAK:
217 return SRE_UNI_IS_LINEBREAK(ch);
218 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
219 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000220 }
221 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000222}
223
224/* helpers */
225
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000226static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000227data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000228{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000229 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000231 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000232 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000233 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000234}
235
236static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000237data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000238{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000239 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000240 minsize = state->data_stack_base+size;
241 cursize = state->data_stack_size;
242 if (cursize < minsize) {
243 void* stack;
244 cursize = minsize+minsize/4+1024;
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300245 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000246 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000247 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000248 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000249 return SRE_ERROR_MEMORY;
250 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000251 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000253 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000254 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000255}
256
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000257/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000258
259#define SRE_CHAR unsigned char
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200260#define SRE_CHARGET(state, buf, index) ((unsigned char*)buf)[index]
Guido van Rossumb700df92000-03-31 14:59:30 +0000261#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000262#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000263#define SRE_CHARSET sre_charset
264#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000265#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000266#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000267#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000268
Guido van Rossumb700df92000-03-31 14:59:30 +0000269#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000270#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000271#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000272
Guido van Rossumb700df92000-03-31 14:59:30 +0000273#undef SRE_SEARCH
274#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000275#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000276#undef SRE_INFO
277#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000278#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000279#undef SRE_AT
280#undef SRE_CHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281#undef SRE_CHARGET
Guido van Rossumb700df92000-03-31 14:59:30 +0000282
/* generate 8/16/32-bit unicode version */

/* Bind the SRE_* names to the unicode implementations.  The character
   type is opaque (void); the width is chosen at run time from
   state->charsize. */
#define SRE_CHAR void
/* read the character at `index` (in characters) relative to `buf`,
   interpreting the buffer as Py_UCS1, Py_UCS2 or Py_UCS4 according to
   state->charsize (1, 2, or anything else respectively) */
#define SRE_CHARGET(state, buf, index) \
    ((state->charsize==1) ? ((Py_UCS1*)buf)[index] : \
     (state->charsize==2) ? ((Py_UCS2*)buf)[index] : \
     ((Py_UCS4*)buf)[index])
#define SRE_AT sre_uat
#define SRE_COUNT sre_ucount
#define SRE_CHARSET sre_ucharset
#define SRE_INFO sre_uinfo
#define SRE_MATCH sre_umatch
#define SRE_MATCH_CONTEXT sre_umatch_context
#define SRE_SEARCH sre_usearch
297
298#endif /* SRE_RECURSIVE */
299
300/* -------------------------------------------------------------------- */
301/* String matching engine */
302
303/* the following section is compiled twice, with different character
304 settings */
305
306LOCAL(int)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200307SRE_AT(SRE_STATE* state, char* ptr, SRE_CODE at)
Guido van Rossumb700df92000-03-31 14:59:30 +0000308{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000309 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000310
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000311 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000312
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000313 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000315 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000316 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000317 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000319 case SRE_AT_BEGINNING_LINE:
320 return ((void*) ptr == state->beginning ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, -1)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000322
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000323 case SRE_AT_END:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200324 return (((void*) (ptr+state->charsize) == state->end &&
325 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))) ||
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000326 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000327
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000328 case SRE_AT_END_LINE:
329 return ((void*) ptr == state->end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000331
Fredrik Lundh770617b2001-01-14 15:06:11 +0000332 case SRE_AT_END_STRING:
333 return ((void*) ptr == state->end);
334
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000335 case SRE_AT_BOUNDARY:
336 if (state->beginning == state->end)
337 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000338 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200339 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000340 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200341 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 case SRE_AT_NON_BOUNDARY:
345 if (state->beginning == state->end)
346 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000347 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200348 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000349 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000351 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000352
353 case SRE_AT_LOC_BOUNDARY:
354 if (state->beginning == state->end)
355 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200357 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200359 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000361
362 case SRE_AT_LOC_NON_BOUNDARY:
363 if (state->beginning == state->end)
364 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200368 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000369 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000370
371 case SRE_AT_UNI_BOUNDARY:
372 if (state->beginning == state->end)
373 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200375 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200377 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000378 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000379
380 case SRE_AT_UNI_NON_BOUNDARY:
381 if (state->beginning == state->end)
382 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200386 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000387 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000388
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000389 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000390
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000391 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000392}
393
394LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000395SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000396{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000397 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000398
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000399 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000400
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000401 for (;;) {
402 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000403
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000404 case SRE_OP_FAILURE:
405 return !ok;
406
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000407 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000408 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000409 if (ch == set[0])
410 return ok;
411 set++;
412 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000413
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000414 case SRE_OP_CATEGORY:
415 /* <CATEGORY> <code> */
416 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000417 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000418 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000419 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000420
Fredrik Lundh3562f112000-07-02 12:00:07 +0000421 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000422 if (sizeof(SRE_CODE) == 2) {
423 /* <CHARSET> <bitmap> (16 bits per code word) */
424 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
425 return ok;
426 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000427 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000428 else {
429 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith90555d02012-12-10 17:44:44 -0800430 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000431 return ok;
432 set += 8;
433 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000434 break;
435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000436 case SRE_OP_RANGE:
437 /* <RANGE> <lower> <upper> */
438 if (set[0] <= ch && ch <= set[1])
439 return ok;
440 set += 2;
441 break;
442
443 case SRE_OP_NEGATE:
444 ok = !ok;
445 break;
446
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000447 case SRE_OP_BIGCHARSET:
448 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
449 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000450 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000451 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000452
453 if (sizeof(SRE_CODE) == 2) {
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300454 block = ((unsigned char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000455 set += 128;
456 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
457 return ok;
458 set += count*16;
459 }
460 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000461 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
462 * warnings when c's type supports only numbers < N+1 */
463 if (!(ch & ~65535))
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300464 block = ((unsigned char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000465 else
466 block = -1;
467 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000468 if (block >=0 &&
Gregory P. Smith90555d02012-12-10 17:44:44 -0800469 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000470 return ok;
471 set += count*8;
472 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000473 break;
474 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000475
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000476 default:
477 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000478 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000479 return 0;
480 }
481 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000482}
483
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000484LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000485
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000486LOCAL(Py_ssize_t)
487SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000488{
489 SRE_CODE chr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200490 char* ptr = (char *)state->ptr;
491 char* end = (char *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000492 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000493
494 /* adjust end */
Serhiy Storchakaa0eb8092013-02-16 16:54:33 +0200495 if (maxcount < (end - ptr) / state->charsize && maxcount != SRE_MAXREPEAT)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200496 end = ptr + maxcount*state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000497
498 switch (pattern[0]) {
499
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000500 case SRE_OP_IN:
501 /* repeated set */
502 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
Victor Stinner63ab8752011-11-22 03:31:20 +0100503 while (ptr < end &&
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200504 SRE_CHARSET(pattern + 2, SRE_CHARGET(state, ptr, 0)))
505 ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000506 break;
507
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000508 case SRE_OP_ANY:
509 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000510 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200511 while (ptr < end && !SRE_IS_LINEBREAK(SRE_CHARGET(state, ptr, 0)))
512 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513 break;
514
515 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000516 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000517 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000518 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000519 ptr = end;
520 break;
521
522 case SRE_OP_LITERAL:
523 /* repeated literal */
524 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000525 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200526 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) == chr)
527 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000528 break;
529
530 case SRE_OP_LITERAL_IGNORE:
531 /* repeated literal */
532 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000533 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) == chr)
535 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000536 break;
537
538 case SRE_OP_NOT_LITERAL:
539 /* repeated non-literal */
540 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000541 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
543 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000544 break;
Tim Peters3d563502006-01-21 02:47:53 +0000545
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000546 case SRE_OP_NOT_LITERAL_IGNORE:
547 /* repeated non-literal */
548 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) != chr)
551 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000552 break;
553
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000554 default:
555 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000556 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557 while ((char*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000558 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000559 if (i < 0)
560 return i;
561 if (!i)
562 break;
563 }
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300564 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565 ((char*)state->ptr - ptr)/state->charsize));
566 return ((char*)state->ptr - ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000567 }
568
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300569 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
570 (ptr - (char*) state->ptr)/state->charsize));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200571 return (ptr - (char*) state->ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000572}
573
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* Test the SRE_OP_INFO block at `pattern` against the text at
       state->ptr.  Yields the number of SRE_CODE objects to skip when
       the block is satisfied, or 0 when it rules out a match. */

    char* cursor = state->ptr;
    char* limit = state->end;
    Py_ssize_t k;

    /* minimal length: a zero pattern[3] disables the check */
    if (pattern[3]) {
        if ((limit - cursor)/state->charsize < pattern[3])
            return 0;
    }

    /* the known prefix is only compared when it is flagged and longer
       than one character */
    if (!((pattern[2] & SRE_INFO_PREFIX) && pattern[5] > 1))
        return pattern[0];

    /* <length> <skip> <prefix data> <overlap data> */
    k = 0;
    while (k < pattern[5]) {
        if ((SRE_CODE) SRE_CHARGET(state, cursor, k) != pattern[7 + k])
            return 0;
        k++;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000601
/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (IOW, those
 * that will backtrack). The possible cases are:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (besides atypical cases like ASSERT_NOT), therefore there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: If the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark points.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 */
/* Copy state->lastmark and state->lastindex into the current match
   context.  Relies on locals named `state` and `ctx` at the point of use. */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastindex = state->lastindex; \
        ctx->lastmark = state->lastmark; \
    } while (0)
/* Write the values stored by LASTMARK_SAVE() back into state. */
#define LASTMARK_RESTORE() \
    do { \
        state->lastindex = ctx->lastindex; \
        state->lastmark = ctx->lastmark; \
    } while (0)
640
/* Exit helpers for the matcher.
 *
 * RETURN_ERROR(i)   returns `i` from the enclosing function at once.
 * RETURN_FAILURE    stores 0 in the local `ret` and jumps to the local
 *                   label `exit`.
 * RETURN_SUCCESS    stores 1 in the local `ret` and jumps to `exit`.
 *
 * The RETURN_ON_* forms test a result code `i`: negative values are
 * propagated with RETURN_ERROR, and a positive (SUCCESS) or zero (FAILURE)
 * value triggers the corresponding exit.  The argument is parenthesized so
 * that expressions such as `flags & 2` are compared as a whole; it is still
 * expanded more than once, so it must be free of side effects.
 */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
651
/* Turn a macro argument into a string literal (used for type names in
   TRACE output). */
#define SFY(x) #x

/* Reserve sizeof(type) bytes at the top of the state's data stack and
 * point `ptr` at them.
 *
 * Uses locals of the expanding function: `alloc_pos` receives the byte
 * offset of the new object, and `ctx` / `ctx_pos` identify the current
 * match context.  When the stack is too small, data_stack_grow() is
 * called; a negative result is returned from the enclosing function.
 * After a successful grow, `ctx` is looked up again from `ctx_pos`
 * (presumably because growing may move the stack storage).
 */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           SFY(type), alloc_pos, sizeof(type))); \
    if (sizeof(type) > (state)->data_stack_size - alloc_pos) { \
        int grow_status = data_stack_grow((state), sizeof(type)); \
        if (grow_status < 0) return grow_status; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    (ptr) = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)
669
/* Point `ptr` at the object of the given `type` located `pos` bytes into
   the state's data stack.  Nothing is allocated or copied.  `pos` and
   `state` are parenthesized so any expression may be passed. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
    (ptr) = (type*)((state)->data_stack+(pos)); \
} while (0)
675
/* Copy `size` bytes from `data` onto the top of the state's data stack.
 *
 * When fewer than `size` bytes are free, data_stack_grow() is called; a
 * negative result is returned from the enclosing function.  After a
 * successful grow, the local `ctx` is looked up again from `ctx_pos`
 * (presumably because growing may move the stack storage).
 *
 * `size` is expanded several times and must be free of side effects.
 */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           data, (state)->data_stack_base, size)); \
    if ((size) > (state)->data_stack_size - (state)->data_stack_base) { \
        int grow_status = data_stack_grow((state), (size)); \
        if (grow_status < 0) return grow_status; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)
690
/* Copy the topmost `size` bytes of the state's data stack into `data`.
   The bytes are removed from the stack only when `discard` is non-zero;
   otherwise the stack is left unchanged.  `size` is parenthesized so a
   sum such as `a + b` addresses the right bytes; it is expanded several
   times and must be free of side effects. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           data, (state)->data_stack_base-(size), size)); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)

/* Drop the topmost `size` bytes of the state's data stack without
   copying them anywhere. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
           "(%" PY_FORMAT_SIZE_T "d)\n", \
           (state)->data_stack_base-(size), size)); \
    (state)->data_stack_base -= (size); \
} while(0)
708
/* Shorthand forms of the DATA_STACK_* macros that always operate on the
   local `state`.  The forms taking `x` expect a pointer and transfer
   sizeof(*(x)) bytes; DATA_POP always discards what it copies out. */
#define DATA_PUSH(x)            DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x)             DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x)     DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p)         DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) DATA_STACK_LOOKUP_AT(state,t,p,pos)
719
/* Save / restore the first (lastmark+1) entries of state->mark on the
 * data stack.  All four macros do nothing unless `lastmark` is positive.
 *
 * MARK_PUSH          pushes the marks; it overwrites the local `i` with
 *                    `lastmark` before pushing (see the inline comment).
 * MARK_POP           copies the marks back and removes them from the stack.
 * MARK_POP_KEEP      copies the marks back but leaves them on the stack.
 * MARK_POP_DISCARD   removes them from the stack without copying.
 *
 * `lastmark` is parenthesized so that any expression may be passed; it is
 * expanded more than once and must be free of side effects.
 */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
737
/* Tags stored in SRE_MATCH_CONTEXT.jump.  JUMP_NONE is assigned to the
   first context created by SRE_MATCH(); every other value is passed to
   DO_JUMP() together with the label of the same name. */
#define JUMP_NONE            0

/* MAX_UNTIL / MIN_UNTIL */
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6

/* REPEAT, REPEAT_ONE, MIN_REPEAT_ONE */
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10

/* BRANCH, ASSERT, ASSERT_NOT */
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
752
/* Start matching `nextpattern` in a fresh context without a C-level call.
 *
 * A new SRE_MATCH_CONTEXT is allocated on the data stack; it records the
 * position of the current context (last_ctx_pos), the tag `jumpvalue`
 * and the pattern to run.  It then becomes the current context and
 * control transfers to the local label `entrance`.  `jumplabel` is
 * defined immediately after that goto; presumably the matcher's exit
 * path jumps back to it according to the stored tag (that code is not
 * part of this macro).
 *
 * Uses the locals `ctx`, `nextctx`, `ctx_pos` and `alloc_pos`.  The
 * expansion is a sequence of statements, not a single one, so it must not
 * be used as the unbraced body of an `if` or loop.  The definition ends
 * without a line continuation so the following source line can never be
 * absorbed into the macro.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = (jumpvalue); \
    nextctx->pattern = (nextpattern); \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
763
764typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000765 Py_ssize_t last_ctx_pos;
766 Py_ssize_t jump;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200767 char* ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000768 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000769 Py_ssize_t count;
770 Py_ssize_t lastmark;
771 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000772 union {
773 SRE_CODE chr;
774 SRE_REPEAT* rep;
775 } u;
776} SRE_MATCH_CONTEXT;
777
778/* check if string matches the given pattern. returns <0 for
779 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000780LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000781SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200783 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000784 Py_ssize_t alloc_pos, ctx_pos = -1;
785 Py_ssize_t i, ret = 0;
786 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000787 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000788
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000789 SRE_MATCH_CONTEXT* ctx;
790 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000791
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000792 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000793
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000794 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
795 ctx->last_ctx_pos = -1;
796 ctx->jump = JUMP_NONE;
797 ctx->pattern = pattern;
798 ctx_pos = alloc_pos;
799
800entrance:
801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 ctx->ptr = (char *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000803
804 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000805 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000806 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807 if (ctx->pattern[3] && (end - ctx->ptr)/state->charsize < ctx->pattern[3]) {
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300808 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
809 "need %" PY_FORMAT_SIZE_T "d)\n",
810 (end - ctx->ptr)/state->charsize,
811 (Py_ssize_t) ctx->pattern[3]));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000812 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000813 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000814 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000815 }
816
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000817 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000818 ++sigcount;
819 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
820 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000821
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000822 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000823
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000824 case SRE_OP_MARK:
825 /* set mark */
826 /* <MARK> <gid> */
827 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
828 ctx->ptr, ctx->pattern[0]));
829 i = ctx->pattern[0];
830 if (i & 1)
831 state->lastindex = i/2 + 1;
832 if (i > state->lastmark) {
833 /* state->lastmark is the highest valid index in the
834 state->mark array. If it is increased by more than 1,
835 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000836 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000837 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000838 while (j < i)
839 state->mark[j++] = NULL;
840 state->lastmark = i;
841 }
842 state->mark[i] = ctx->ptr;
843 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000844 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000845
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000846 case SRE_OP_LITERAL:
847 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000848 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000849 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
850 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200851 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000852 RETURN_FAILURE;
853 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000855 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000856
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000857 case SRE_OP_NOT_LITERAL:
858 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000859 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000860 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
861 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200862 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) == ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000863 RETURN_FAILURE;
864 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000866 break;
867
868 case SRE_OP_SUCCESS:
869 /* end of pattern */
870 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
871 state->ptr = ctx->ptr;
872 RETURN_SUCCESS;
873
874 case SRE_OP_AT:
875 /* match at given position */
876 /* <AT> <code> */
877 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
878 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
879 RETURN_FAILURE;
880 ctx->pattern++;
881 break;
882
883 case SRE_OP_CATEGORY:
884 /* match at given category */
885 /* <CATEGORY> <code> */
886 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
887 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], SRE_CHARGET(state, ctx->ptr, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000889 RETURN_FAILURE;
890 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200891 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000892 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000893
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000894 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000895 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000896 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000897 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 if (ctx->ptr >= end || SRE_IS_LINEBREAK(SRE_CHARGET(state, ctx->ptr, 0)))
899 RETURN_FAILURE;
900 ctx->ptr += state->charsize;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000901 break;
902
903 case SRE_OP_ANY_ALL:
904 /* match anything */
905 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000906 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
907 if (ctx->ptr >= end)
908 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200909 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000910 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000911
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000912 case SRE_OP_IN:
913 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000914 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000915 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200916 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, SRE_CHARGET(state, ctx->ptr, 0)))
917 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000918 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200919 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000920 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000921
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000922 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000923 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
924 ctx->pattern, ctx->ptr, ctx->pattern[0]));
925 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200926 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000927 RETURN_FAILURE;
928 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200929 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000930 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000931
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000932 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000933 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
934 ctx->pattern, ctx->ptr, *ctx->pattern));
935 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) == state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000937 RETURN_FAILURE;
938 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200939 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000940 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000941
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000942 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000943 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
944 if (ctx->ptr >= end
945 || !SRE_CHARSET(ctx->pattern+1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200946 (SRE_CODE)state->lower(SRE_CHARGET(state, ctx->ptr, 0))))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000947 RETURN_FAILURE;
948 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000950 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000951
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000952 case SRE_OP_JUMP:
953 case SRE_OP_INFO:
954 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000955 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000956 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
957 ctx->ptr, ctx->pattern[0]));
958 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000959 break;
960
961 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000962 /* alternation */
963 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000964 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000965 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000966 ctx->u.rep = state->repeat;
967 if (ctx->u.rep)
968 MARK_PUSH(ctx->lastmark);
969 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
970 if (ctx->pattern[1] == SRE_OP_LITERAL &&
971 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000973 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000974 if (ctx->pattern[1] == SRE_OP_IN &&
975 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0))))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000977 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000978 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000979 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000980 if (ret) {
981 if (ctx->u.rep)
982 MARK_POP_DISCARD(ctx->lastmark);
983 RETURN_ON_ERROR(ret);
984 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000985 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000986 if (ctx->u.rep)
987 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000988 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000989 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000990 if (ctx->u.rep)
991 MARK_POP_DISCARD(ctx->lastmark);
992 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +0000993
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000994 case SRE_OP_REPEAT_ONE:
995 /* match repeated sequence (maximizing regexp) */
996
997 /* this operator only works if the repeated item is
998 exactly one character wide, and we're not already
999 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001000 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001001
1002 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1003
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001004 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1005 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001006
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001007 if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001008 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001009
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001010 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001011
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001012 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1013 RETURN_ON_ERROR(ret);
1014 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1015 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001016 ctx->ptr += state->charsize * ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001017
1018 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001020 string. check if the rest of the pattern matches,
1021 and backtrack if not. */
1022
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001023 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001025
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001026 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001027 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001028 state->ptr = ctx->ptr;
1029 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001030 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001032 LASTMARK_SAVE();
1033
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035 /* tail starts with a literal. skip positions where
1036 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001037 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001038 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001039 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Victor Stinner63ab8752011-11-22 03:31:20 +01001040 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 SRE_CHARGET(state, ctx->ptr, 0) != ctx->u.chr)) {
1042 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001045 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001046 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001047 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001048 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1049 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001050 if (ret) {
1051 RETURN_ON_ERROR(ret);
1052 RETURN_SUCCESS;
1053 }
Tim Peters3d563502006-01-21 02:47:53 +00001054
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001055 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001058 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001059 }
1060
1061 } else {
1062 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001063 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001065 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1066 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001067 if (ret) {
1068 RETURN_ON_ERROR(ret);
1069 RETURN_SUCCESS;
1070 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001071 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001072 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001073 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001074 }
1075 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001076 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001077
Guido van Rossum41c99e72003-04-14 17:59:34 +00001078 case SRE_OP_MIN_REPEAT_ONE:
1079 /* match repeated sequence (minimizing regexp) */
1080
1081 /* this operator only works if the repeated item is
1082 exactly one character wide, and we're not already
1083 collecting backtracking points. for other cases,
1084 use the MIN_REPEAT operator */
1085
1086 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1087
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001088 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1089 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001090
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001091 if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001092 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001093
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001094 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001095
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001096 if (ctx->pattern[1] == 0)
1097 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001098 else {
1099 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001100 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1101 RETURN_ON_ERROR(ret);
1102 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001103 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001104 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001105 RETURN_FAILURE;
1106 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001107 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001108 ctx->ptr += state->charsize * ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109 }
1110
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001111 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001112 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001113 state->ptr = ctx->ptr;
1114 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001115
1116 } else {
1117 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001118 LASTMARK_SAVE();
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001119 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001120 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001121 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001122 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1123 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 if (ret) {
1125 RETURN_ON_ERROR(ret);
1126 RETURN_SUCCESS;
1127 }
1128 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001129 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001130 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001131 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001132 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001133 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001134 assert(ret == 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001137 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001138 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001139 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001141
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001142 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001143 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001144 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001145 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1147 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001148
1149 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001150 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001151 if (!ctx->u.rep) {
1152 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001153 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001154 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001155 ctx->u.rep->count = -1;
1156 ctx->u.rep->pattern = ctx->pattern;
1157 ctx->u.rep->prev = state->repeat;
1158 ctx->u.rep->last_ptr = NULL;
1159 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001160
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001161 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001162 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001163 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001164 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001165
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001166 if (ret) {
1167 RETURN_ON_ERROR(ret);
1168 RETURN_SUCCESS;
1169 }
1170 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001171
1172 case SRE_OP_MAX_UNTIL:
1173 /* maximizing repeat */
1174 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1175
1176 /* FIXME: we probably need to deal with zero-width
1177 matches in here... */
1178
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001179 ctx->u.rep = state->repeat;
1180 if (!ctx->u.rep)
1181 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001182
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001183 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001184
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001185 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001186
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001187 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001188 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001189
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001190 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001191 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001192 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001193 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1194 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 if (ret) {
1196 RETURN_ON_ERROR(ret);
1197 RETURN_SUCCESS;
1198 }
1199 ctx->u.rep->count = ctx->count-1;
1200 state->ptr = ctx->ptr;
1201 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001202 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001203
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001204 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001205 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001207 /* we may have enough matches, but if we can
1208 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001210 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 MARK_PUSH(ctx->lastmark);
1212 /* zero-width match protection */
1213 DATA_PUSH(&ctx->u.rep->last_ptr);
1214 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001215 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1216 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001217 DATA_POP(&ctx->u.rep->last_ptr);
1218 if (ret) {
1219 MARK_POP_DISCARD(ctx->lastmark);
1220 RETURN_ON_ERROR(ret);
1221 RETURN_SUCCESS;
1222 }
1223 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001224 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 ctx->u.rep->count = ctx->count-1;
1226 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001227 }
1228
1229 /* cannot match more repeated items here. make sure the
1230 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001231 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001232 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001233 RETURN_ON_SUCCESS(ret);
1234 state->repeat = ctx->u.rep;
1235 state->ptr = ctx->ptr;
1236 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001237
1238 case SRE_OP_MIN_UNTIL:
1239 /* minimizing repeat */
1240 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1241
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001242 ctx->u.rep = state->repeat;
1243 if (!ctx->u.rep)
1244 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001245
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001246 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001247
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001249
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001250 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001251 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001252
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001253 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001254 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001255 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001256 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1257 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001258 if (ret) {
1259 RETURN_ON_ERROR(ret);
1260 RETURN_SUCCESS;
1261 }
1262 ctx->u.rep->count = ctx->count-1;
1263 state->ptr = ctx->ptr;
1264 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001265 }
1266
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001267 LASTMARK_SAVE();
1268
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001269 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001270 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001271 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001272 if (ret) {
1273 RETURN_ON_ERROR(ret);
1274 RETURN_SUCCESS;
1275 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001276
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001277 state->repeat = ctx->u.rep;
1278 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001279
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001280 LASTMARK_RESTORE();
1281
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001282 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
Serhiy Storchakafa468162013-02-16 21:23:53 +02001283 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1284 state->ptr == ctx->u.rep->last_ptr)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001285 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001286
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 ctx->u.rep->count = ctx->count;
Serhiy Storchakafa468162013-02-16 21:23:53 +02001288 /* zero-width match protection */
1289 DATA_PUSH(&ctx->u.rep->last_ptr);
1290 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001291 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1292 ctx->u.rep->pattern+3);
Serhiy Storchakafa468162013-02-16 21:23:53 +02001293 DATA_POP(&ctx->u.rep->last_ptr);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001294 if (ret) {
1295 RETURN_ON_ERROR(ret);
1296 RETURN_SUCCESS;
1297 }
1298 ctx->u.rep->count = ctx->count-1;
1299 state->ptr = ctx->ptr;
1300 RETURN_FAILURE;
1301
1302 case SRE_OP_GROUPREF:
1303 /* match backreference */
1304 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1305 ctx->ptr, ctx->pattern[0]));
1306 i = ctx->pattern[0];
1307 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001308 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001309 if (groupref >= state->lastmark) {
1310 RETURN_FAILURE;
1311 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 char* p = (char*) state->mark[groupref];
1313 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001314 if (!p || !e || e < p)
1315 RETURN_FAILURE;
1316 while (p < e) {
Victor Stinner63ab8752011-11-22 03:31:20 +01001317 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 SRE_CHARGET(state, ctx->ptr, 0) != SRE_CHARGET(state, p, 0))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001319 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 p += state->charsize;
1321 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001322 }
1323 }
1324 }
1325 ctx->pattern++;
1326 break;
1327
1328 case SRE_OP_GROUPREF_IGNORE:
1329 /* match backreference */
1330 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1331 ctx->ptr, ctx->pattern[0]));
1332 i = ctx->pattern[0];
1333 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001334 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001335 if (groupref >= state->lastmark) {
1336 RETURN_FAILURE;
1337 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 char* p = (char*) state->mark[groupref];
1339 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001340 if (!p || !e || e < p)
1341 RETURN_FAILURE;
1342 while (p < e) {
1343 if (ctx->ptr >= end ||
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001344 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) !=
1345 state->lower(SRE_CHARGET(state, p, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001346 RETURN_FAILURE;
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001347 p += state->charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001349 }
1350 }
1351 }
1352 ctx->pattern++;
1353 break;
1354
1355 case SRE_OP_GROUPREF_EXISTS:
1356 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1357 ctx->ptr, ctx->pattern[0]));
1358 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1359 i = ctx->pattern[0];
1360 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001361 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001362 if (groupref >= state->lastmark) {
1363 ctx->pattern += ctx->pattern[1];
1364 break;
1365 } else {
1366 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1367 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1368 if (!p || !e || e < p) {
1369 ctx->pattern += ctx->pattern[1];
1370 break;
1371 }
1372 }
1373 }
1374 ctx->pattern += 2;
1375 break;
1376
1377 case SRE_OP_ASSERT:
1378 /* assert subpattern */
1379 /* <ASSERT> <skip> <back> <pattern> */
1380 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1381 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001383 if (state->ptr < state->beginning)
1384 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001385 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001386 RETURN_ON_FAILURE(ret);
1387 ctx->pattern += ctx->pattern[0];
1388 break;
1389
1390 case SRE_OP_ASSERT_NOT:
1391 /* assert not subpattern */
1392 /* <ASSERT_NOT> <skip> <back> <pattern> */
1393 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1394 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001396 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001397 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001398 if (ret) {
1399 RETURN_ON_ERROR(ret);
1400 RETURN_FAILURE;
1401 }
1402 }
1403 ctx->pattern += ctx->pattern[0];
1404 break;
1405
1406 case SRE_OP_FAILURE:
1407 /* immediate failure */
1408 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1409 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001410
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001411 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001412 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1413 ctx->pattern[-1]));
1414 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001415 }
1416 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001417
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001418exit:
1419 ctx_pos = ctx->last_ctx_pos;
1420 jump = ctx->jump;
1421 DATA_POP_DISCARD(ctx);
1422 if (ctx_pos == -1)
1423 return ret;
1424 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1425
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001426 switch (jump) {
1427 case JUMP_MAX_UNTIL_2:
1428 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1429 goto jump_max_until_2;
1430 case JUMP_MAX_UNTIL_3:
1431 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1432 goto jump_max_until_3;
1433 case JUMP_MIN_UNTIL_2:
1434 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1435 goto jump_min_until_2;
1436 case JUMP_MIN_UNTIL_3:
1437 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1438 goto jump_min_until_3;
1439 case JUMP_BRANCH:
1440 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1441 goto jump_branch;
1442 case JUMP_MAX_UNTIL_1:
1443 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1444 goto jump_max_until_1;
1445 case JUMP_MIN_UNTIL_1:
1446 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1447 goto jump_min_until_1;
1448 case JUMP_REPEAT:
1449 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1450 goto jump_repeat;
1451 case JUMP_REPEAT_ONE_1:
1452 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1453 goto jump_repeat_one_1;
1454 case JUMP_REPEAT_ONE_2:
1455 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1456 goto jump_repeat_one_2;
1457 case JUMP_MIN_REPEAT_ONE:
1458 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1459 goto jump_min_repeat_one;
1460 case JUMP_ASSERT:
1461 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1462 goto jump_assert;
1463 case JUMP_ASSERT_NOT:
1464 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1465 goto jump_assert_not;
1466 case JUMP_NONE:
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001467 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1468 ctx->ptr, ret));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001469 break;
1470 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001471
1472 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001473}
1474
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001475LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001476SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 char* ptr = (char*)state->start;
1479 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001480 Py_ssize_t status = 0;
1481 Py_ssize_t prefix_len = 0;
1482 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001483 SRE_CODE* prefix = NULL;
1484 SRE_CODE* charset = NULL;
1485 SRE_CODE* overlap = NULL;
1486 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001487
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001488 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001489 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001490 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001491
1492 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001493
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001494 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001495 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001496 character in there, so literal search will work) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 end -= (pattern[3]-1) * state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001498 if (end <= ptr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 end = ptr + state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001500 }
1501
Fredrik Lundh3562f112000-07-02 12:00:07 +00001502 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001503 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001504 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001505 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001506 prefix_skip = pattern[6];
1507 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001508 overlap = prefix + prefix_len - 1;
1509 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001510 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001511 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001512 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001513
1514 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001515 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001516
Serhiy Storchaka134f0de2013-09-05 18:01:15 +03001517 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1518 prefix, prefix_len, prefix_skip));
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001519 TRACE(("charset = %p\n", charset));
1520
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001521#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001522 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001523 /* pattern starts with a known prefix. use the overlap
1524 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001525 Py_ssize_t i = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 end = (char *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001527 while (ptr < end) {
1528 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 if ((SRE_CODE) SRE_CHARGET(state, ptr, 0) != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001530 if (!i)
1531 break;
1532 else
1533 i = overlap[i];
1534 } else {
1535 if (++i == prefix_len) {
1536 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001537 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 state->start = ptr - (prefix_len - 1) * state->charsize;
1539 state->ptr = ptr - (prefix_len - prefix_skip - 1) * state->charsize;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001540 if (flags & SRE_INFO_LITERAL)
1541 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001542 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001543 if (status != 0)
1544 return status;
1545 /* close but no cigar -- try again */
1546 i = overlap[i];
1547 }
1548 break;
1549 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551 ptr += state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001552 }
1553 return 0;
1554 }
1555#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001556
Fredrik Lundh3562f112000-07-02 12:00:07 +00001557 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001558 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001559 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001560 SRE_CODE chr = pattern[1];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001561 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001562 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
1564 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001565 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001566 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001567 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001568 state->start = ptr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 ptr += state->charsize;
1570 state->ptr = ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001571 if (flags & SRE_INFO_LITERAL)
1572 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001573 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001574 if (status != 0)
1575 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001576 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001577 } else if (charset) {
1578 /* pattern starts with a character from a known set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 while (ptr < end && !SRE_CHARSET(charset, SRE_CHARGET(state, ptr, 0)))
1582 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001583 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001585 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 state->start = ptr;
1587 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001588 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001589 if (status != 0)
1590 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591 ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001592 }
1593 } else
1594 /* general case */
1595 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001596 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597 state->start = state->ptr = ptr;
1598 ptr += state->charsize;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001599 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001600 if (status != 0)
1601 break;
1602 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001603
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001604 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001605}
Tim Peters3d563502006-01-21 02:47:53 +00001606
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001607#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001608
1609/* -------------------------------------------------------------------- */
1610/* factories and destructors */
1611
1612/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001613static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001614static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +00001615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616static int
1617sre_literal_template(int charsize, char* ptr, Py_ssize_t len)
1618{
1619 /* check if given string is a literal template (i.e. no escapes) */
1620 struct {
1621 int charsize;
1622 } state = {
1623 charsize
1624 };
1625 while (len-- > 0) {
1626 if (SRE_CHARGET((&state), ptr, 0) == '\\')
1627 return 0;
1628 ptr += charsize;
1629 }
1630 return 1;
1631}
1632
Guido van Rossumb700df92000-03-31 14:59:30 +00001633static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001634sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001635{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001636 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001637}
1638
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001639static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001640sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001641{
1642 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001643 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001644 return NULL;
1645 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001646 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001647 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001648 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +00001649 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001650}
1651
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001652LOCAL(void)
1653state_reset(SRE_STATE* state)
1654{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001656 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001657
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001658 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001659 state->lastindex = -1;
1660
1661 state->repeat = NULL;
1662
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001663 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001664}
1665
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001666static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667getstring(PyObject* string, Py_ssize_t* p_length,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001668 int* p_logical_charsize, int* p_charsize,
1669 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +00001670{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001671 /* given a python object, return a data pointer, a length (in
1672 characters), and a character size. return NULL if the object
1673 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001674
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001675 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001676 Py_ssize_t size, bytes;
1677 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001678 void* ptr;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001679
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001680 /* Unicode objects do not support the buffer API. So, get the data
1681 directly instead. */
1682 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 if (PyUnicode_READY(string) == -1)
1684 return NULL;
1685 ptr = PyUnicode_DATA(string);
1686 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001687 *p_charsize = PyUnicode_KIND(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 *p_logical_charsize = 4;
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001689 return ptr;
1690 }
1691
Victor Stinner0058b862011-09-29 03:27:47 +02001692 /* get pointer to byte string buffer */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001693 view->len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001694 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001695 if (!buffer || !buffer->bf_getbuffer ||
Benjamin Petersone48944b2012-03-07 14:50:25 -06001696 (*buffer->bf_getbuffer)(string, view, PyBUF_SIMPLE) < 0) {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001697 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1698 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001699 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001700
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001701 /* determine buffer size */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001702 bytes = view->len;
1703 ptr = view->buf;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001704
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001705 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001706 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001707 goto err;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001708 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001709
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001710 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001711 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001712
Christian Heimes72b710a2008-05-26 13:28:38 +00001713 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001714 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001715 else {
1716 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001717 goto err;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001718 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001719
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001720 *p_length = size;
1721 *p_charsize = charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 *p_logical_charsize = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001723
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001724 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001725 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001726 "Buffer is NULL");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001727 goto err;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001728 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001729 return ptr;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001730 err:
1731 PyBuffer_Release(view);
1732 view->buf = NULL;
1733 return NULL;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001734}
1735
1736LOCAL(PyObject*)
1737state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001738 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001739{
1740 /* prepare state object */
1741
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001742 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 int logical_charsize, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001744 void* ptr;
1745
1746 memset(state, 0, sizeof(SRE_STATE));
1747
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001748 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001749 state->lastindex = -1;
1750
Benjamin Petersone48944b2012-03-07 14:50:25 -06001751 state->buffer.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001752 ptr = getstring(string, &length, &logical_charsize, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001753 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -06001754 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001755
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001756 if (logical_charsize == 1 && pattern->logical_charsize > 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001757 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001758 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001759 goto err;
1760 }
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001761 if (logical_charsize > 1 && pattern->logical_charsize == 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001762 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001763 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001764 goto err;
1765 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001766
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001767 /* adjust boundaries */
1768 if (start < 0)
1769 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001770 else if (start > length)
1771 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001772
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001773 if (end < 0)
1774 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001775 else if (end > length)
1776 end = length;
1777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 state->logical_charsize = logical_charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001779 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001780
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001781 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001782
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001783 state->start = (void*) ((char*) ptr + start * state->charsize);
1784 state->end = (void*) ((char*) ptr + end * state->charsize);
1785
1786 Py_INCREF(string);
1787 state->string = string;
1788 state->pos = start;
1789 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001790
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001791 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001792 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001793 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001794 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001795 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001796 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001797
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001798 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001799 err:
1800 if (state->buffer.buf)
1801 PyBuffer_Release(&state->buffer);
1802 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001803}
1804
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001805LOCAL(void)
1806state_fini(SRE_STATE* state)
1807{
Benjamin Petersone48944b2012-03-07 14:50:25 -06001808 if (state->buffer.buf)
1809 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001810 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001811 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001812}
1813
/* STATE_OFFSET(state, member): distance from state->beginning to the
   pointer `member`, expressed in units of state->charsize bytes */
#define STATE_OFFSET(state, member) \
    ( ((char*) (member) - (char*) (state)->beginning) / (state)->charsize )
1817
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001818LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001819state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001820{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001821 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001822
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001823 index = (index - 1) * 2;
1824
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001825 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001826 if (empty)
1827 /* want empty string */
1828 i = j = 0;
1829 else {
1830 Py_INCREF(Py_None);
1831 return Py_None;
1832 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001833 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001834 i = STATE_OFFSET(state, state->mark[index]);
1835 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001836 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001837
Fredrik Lundh58100642000-08-09 09:14:35 +00001838 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001839}
1840
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001841static void
1842pattern_error(int status)
1843{
1844 switch (status) {
1845 case SRE_ERROR_RECURSION_LIMIT:
1846 PyErr_SetString(
1847 PyExc_RuntimeError,
1848 "maximum recursion limit exceeded"
1849 );
1850 break;
1851 case SRE_ERROR_MEMORY:
1852 PyErr_NoMemory();
1853 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001854 case SRE_ERROR_INTERRUPTED:
1855 /* An exception has already been raised, so let it fly */
1856 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001857 default:
1858 /* other error codes indicate compiler/engine bugs */
1859 PyErr_SetString(
1860 PyExc_RuntimeError,
1861 "internal error in regular expression engine"
1862 );
1863 }
1864}
1865
Guido van Rossumb700df92000-03-31 14:59:30 +00001866static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001867pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001868{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001869 if (self->weakreflist != NULL)
1870 PyObject_ClearWeakRefs((PyObject *) self);
Benjamin Petersone48944b2012-03-07 14:50:25 -06001871 if (self->view.buf)
1872 PyBuffer_Release(&self->view);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001873 Py_XDECREF(self->pattern);
1874 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001875 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001876 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001877}
1878
1879static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001880pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001881{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001882 SRE_STATE state;
1883 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001884
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001885 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001886 Py_ssize_t start = 0;
1887 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001888 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001889 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001890 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001891 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001892
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001893 string = state_init(&state, self, string, start, end);
1894 if (!string)
1895 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001896
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001897 state.ptr = state.start;
1898
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001899 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 if (state.logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001902 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001903 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001904 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001905 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001906
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001907 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001908 if (PyErr_Occurred())
1909 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001911 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001912
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001913 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001914}
1915
1916static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001917pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001918{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001919 SRE_STATE state;
1920 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001921
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001922 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001923 Py_ssize_t start = 0;
1924 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001925 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001926 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001927 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001928 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001929
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001930 string = state_init(&state, self, string, start, end);
1931 if (!string)
1932 return NULL;
1933
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001934 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001937 status = sre_search(&state, PatternObject_GetCode(self));
1938 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001939 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001940 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001941
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001942 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1943
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001944 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001945
Thomas Wouters89f507f2006-12-13 04:49:30 +00001946 if (PyErr_Occurred())
1947 return NULL;
1948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001949 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001950}
1951
1952static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001953call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001954{
1955 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001956 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957 PyObject* func;
1958 PyObject* result;
1959
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001960 if (!args)
1961 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001962 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001963 if (!name)
1964 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001965 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001966 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001967 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001968 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001969 func = PyObject_GetAttrString(mod, function);
1970 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001971 if (!func)
1972 return NULL;
1973 result = PyObject_CallObject(func, args);
1974 Py_DECREF(func);
1975 Py_DECREF(args);
1976 return result;
1977}
1978
#ifdef USE_BUILTIN_COPY
/* Replace *object with the result of copy.deepcopy(*object, memo).

   On success the old reference held in *object is released, the new
   one is stored in its place, and 1 is returned.  On failure *object
   is left untouched and 0 is returned. */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* call() passes a NULL tuple straight through as a failure */
    PyObject* packed = PyTuple_Pack(2, *object, memo);
    PyObject* duplicate = call("copy", "deepcopy", packed);

    if (duplicate == NULL) {
        return 0;
    }

    Py_DECREF(*object);
    *object = duplicate;

    return 1; /* success */
}
#endif
1998
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001999static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002000join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002001{
2002 /* join list elements */
2003
2004 PyObject* joiner;
2005#if PY_VERSION_HEX >= 0x01060000
2006 PyObject* function;
2007 PyObject* args;
2008#endif
2009 PyObject* result;
2010
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002011 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002012 if (!joiner)
2013 return NULL;
2014
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002015 if (PyList_GET_SIZE(list) == 0) {
2016 Py_DECREF(list);
2017 return joiner;
2018 }
2019
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002020#if PY_VERSION_HEX >= 0x01060000
2021 function = PyObject_GetAttrString(joiner, "join");
2022 if (!function) {
2023 Py_DECREF(joiner);
2024 return NULL;
2025 }
2026 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002027 if (!args) {
2028 Py_DECREF(function);
2029 Py_DECREF(joiner);
2030 return NULL;
2031 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002032 PyTuple_SET_ITEM(args, 0, list);
2033 result = PyObject_CallObject(function, args);
2034 Py_DECREF(args); /* also removes list */
2035 Py_DECREF(function);
2036#else
2037 result = call(
2038 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002039 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002040 );
2041#endif
2042 Py_DECREF(joiner);
2043
2044 return result;
2045}
2046
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002047static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002048pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002049{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 SRE_STATE state;
2051 PyObject* list;
2052 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002053 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002054
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002056 Py_ssize_t start = 0;
2057 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002058 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002059 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002060 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002061 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002062
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 string = state_init(&state, self, string, start, end);
2064 if (!string)
2065 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002066
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002067 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002068 if (!list) {
2069 state_fini(&state);
2070 return NULL;
2071 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002072
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002074
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002075 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002076
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002077 state_reset(&state);
2078
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002079 state.ptr = state.start;
2080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002082 status = sre_search(&state, PatternObject_GetCode(self));
2083 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002084 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002085 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002086
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002087 if (PyErr_Occurred())
2088 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002089
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002090 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002091 if (status == 0)
2092 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002093 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002094 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002095 }
Tim Peters3d563502006-01-21 02:47:53 +00002096
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002097 /* don't bother to build a match object */
2098 switch (self->groups) {
2099 case 0:
2100 b = STATE_OFFSET(&state, state.start);
2101 e = STATE_OFFSET(&state, state.ptr);
2102 item = PySequence_GetSlice(string, b, e);
2103 if (!item)
2104 goto error;
2105 break;
2106 case 1:
2107 item = state_getslice(&state, 1, string, 1);
2108 if (!item)
2109 goto error;
2110 break;
2111 default:
2112 item = PyTuple_New(self->groups);
2113 if (!item)
2114 goto error;
2115 for (i = 0; i < self->groups; i++) {
2116 PyObject* o = state_getslice(&state, i+1, string, 1);
2117 if (!o) {
2118 Py_DECREF(item);
2119 goto error;
2120 }
2121 PyTuple_SET_ITEM(item, i, o);
2122 }
2123 break;
2124 }
2125
2126 status = PyList_Append(list, item);
2127 Py_DECREF(item);
2128 if (status < 0)
2129 goto error;
2130
2131 if (state.ptr == state.start)
2132 state.start = (void*) ((char*) state.ptr + state.charsize);
2133 else
2134 state.start = state.ptr;
2135
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002136 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002137
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002138 state_fini(&state);
2139 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002140
2141error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002142 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002143 state_fini(&state);
2144 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002145
Guido van Rossumb700df92000-03-31 14:59:30 +00002146}
2147
#if PY_VERSION_HEX >= 0x02020000
/* pattern.finditer(): build a scanner via pattern_scanner() and wrap
   its bound "search" method in a callable-iterator that stops when
   the method returns None.

   Returns a new reference to the iterator, or NULL with an exception
   set. */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
{
    PyObject* bound_search;
    PyObject* result;
    PyObject* scanner = pattern_scanner(pattern, args, kw);

    if (scanner == NULL) {
        return NULL;
    }

    /* our own scanner reference can be dropped as soon as the lookup
       has been attempted */
    bound_search = PyObject_GetAttrString(scanner, "search");
    Py_DECREF(scanner);
    if (bound_search == NULL) {
        return NULL;
    }

    result = PyCallIter_New(bound_search, Py_None);
    Py_DECREF(bound_search);

    return result;
}
#endif
2171
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002172static PyObject*
2173pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2174{
2175 SRE_STATE state;
2176 PyObject* list;
2177 PyObject* item;
2178 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002179 Py_ssize_t n;
2180 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002181 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002182
2183 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002184 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002185 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002186 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002187 &string, &maxsplit))
2188 return NULL;
2189
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002190 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002191 if (!string)
2192 return NULL;
2193
2194 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002195 if (!list) {
2196 state_fini(&state);
2197 return NULL;
2198 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002199
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002200 n = 0;
2201 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002202
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002203 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002204
2205 state_reset(&state);
2206
2207 state.ptr = state.start;
2208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 if (state.logical_charsize == 1) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002210 status = sre_search(&state, PatternObject_GetCode(self));
2211 } else {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002212 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002213 }
2214
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002215 if (PyErr_Occurred())
2216 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002217
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002218 if (status <= 0) {
2219 if (status == 0)
2220 break;
2221 pattern_error(status);
2222 goto error;
2223 }
Tim Peters3d563502006-01-21 02:47:53 +00002224
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002225 if (state.start == state.ptr) {
2226 if (last == state.end)
2227 break;
2228 /* skip one character */
2229 state.start = (void*) ((char*) state.ptr + state.charsize);
2230 continue;
2231 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002232
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002233 /* get segment before this match */
2234 item = PySequence_GetSlice(
2235 string, STATE_OFFSET(&state, last),
2236 STATE_OFFSET(&state, state.start)
2237 );
2238 if (!item)
2239 goto error;
2240 status = PyList_Append(list, item);
2241 Py_DECREF(item);
2242 if (status < 0)
2243 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002244
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002245 /* add groups (if any) */
2246 for (i = 0; i < self->groups; i++) {
2247 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002248 if (!item)
2249 goto error;
2250 status = PyList_Append(list, item);
2251 Py_DECREF(item);
2252 if (status < 0)
2253 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002254 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002255
2256 n = n + 1;
2257
2258 last = state.start = state.ptr;
2259
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002260 }
2261
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002262 /* get segment following last match (even if empty) */
2263 item = PySequence_GetSlice(
2264 string, STATE_OFFSET(&state, last), state.endpos
2265 );
2266 if (!item)
2267 goto error;
2268 status = PyList_Append(list, item);
2269 Py_DECREF(item);
2270 if (status < 0)
2271 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002272
2273 state_fini(&state);
2274 return list;
2275
2276error:
2277 Py_DECREF(list);
2278 state_fini(&state);
2279 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002280
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002281}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002282
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002283static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002284pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002285 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002286{
2287 SRE_STATE state;
2288 PyObject* list;
2289 PyObject* item;
2290 PyObject* filter;
2291 PyObject* args;
2292 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002293 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002294 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002295 Py_ssize_t n;
2296 Py_ssize_t i, b, e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297 int logical_charsize, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002298 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002299 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002300
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002301 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002302 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002303 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002304 Py_INCREF(filter);
2305 filter_is_callable = 1;
2306 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002307 /* if not callable, check if it's a literal string */
2308 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002309 view.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002310 ptr = getstring(ptemplate, &n, &logical_charsize, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002312 if (ptr) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 literal = sre_literal_template(b, ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002314 } else {
2315 PyErr_Clear();
2316 literal = 0;
2317 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06002318 if (view.buf)
2319 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002320 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002321 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002322 Py_INCREF(filter);
2323 filter_is_callable = 0;
2324 } else {
2325 /* not a literal; hand it over to the template compiler */
2326 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002327 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002328 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002329 );
2330 if (!filter)
2331 return NULL;
2332 filter_is_callable = PyCallable_Check(filter);
2333 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002334 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002335
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002336 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002337 if (!string) {
2338 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002339 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002340 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002341
2342 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002343 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002344 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002345 state_fini(&state);
2346 return NULL;
2347 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002348
2349 n = i = 0;
2350
2351 while (!count || n < count) {
2352
2353 state_reset(&state);
2354
2355 state.ptr = state.start;
2356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357 if (state.logical_charsize == 1) {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002358 status = sre_search(&state, PatternObject_GetCode(self));
2359 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002360 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002361 }
2362
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002363 if (PyErr_Occurred())
2364 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002365
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002366 if (status <= 0) {
2367 if (status == 0)
2368 break;
2369 pattern_error(status);
2370 goto error;
2371 }
Tim Peters3d563502006-01-21 02:47:53 +00002372
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002373 b = STATE_OFFSET(&state, state.start);
2374 e = STATE_OFFSET(&state, state.ptr);
2375
2376 if (i < b) {
2377 /* get segment before this match */
2378 item = PySequence_GetSlice(string, i, b);
2379 if (!item)
2380 goto error;
2381 status = PyList_Append(list, item);
2382 Py_DECREF(item);
2383 if (status < 0)
2384 goto error;
2385
2386 } else if (i == b && i == e && n > 0)
2387 /* ignore empty match on latest position */
2388 goto next;
2389
2390 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002391 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002392 match = pattern_new_match(self, &state, 1);
2393 if (!match)
2394 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002395 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002396 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002397 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002398 goto error;
2399 }
2400 item = PyObject_CallObject(filter, args);
2401 Py_DECREF(args);
2402 Py_DECREF(match);
2403 if (!item)
2404 goto error;
2405 } else {
2406 /* filter is literal string */
2407 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002408 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002409 }
2410
2411 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002412 if (item != Py_None) {
2413 status = PyList_Append(list, item);
2414 Py_DECREF(item);
2415 if (status < 0)
2416 goto error;
2417 }
Tim Peters3d563502006-01-21 02:47:53 +00002418
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002419 i = e;
2420 n = n + 1;
2421
2422next:
2423 /* move on */
2424 if (state.ptr == state.start)
2425 state.start = (void*) ((char*) state.ptr + state.charsize);
2426 else
2427 state.start = state.ptr;
2428
2429 }
2430
2431 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002432 if (i < state.endpos) {
2433 item = PySequence_GetSlice(string, i, state.endpos);
2434 if (!item)
2435 goto error;
2436 status = PyList_Append(list, item);
2437 Py_DECREF(item);
2438 if (status < 0)
2439 goto error;
2440 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002441
2442 state_fini(&state);
2443
Guido van Rossum4e173842001-12-07 04:25:10 +00002444 Py_DECREF(filter);
2445
Fredrik Lundhdac58492001-10-21 21:48:30 +00002446 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002447 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002448
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002449 if (!item)
2450 return NULL;
2451
2452 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002453 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002454
2455 return item;
2456
2457error:
2458 Py_DECREF(list);
2459 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002460 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002461 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002462
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002463}
2464
2465static PyObject*
2466pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2467{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002468 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002469 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002470 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002471 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002472 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002473 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002474 return NULL;
2475
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002476 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002477}
2478
2479static PyObject*
2480pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2481{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002482 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002483 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002484 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002485 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002486 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002487 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002488 return NULL;
2489
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002490 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002491}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002492
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002493static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002494pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002495{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002496#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002497 PatternObject* copy;
2498 int offset;
2499
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002500 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2501 if (!copy)
2502 return NULL;
2503
2504 offset = offsetof(PatternObject, groups);
2505
2506 Py_XINCREF(self->groupindex);
2507 Py_XINCREF(self->indexgroup);
2508 Py_XINCREF(self->pattern);
2509
2510 memcpy((char*) copy + offset, (char*) self + offset,
2511 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002512 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002513
2514 return (PyObject*) copy;
2515#else
2516 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2517 return NULL;
2518#endif
2519}
2520
2521static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002522pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002523{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002524#ifdef USE_BUILTIN_COPY
2525 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002526
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002527 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002528 if (!copy)
2529 return NULL;
2530
2531 if (!deepcopy(&copy->groupindex, memo) ||
2532 !deepcopy(&copy->indexgroup, memo) ||
2533 !deepcopy(&copy->pattern, memo)) {
2534 Py_DECREF(copy);
2535 return NULL;
2536 }
2537
2538#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002539 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2540 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002541#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002542}
2543
/* Docstrings for the pattern object.  The pattern_*_doc strings are
   attached to the corresponding entries of the pattern_methods table, and
   pattern_doc is used as the tp_doc of Pattern_Type.  Each PyDoc_STRVAR
   defines one string; the trailing backslashes continue a single string
   literal across source lines. */
Raymond Hettinger94478742004-09-24 04:31:19 +00002544PyDoc_STRVAR(pattern_match_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002545"match(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002546 Matches zero or more characters at the beginning of the string");
2547
2548PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002549"search(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002550 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02002551 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002552
2553PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002554"split(string[, maxsplit = 0]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002555 Split string by the occurrences of pattern.");
2556
2557PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002558"findall(string[, pos[, endpos]]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002559 Return a list of all non-overlapping matches of pattern in string.");
2560
2561PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002562"finditer(string[, pos[, endpos]]) -> iterator.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002563 Return an iterator over all non-overlapping matches for the \n\
2564 RE pattern in string. For each match, the iterator returns a\n\
2565 match object.");
2566
2567PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002568"sub(repl, string[, count = 0]) -> newstring.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002569 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002570 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002571
2572PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002573"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002574 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2575 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002576 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002577
2578PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2579
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002580static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002581 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002582 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002583 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002584 pattern_search_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002585 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002586 pattern_sub_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002587 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002588 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002589 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002590 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002591 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002592 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002593#if PY_VERSION_HEX >= 0x02020000
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002594 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002595 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002596#endif
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002597 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002598 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2599 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002600 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002601};
2602
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002603#define PAT_OFF(x) offsetof(PatternObject, x)
2604static PyMemberDef pattern_members[] = {
2605 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2606 {"flags", T_INT, PAT_OFF(flags), READONLY},
2607 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2608 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2609 {NULL} /* Sentinel */
2610};
Guido van Rossumb700df92000-03-31 14:59:30 +00002611
Neal Norwitz57c179c2006-03-22 07:18:02 +00002612static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002613 PyVarObject_HEAD_INIT(NULL, 0)
2614 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002615 sizeof(PatternObject), sizeof(SRE_CODE),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002616 (destructor)pattern_dealloc, /* tp_dealloc */
2617 0, /* tp_print */
2618 0, /* tp_getattr */
2619 0, /* tp_setattr */
2620 0, /* tp_reserved */
2621 0, /* tp_repr */
2622 0, /* tp_as_number */
2623 0, /* tp_as_sequence */
2624 0, /* tp_as_mapping */
2625 0, /* tp_hash */
2626 0, /* tp_call */
2627 0, /* tp_str */
2628 0, /* tp_getattro */
2629 0, /* tp_setattro */
2630 0, /* tp_as_buffer */
2631 Py_TPFLAGS_DEFAULT, /* tp_flags */
2632 pattern_doc, /* tp_doc */
2633 0, /* tp_traverse */
2634 0, /* tp_clear */
2635 0, /* tp_richcompare */
2636 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2637 0, /* tp_iter */
2638 0, /* tp_iternext */
2639 pattern_methods, /* tp_methods */
2640 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002641};
2642
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002643static int _validate(PatternObject *self); /* Forward */
2644
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002645static PyObject *
2646_compile(PyObject* self_, PyObject* args)
2647{
2648 /* "compile" pattern descriptor to pattern object */
2649
2650 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002651 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002652
2653 PyObject* pattern;
2654 int flags = 0;
2655 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002656 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002657 PyObject* groupindex = NULL;
2658 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002659
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002660 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002661 &PyList_Type, &code, &groups,
2662 &groupindex, &indexgroup))
2663 return NULL;
2664
2665 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002666 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002667 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2668 if (!self)
2669 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002670 self->weakreflist = NULL;
2671 self->pattern = NULL;
2672 self->groupindex = NULL;
2673 self->indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002674 self->view.buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002675
2676 self->codesize = n;
2677
2678 for (i = 0; i < n; i++) {
2679 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002680 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002681 self->code[i] = (SRE_CODE) value;
2682 if ((unsigned long) self->code[i] != value) {
2683 PyErr_SetString(PyExc_OverflowError,
2684 "regular expression code size limit exceeded");
2685 break;
2686 }
2687 }
2688
2689 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002690 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002691 return NULL;
2692 }
2693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 if (pattern == Py_None) {
2695 self->logical_charsize = -1;
2696 self->charsize = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01002697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 else {
2699 Py_ssize_t p_length;
2700 if (!getstring(pattern, &p_length, &self->logical_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002701 &self->charsize, &self->view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 Py_DECREF(self);
2703 return NULL;
2704 }
2705 }
Antoine Pitroufd036452008-08-19 17:56:33 +00002706
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002707 Py_INCREF(pattern);
2708 self->pattern = pattern;
2709
2710 self->flags = flags;
2711
2712 self->groups = groups;
2713
2714 Py_XINCREF(groupindex);
2715 self->groupindex = groupindex;
2716
2717 Py_XINCREF(indexgroup);
2718 self->indexgroup = indexgroup;
2719
2720 self->weakreflist = NULL;
2721
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002722 if (!_validate(self)) {
2723 Py_DECREF(self);
2724 return NULL;
2725 }
2726
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002727 return (PyObject*) self;
2728}
2729
Guido van Rossumb700df92000-03-31 14:59:30 +00002730/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002731/* Code validation */
2732
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
2754
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete,
   parenthesized printf argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns 0 from the function that uses the macro */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   All of these macros expect the variables 'code' and 'end' (pointers
   into the code array) to be in scope, plus 'op', 'arg' or 'skip'
   respectively to receive the value.  Each one FAILs if 'code' has
   already reached 'end', and advances 'code' by one word on success. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Like GET_ARG, but additionally FAILs when the skip count, reduced by
   'adj', is larger than the number of words left before 'end' (counted
   from the skip word itself).  'adj' is parenthesized so that any
   expression can be passed; the remaining length is known to be positive
   at that point (code < end), so converting it to size_t is safe and
   makes the unsigned comparison explicit. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (size_t)(end-code))            \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002795
2796static int
2797_validate_charset(SRE_CODE *code, SRE_CODE *end)
2798{
2799 /* Some variables are manipulated by the macros above */
2800 SRE_CODE op;
2801 SRE_CODE arg;
2802 SRE_CODE offset;
2803 int i;
2804
2805 while (code < end) {
2806 GET_OP;
2807 switch (op) {
2808
2809 case SRE_OP_NEGATE:
2810 break;
2811
2812 case SRE_OP_LITERAL:
2813 GET_ARG;
2814 break;
2815
2816 case SRE_OP_RANGE:
2817 GET_ARG;
2818 GET_ARG;
2819 break;
2820
2821 case SRE_OP_CHARSET:
2822 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002823 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002824 FAIL;
2825 code += offset;
2826 break;
2827
2828 case SRE_OP_BIGCHARSET:
2829 GET_ARG; /* Number of blocks */
2830 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002831 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002832 FAIL;
2833 /* Make sure that each byte points to a valid block */
2834 for (i = 0; i < 256; i++) {
2835 if (((unsigned char *)code)[i] >= arg)
2836 FAIL;
2837 }
2838 code += offset;
2839 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002840 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002841 FAIL;
2842 code += offset;
2843 break;
2844
2845 case SRE_OP_CATEGORY:
2846 GET_ARG;
2847 switch (arg) {
2848 case SRE_CATEGORY_DIGIT:
2849 case SRE_CATEGORY_NOT_DIGIT:
2850 case SRE_CATEGORY_SPACE:
2851 case SRE_CATEGORY_NOT_SPACE:
2852 case SRE_CATEGORY_WORD:
2853 case SRE_CATEGORY_NOT_WORD:
2854 case SRE_CATEGORY_LINEBREAK:
2855 case SRE_CATEGORY_NOT_LINEBREAK:
2856 case SRE_CATEGORY_LOC_WORD:
2857 case SRE_CATEGORY_LOC_NOT_WORD:
2858 case SRE_CATEGORY_UNI_DIGIT:
2859 case SRE_CATEGORY_UNI_NOT_DIGIT:
2860 case SRE_CATEGORY_UNI_SPACE:
2861 case SRE_CATEGORY_UNI_NOT_SPACE:
2862 case SRE_CATEGORY_UNI_WORD:
2863 case SRE_CATEGORY_UNI_NOT_WORD:
2864 case SRE_CATEGORY_UNI_LINEBREAK:
2865 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2866 break;
2867 default:
2868 FAIL;
2869 }
2870 break;
2871
2872 default:
2873 FAIL;
2874
2875 }
2876 }
2877
2878 return 1;
2879}
2880
/* Validate the opcode sequence stored in [code, end).

   Each opcode is read with GET_OP and its operands are checked: argument
   values against the known constants or against 'groups', skip counts
   against the end of the range (GET_SKIP), and nested sub-patterns by
   calling _validate_charset() or _validate_inner() recursively on the
   sub-range delimited by the skip count.

   Returns 1 if the whole range is accepted and 0 otherwise; the FAIL
   macro returns 0 directly from this function. */
2881static int
2882_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2883{
2884 /* Some variables are manipulated by the macros above */
2885 SRE_CODE op;
2886 SRE_CODE arg;
2887 SRE_CODE skip;
2888
2889 VTRACE(("code=%p, end=%p\n", code, end));
2890
/* The recursive calls below compute 'end' as code+skip-N; reject the
   case where that lands in front of 'code'. */
2891 if (code > end)
2892 FAIL;
2893
2894 while (code < end) {
2895 GET_OP;
2896 switch (op) {
2897
2898 case SRE_OP_MARK:
2899 /* We don't check whether marks are properly nested; the
2900 sre_match() code is robust even if they don't, and the worst
2901 you can get is nonsensical match results. */
2902 GET_ARG;
2903 if (arg > 2*groups+1) {
2904 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2905 FAIL;
2906 }
2907 break;
2908
2909 case SRE_OP_LITERAL:
2910 case SRE_OP_NOT_LITERAL:
2911 case SRE_OP_LITERAL_IGNORE:
2912 case SRE_OP_NOT_LITERAL_IGNORE:
2913 GET_ARG;
2914 /* The arg is just a character, nothing to check */
2915 break;
2916
2917 case SRE_OP_SUCCESS:
2918 case SRE_OP_FAILURE:
2919 /* Nothing to check; these normally end the matching process */
2920 break;
2921
2922 case SRE_OP_AT:
2923 GET_ARG;
2924 switch (arg) {
2925 case SRE_AT_BEGINNING:
2926 case SRE_AT_BEGINNING_STRING:
2927 case SRE_AT_BEGINNING_LINE:
2928 case SRE_AT_END:
2929 case SRE_AT_END_LINE:
2930 case SRE_AT_END_STRING:
2931 case SRE_AT_BOUNDARY:
2932 case SRE_AT_NON_BOUNDARY:
2933 case SRE_AT_LOC_BOUNDARY:
2934 case SRE_AT_LOC_NON_BOUNDARY:
2935 case SRE_AT_UNI_BOUNDARY:
2936 case SRE_AT_UNI_NON_BOUNDARY:
2937 break;
2938 default:
2939 FAIL;
2940 }
2941 break;
2942
2943 case SRE_OP_ANY:
2944 case SRE_OP_ANY_ALL:
2945 /* These have no operands */
2946 break;
2947
2948 case SRE_OP_IN:
2949 case SRE_OP_IN_IGNORE:
2950 GET_SKIP;
2951 /* Stop 1 before the end; we check the FAILURE below */
2952 if (!_validate_charset(code, code+skip-2))
2953 FAIL;
2954 if (code[skip-2] != SRE_OP_FAILURE)
2955 FAIL;
2956 code += skip-1;
2957 break;
2958
2959 case SRE_OP_INFO:
2960 {
2961 /* A minimal info field is
2962 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2963 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2964 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02002965 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002966 SRE_CODE *newcode;
2967 GET_SKIP;
/* newcode is the position the skip count points at (the skip word is
   at code-1 after GET_SKIP); the info block must end exactly there. */
2968 newcode = code+skip-1;
2969 GET_ARG; flags = arg;
/* The next two words (min and max, per the layout above) are read but
   not checked. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02002970 GET_ARG;
2971 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002972 /* Check that only valid flags are present */
2973 if ((flags & ~(SRE_INFO_PREFIX |
2974 SRE_INFO_LITERAL |
2975 SRE_INFO_CHARSET)) != 0)
2976 FAIL;
2977 /* PREFIX and CHARSET are mutually exclusive */
2978 if ((flags & SRE_INFO_PREFIX) &&
2979 (flags & SRE_INFO_CHARSET))
2980 FAIL;
2981 /* LITERAL implies PREFIX */
2982 if ((flags & SRE_INFO_LITERAL) &&
2983 !(flags & SRE_INFO_PREFIX))
2984 FAIL;
2985 /* Validate the prefix */
2986 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02002987 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002988 GET_ARG; prefix_len = arg;
/* One more word follows the prefix length; it is read but not checked. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02002989 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002990 /* Here comes the prefix string */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002991 if (prefix_len > newcode-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002992 FAIL;
2993 code += prefix_len;
2994 /* And here comes the overlap table */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002995 if (prefix_len > newcode-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002996 FAIL;
2997 /* Each overlap value should be < prefix_len */
2998 for (i = 0; i < prefix_len; i++) {
2999 if (code[i] >= prefix_len)
3000 FAIL;
3001 }
3002 code += prefix_len;
3003 }
3004 /* Validate the charset */
3005 if (flags & SRE_INFO_CHARSET) {
3006 if (!_validate_charset(code, newcode-1))
3007 FAIL;
3008 if (newcode[-1] != SRE_OP_FAILURE)
3009 FAIL;
3010 code = newcode;
3011 }
3012 else if (code != newcode) {
3013 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3014 FAIL;
3015 }
3016 }
3017 break;
3018
3019 case SRE_OP_BRANCH:
3020 {
/* 'target' records where the first alternative's JUMP lands; every
   later alternative must jump to the same place. */
3021 SRE_CODE *target = NULL;
3022 for (;;) {
3023 GET_SKIP;
/* A skip count of zero ends the list of alternatives. */
3024 if (skip == 0)
3025 break;
3026 /* Stop 2 before the end; we check the JUMP below */
3027 if (!_validate_inner(code, code+skip-3, groups))
3028 FAIL;
3029 code += skip-3;
3030 /* Check that it ends with a JUMP, and that each JUMP
3031 has the same target */
3032 GET_OP;
3033 if (op != SRE_OP_JUMP)
3034 FAIL;
3035 GET_SKIP;
3036 if (target == NULL)
3037 target = code+skip-1;
3038 else if (code+skip-1 != target)
3039 FAIL;
3040 }
3041 }
3042 break;
3043
3044 case SRE_OP_REPEAT_ONE:
3045 case SRE_OP_MIN_REPEAT_ONE:
3046 {
3047 SRE_CODE min, max;
3048 GET_SKIP;
3049 GET_ARG; min = arg;
3050 GET_ARG; max = arg;
3051 if (min > max)
3052 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003053 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003054 FAIL;
/* The repeated body ends one word before the skip target; that last
   word must be SUCCESS (checked below). */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003055 if (!_validate_inner(code, code+skip-4, groups))
3056 FAIL;
3057 code += skip-4;
3058 GET_OP;
3059 if (op != SRE_OP_SUCCESS)
3060 FAIL;
3061 }
3062 break;
3063
3064 case SRE_OP_REPEAT:
3065 {
3066 SRE_CODE min, max;
3067 GET_SKIP;
3068 GET_ARG; min = arg;
3069 GET_ARG; max = arg;
3070 if (min > max)
3071 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003072 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003073 FAIL;
/* The repeated body runs up to the skip target, and the opcode found
   at the target must be MAX_UNTIL or MIN_UNTIL (checked below). */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003074 if (!_validate_inner(code, code+skip-3, groups))
3075 FAIL;
3076 code += skip-3;
3077 GET_OP;
3078 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3079 FAIL;
3080 }
3081 break;
3082
3083 case SRE_OP_GROUPREF:
3084 case SRE_OP_GROUPREF_IGNORE:
3085 GET_ARG;
3086 if (arg >= groups)
3087 FAIL;
3088 break;
3089
3090 case SRE_OP_GROUPREF_EXISTS:
3091 /* The regex syntax for this is: '(?(group)then|else)', where
3092 'group' is either an integer group number or a group name,
3093 'then' and 'else' are sub-regexes, and 'else' is optional. */
3094 GET_ARG;
3095 if (arg >= groups)
3096 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00003097 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003098 code--; /* The skip is relative to the first arg! */
3099 /* There are two possibilities here: if there is both a 'then'
3100 part and an 'else' part, the generated code looks like:
3101
3102 GROUPREF_EXISTS
3103 <group>
3104 <skipyes>
3105 ...then part...
3106 JUMP
3107 <skipno>
3108 (<skipyes> jumps here)
3109 ...else part...
3110 (<skipno> jumps here)
3111
3112 If there is only a 'then' part, it looks like:
3113
3114 GROUPREF_EXISTS
3115 <group>
3116 <skip>
3117 ...then part...
3118 (<skip> jumps here)
3119
3120 There is no direct way to decide which it is, and we don't want
3121 to allow arbitrary jumps anywhere in the code; so we just look
3122 for a JUMP opcode preceding our skip target.
3123 */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03003124 if (skip >= 3 && skip-3 < end-code &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003125 code[skip-3] == SRE_OP_JUMP)
3126 {
3127 VTRACE(("both then and else parts present\n"));
3128 if (!_validate_inner(code+1, code+skip-3, groups))
3129 FAIL;
3130 code += skip-2; /* Position after JUMP, at <skipno> */
3131 GET_SKIP;
3132 if (!_validate_inner(code, code+skip-1, groups))
3133 FAIL;
3134 code += skip-1;
3135 }
3136 else {
3137 VTRACE(("only a then part present\n"));
3138 if (!_validate_inner(code+1, code+skip-1, groups))
3139 FAIL;
3140 code += skip-1;
3141 }
3142 break;
3143
3144 case SRE_OP_ASSERT:
3145 case SRE_OP_ASSERT_NOT:
3146 GET_SKIP;
3147 GET_ARG; /* 0 for lookahead, width for lookbehind */
3148 code--; /* Back up over arg to simplify math below */
3149 if (arg & 0x80000000)
3150 FAIL; /* Width too large */
3151 /* Stop 1 before the end; we check the SUCCESS below */
3152 if (!_validate_inner(code+1, code+skip-2, groups))
3153 FAIL;
3154 code += skip-2;
3155 GET_OP;
3156 if (op != SRE_OP_SUCCESS)
3157 FAIL;
3158 break;
3159
3160 default:
3161 FAIL;
3162
3163 }
3164 }
3165
3166 VTRACE(("okay\n"));
3167 return 1;
3168}
3169
3170static int
3171_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3172{
3173 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3174 FAIL;
3175 if (groups == 0) /* fix for simplejson */
3176 groups = 100; /* 100 groups should always be safe */
3177 return _validate_inner(code, end-1, groups);
3178}
3179
3180static int
3181_validate(PatternObject *self)
3182{
3183 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3184 {
3185 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3186 return 0;
3187 }
3188 else
3189 VTRACE(("Success!\n"));
3190 return 1;
3191}
3192
3193/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003194/* match methods */
3195
3196static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003197match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003198{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003199 Py_XDECREF(self->regs);
3200 Py_XDECREF(self->string);
3201 Py_DECREF(self->pattern);
3202 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003203}
3204
3205static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003206match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003207{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003208 if (index < 0 || index >= self->groups) {
3209 /* raise IndexError if we were given a bad group number */
3210 PyErr_SetString(
3211 PyExc_IndexError,
3212 "no such group"
3213 );
3214 return NULL;
3215 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003216
Fredrik Lundh6f013982000-07-03 18:44:21 +00003217 index *= 2;
3218
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003219 if (self->string == Py_None || self->mark[index] < 0) {
3220 /* return default value if the string or group is undefined */
3221 Py_INCREF(def);
3222 return def;
3223 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003224
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003225 return PySequence_GetSlice(
3226 self->string, self->mark[index], self->mark[index+1]
3227 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003228}
3229
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003230static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003231match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003232{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003233 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003234
Guido van Rossumddefaf32007-01-14 03:31:43 +00003235 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003236 /* Default value */
3237 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00003238
Christian Heimes217cfd12007-12-02 14:31:20 +00003239 if (PyLong_Check(index))
3240 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003241
Fredrik Lundh6f013982000-07-03 18:44:21 +00003242 i = -1;
3243
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003244 if (self->pattern->groupindex) {
3245 index = PyObject_GetItem(self->pattern->groupindex, index);
3246 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003247 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003248 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003249 Py_DECREF(index);
3250 } else
3251 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003252 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003253
3254 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003255}
3256
3257static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003258match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003259{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003260 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003261}
3262
3263static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003264match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003265{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003266 /* delegate to Python code */
3267 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003268 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003269 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003270 );
3271}
3272
3273static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003274match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003275{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003276 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003277 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003278
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003279 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003280
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003281 switch (size) {
3282 case 0:
3283 result = match_getslice(self, Py_False, Py_None);
3284 break;
3285 case 1:
3286 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3287 break;
3288 default:
3289 /* fetch multiple items */
3290 result = PyTuple_New(size);
3291 if (!result)
3292 return NULL;
3293 for (i = 0; i < size; i++) {
3294 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003295 self, PyTuple_GET_ITEM(args, i), Py_None
3296 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003297 if (!item) {
3298 Py_DECREF(result);
3299 return NULL;
3300 }
3301 PyTuple_SET_ITEM(result, i, item);
3302 }
3303 break;
3304 }
3305 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003306}
3307
3308static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003309match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003310{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003311 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003312 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003313
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003314 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003315 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003316 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003317 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003319 result = PyTuple_New(self->groups-1);
3320 if (!result)
3321 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003322
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003323 for (index = 1; index < self->groups; index++) {
3324 PyObject* item;
3325 item = match_getslice_by_index(self, index, def);
3326 if (!item) {
3327 Py_DECREF(result);
3328 return NULL;
3329 }
3330 PyTuple_SET_ITEM(result, index-1, item);
3331 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003332
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003333 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003334}
3335
3336static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003337match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003338{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003339 PyObject* result;
3340 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003341 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003342
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003343 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003344 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003345 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003346 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003348 result = PyDict_New();
3349 if (!result || !self->pattern->groupindex)
3350 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003351
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003352 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003353 if (!keys)
3354 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003355
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003356 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003357 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003358 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003359 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003360 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003361 if (!key)
3362 goto failed;
3363 value = match_getslice(self, key, def);
3364 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003365 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003366 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003367 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003368 status = PyDict_SetItem(result, key, value);
3369 Py_DECREF(value);
3370 if (status < 0)
3371 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003372 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003373
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003374 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003375
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003376 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003377
3378failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003379 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003380 Py_DECREF(result);
3381 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003382}
3383
3384static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003385match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003386{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003387 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003388
Guido van Rossumddefaf32007-01-14 03:31:43 +00003389 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003390 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003391 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003392
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003393 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003394
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003395 if (index < 0 || index >= self->groups) {
3396 PyErr_SetString(
3397 PyExc_IndexError,
3398 "no such group"
3399 );
3400 return NULL;
3401 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003402
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003403 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003404 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003405}
3406
3407static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003408match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003409{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003410 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003411
Guido van Rossumddefaf32007-01-14 03:31:43 +00003412 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003413 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003414 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003415
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003416 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003417
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003418 if (index < 0 || index >= self->groups) {
3419 PyErr_SetString(
3420 PyExc_IndexError,
3421 "no such group"
3422 );
3423 return NULL;
3424 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003425
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003426 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003427 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003428}
3429
3430LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003431_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003432{
3433 PyObject* pair;
3434 PyObject* item;
3435
3436 pair = PyTuple_New(2);
3437 if (!pair)
3438 return NULL;
3439
Christian Heimes217cfd12007-12-02 14:31:20 +00003440 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003441 if (!item)
3442 goto error;
3443 PyTuple_SET_ITEM(pair, 0, item);
3444
Christian Heimes217cfd12007-12-02 14:31:20 +00003445 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003446 if (!item)
3447 goto error;
3448 PyTuple_SET_ITEM(pair, 1, item);
3449
3450 return pair;
3451
3452 error:
3453 Py_DECREF(pair);
3454 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003455}
3456
3457static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003458match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003459{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003460 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003461
Guido van Rossumddefaf32007-01-14 03:31:43 +00003462 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003463 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003464 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003465
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003466 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003467
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003468 if (index < 0 || index >= self->groups) {
3469 PyErr_SetString(
3470 PyExc_IndexError,
3471 "no such group"
3472 );
3473 return NULL;
3474 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003475
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003476 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003477 return _pair(self->mark[index*2], self->mark[index*2+1]);
3478}
3479
3480static PyObject*
3481match_regs(MatchObject* self)
3482{
3483 PyObject* regs;
3484 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003485 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003486
3487 regs = PyTuple_New(self->groups);
3488 if (!regs)
3489 return NULL;
3490
3491 for (index = 0; index < self->groups; index++) {
3492 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3493 if (!item) {
3494 Py_DECREF(regs);
3495 return NULL;
3496 }
3497 PyTuple_SET_ITEM(regs, index, item);
3498 }
3499
3500 Py_INCREF(regs);
3501 self->regs = regs;
3502
3503 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003504}
3505
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003506static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003507match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003508{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003509#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003510 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003511 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003512
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003513 slots = 2 * (self->pattern->groups+1);
3514
3515 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3516 if (!copy)
3517 return NULL;
3518
3519 /* this value a constant, but any compiler should be able to
3520 figure that out all by itself */
3521 offset = offsetof(MatchObject, string);
3522
3523 Py_XINCREF(self->pattern);
3524 Py_XINCREF(self->string);
3525 Py_XINCREF(self->regs);
3526
3527 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003528 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003529
3530 return (PyObject*) copy;
3531#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003532 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003533 return NULL;
3534#endif
3535}
3536
3537static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003538match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003539{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003540#ifdef USE_BUILTIN_COPY
3541 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003542
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003543 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003544 if (!copy)
3545 return NULL;
3546
3547 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3548 !deepcopy(&copy->string, memo) ||
3549 !deepcopy(&copy->regs, memo)) {
3550 Py_DECREF(copy);
3551 return NULL;
3552 }
3553
3554#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003555 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3556 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003557#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003558}
3559
/* Docstrings for the match type and its methods.  match_doc is used as
   tp_doc of Match_Type; the *_doc strings below are referenced from the
   match_methods table. */
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003560PyDoc_STRVAR(match_doc,
3561"The result of re.match() and re.search().\n\
3562Match objects always have a boolean value of True.");
3563
/* docstring for the "group" method */
3564PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003565"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003566 Return subgroup(s) of the match by indices or names.\n\
3567 For 0 returns the entire match.");
3568
/* docstring for the "start" method */
3569PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003570"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003571 Return index of the start of the substring matched by group.");
3572
/* docstring for the "end" method */
3573PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003574"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003575 Return index of the end of the substring matched by group.");
3576
/* docstring for the "span" method */
3577PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003578"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003579 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3580
/* docstring for the "groups" method */
3581PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003582"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003583 Return a tuple containing all the subgroups of the match, from 1.\n\
3584 The default argument is used for groups\n\
3585 that did not participate in the match");
3586
/* docstring for the "groupdict" method */
3587PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003588"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003589 Return a dictionary containing all the named subgroups of the match,\n\
3590 keyed by the subgroup name. The default argument is used for groups\n\
3591 that did not participate in the match");
3592
/* docstring for the "expand" method */
3593PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003594"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003595 Return the string obtained by doing backslash substitution\n\
3596 on the string template, as done by the sub() method.");
3597
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003598static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003599 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3600 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3601 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3602 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3603 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3604 match_groups_doc},
3605 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3606 match_groupdict_doc},
3607 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003608 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3609 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003610 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003611};
3612
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003613static PyObject *
3614match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003615{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003616 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003617 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003618 Py_INCREF(Py_None);
3619 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003620}
3621
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003622static PyObject *
3623match_lastgroup_get(MatchObject *self)
3624{
3625 if (self->pattern->indexgroup && self->lastindex >= 0) {
3626 PyObject* result = PySequence_GetItem(
3627 self->pattern->indexgroup, self->lastindex
3628 );
3629 if (result)
3630 return result;
3631 PyErr_Clear();
3632 }
3633 Py_INCREF(Py_None);
3634 return Py_None;
3635}
3636
3637static PyObject *
3638match_regs_get(MatchObject *self)
3639{
3640 if (self->regs) {
3641 Py_INCREF(self->regs);
3642 return self->regs;
3643 } else
3644 return match_regs(self);
3645}
3646
3647static PyGetSetDef match_getset[] = {
3648 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3649 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3650 {"regs", (getter)match_regs_get, (setter)NULL},
3651 {NULL}
3652};
3653
3654#define MATCH_OFF(x) offsetof(MatchObject, x)
3655static PyMemberDef match_members[] = {
3656 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3657 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3658 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3659 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3660 {NULL}
3661};
3662
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3665
Neal Norwitz57c179c2006-03-22 07:18:02 +00003666static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003667 PyVarObject_HEAD_INIT(NULL,0)
3668 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003669 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003670 (destructor)match_dealloc, /* tp_dealloc */
3671 0, /* tp_print */
3672 0, /* tp_getattr */
3673 0, /* tp_setattr */
3674 0, /* tp_reserved */
3675 0, /* tp_repr */
3676 0, /* tp_as_number */
3677 0, /* tp_as_sequence */
3678 0, /* tp_as_mapping */
3679 0, /* tp_hash */
3680 0, /* tp_call */
3681 0, /* tp_str */
3682 0, /* tp_getattro */
3683 0, /* tp_setattro */
3684 0, /* tp_as_buffer */
3685 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003686 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003687 0, /* tp_traverse */
3688 0, /* tp_clear */
3689 0, /* tp_richcompare */
3690 0, /* tp_weaklistoffset */
3691 0, /* tp_iter */
3692 0, /* tp_iternext */
3693 match_methods, /* tp_methods */
3694 match_members, /* tp_members */
3695 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003696};
3697
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003698static PyObject*
3699pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3700{
3701 /* create match object (from state object) */
3702
3703 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003704 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003705 char* base;
3706 int n;
3707
3708 if (status > 0) {
3709
3710 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003711 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003712 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3713 2*(pattern->groups+1));
3714 if (!match)
3715 return NULL;
3716
3717 Py_INCREF(pattern);
3718 match->pattern = pattern;
3719
3720 Py_INCREF(state->string);
3721 match->string = state->string;
3722
3723 match->regs = NULL;
3724 match->groups = pattern->groups+1;
3725
3726 /* fill in group slices */
3727
3728 base = (char*) state->beginning;
3729 n = state->charsize;
3730
3731 match->mark[0] = ((char*) state->start - base) / n;
3732 match->mark[1] = ((char*) state->ptr - base) / n;
3733
3734 for (i = j = 0; i < pattern->groups; i++, j+=2)
3735 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3736 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3737 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3738 } else
3739 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3740
3741 match->pos = state->pos;
3742 match->endpos = state->endpos;
3743
3744 match->lastindex = state->lastindex;
3745
3746 return (PyObject*) match;
3747
3748 } else if (status == 0) {
3749
3750 /* no match */
3751 Py_INCREF(Py_None);
3752 return Py_None;
3753
3754 }
3755
3756 /* internal error */
3757 pattern_error(status);
3758 return NULL;
3759}
3760
3761
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003762/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003763/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003764
3765static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003766scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003767{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003768 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003769 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003770 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003771}
3772
3773static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003774scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003775{
3776 SRE_STATE* state = &self->state;
3777 PyObject* match;
3778 int status;
3779
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003780 state_reset(state);
3781
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003782 state->ptr = state->start;
3783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003784 if (state->logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003785 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003786 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003787 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003788 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003789 if (PyErr_Occurred())
3790 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003791
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003792 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003793 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003794
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003795 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003796 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003797 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003798 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003799
3800 return match;
3801}
3802
3803
3804static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003805scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003806{
3807 SRE_STATE* state = &self->state;
3808 PyObject* match;
3809 int status;
3810
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003811 state_reset(state);
3812
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003813 state->ptr = state->start;
3814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 if (state->logical_charsize == 1) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003816 status = sre_search(state, PatternObject_GetCode(self->pattern));
3817 } else {
3818 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3819 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003820 if (PyErr_Occurred())
3821 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003822
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003823 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003824 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003825
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003826 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003827 state->start = (void*) ((char*) state->ptr + state->charsize);
3828 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003829 state->start = state->ptr;
3830
3831 return match;
3832}
3833
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003834static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003835 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3836 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003837 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003838};
3839
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003840#define SCAN_OFF(x) offsetof(ScannerObject, x)
3841static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03003842 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003843 {NULL} /* Sentinel */
3844};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003845
Neal Norwitz57c179c2006-03-22 07:18:02 +00003846static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003847 PyVarObject_HEAD_INIT(NULL, 0)
3848 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003849 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003850 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003851 0, /* tp_print */
3852 0, /* tp_getattr */
3853 0, /* tp_setattr */
3854 0, /* tp_reserved */
3855 0, /* tp_repr */
3856 0, /* tp_as_number */
3857 0, /* tp_as_sequence */
3858 0, /* tp_as_mapping */
3859 0, /* tp_hash */
3860 0, /* tp_call */
3861 0, /* tp_str */
3862 0, /* tp_getattro */
3863 0, /* tp_setattro */
3864 0, /* tp_as_buffer */
3865 Py_TPFLAGS_DEFAULT, /* tp_flags */
3866 0, /* tp_doc */
3867 0, /* tp_traverse */
3868 0, /* tp_clear */
3869 0, /* tp_richcompare */
3870 0, /* tp_weaklistoffset */
3871 0, /* tp_iter */
3872 0, /* tp_iternext */
3873 scanner_methods, /* tp_methods */
3874 scanner_members, /* tp_members */
3875 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003876};
3877
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003878static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003879pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003880{
3881 /* create search state object */
3882
3883 ScannerObject* self;
3884
3885 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003886 Py_ssize_t start = 0;
3887 Py_ssize_t end = PY_SSIZE_T_MAX;
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003888 static char* kwlist[] = { "source", "pos", "endpos", NULL };
3889 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:scanner", kwlist,
3890 &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003891 return NULL;
3892
3893 /* create scanner object */
3894 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3895 if (!self)
3896 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003897 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003898
3899 string = state_init(&self->state, pattern, string, start, end);
3900 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003901 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003902 return NULL;
3903 }
3904
3905 Py_INCREF(pattern);
3906 self->pattern = (PyObject*) pattern;
3907
3908 return (PyObject*) self;
3909}
3910
Guido van Rossumb700df92000-03-31 14:59:30 +00003911static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003912 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003913 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003914 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003915 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003916};
3917
Martin v. Löwis1a214512008-06-11 05:26:20 +00003918static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003919 PyModuleDef_HEAD_INIT,
3920 "_" SRE_MODULE,
3921 NULL,
3922 -1,
3923 _functions,
3924 NULL,
3925 NULL,
3926 NULL,
3927 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00003928};
3929
3930PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003931{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003932 PyObject* m;
3933 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003934 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003935
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00003936 /* Patch object types */
3937 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3938 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00003939 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003940
Martin v. Löwis1a214512008-06-11 05:26:20 +00003941 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003942 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003943 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003944 d = PyModule_GetDict(m);
3945
Christian Heimes217cfd12007-12-02 14:31:20 +00003946 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003947 if (x) {
3948 PyDict_SetItemString(d, "MAGIC", x);
3949 Py_DECREF(x);
3950 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003951
Christian Heimes217cfd12007-12-02 14:31:20 +00003952 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003953 if (x) {
3954 PyDict_SetItemString(d, "CODESIZE", x);
3955 Py_DECREF(x);
3956 }
3957
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003958 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
3959 if (x) {
3960 PyDict_SetItemString(d, "MAXREPEAT", x);
3961 Py_DECREF(x);
3962 }
3963
Neal Norwitzfe537132007-08-26 03:55:15 +00003964 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003965 if (x) {
3966 PyDict_SetItemString(d, "copyright", x);
3967 Py_DECREF(x);
3968 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003969 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003970}
3971
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003972#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003973
3974/* vim:ts=4:sw=4:et
3975*/